/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PM8001 device state recovery routines
 */

#include <sys/scsi/adapters/pmcs/pmcs.h>

/*
 * Forward declarations
 */
static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt);
static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name,
    char *reason_string);

/*
 * Get device state.  Called with statlock and PHY lock held.
 */
static int
pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t *ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__,
	    (void *)xp);

	if (xp != NULL) {
		ASSERT(mutex_owned(&xp->statlock));
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}
	ASSERT(mutex_owned(&phyp->phy_lock));

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;

	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_GET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	CLEAN_MESSAGE(msg, 3);

	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);

	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	if (LE_32(msg[2]) == 0) {
		*ds = (uint8_t)(LE_32(msg[4]));
		if (xp == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x", __func__, *ds);
		} else if (*ds != xp->dev_state) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__,
			    *ds, xp->dev_state);
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}
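/*
 * pmcs_get_dev_state() above and pmcs_set_dev_state() below share the
 * same inbound-queue exchange: word 0 of the message carries the opcode
 * and high-priority queue selection (PMCS_HIPRI), word 1 the work tag
 * (htag), and word 2 the firmware device handle.  In the reply, word 2
 * holds the status (zero on success) and word 4 the device state; for
 * SET_DEVICE_STATE the previous state is in the high nibble of word 4
 * and the new state in the low nibble.
 */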
/*
 * Set device state.  Called with target's statlock and PHY lock held.
 */
static int
pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	uint8_t pds, nds;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
	    "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp,
	    (void *)phyp);

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_SET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	msg[3] = LE_32(ds);
	CLEAN_MESSAGE(msg, 4);

	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);
	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
	if (LE_32(msg[2]) == 0) {
		pds = (uint8_t)(LE_32(msg[4]) >> 4);
		nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds);
		if (xp != NULL) {
			xp->dev_state = nds;
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}

static void
pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt)
{
	pmcs_hw_t *pwp;

	ASSERT(pptr);
	pwp = pptr->pwp;

	if (tgt != NULL) {
		tgt->recover_wait = 0;
	}
	pptr->ds_recovery_retries = 0;

	if ((pptr->ds_prev_good_recoveries == 0) ||
	    (ddi_get_lbolt() - pptr->last_good_recovery >
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) {
		pptr->last_good_recovery = ddi_get_lbolt();
		pptr->ds_prev_good_recoveries = 1;
	} else if (ddi_get_lbolt() < pptr->last_good_recovery +
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) {
		pptr->ds_prev_good_recoveries++;
	} else {
		pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__,
		    "Max recovery attempts reached. Declaring PHY dead");
	}

	/* Don't bother to run the work queues if the PHY is dead */
	if (!pptr->dead) {
		SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES);
		(void) ddi_taskq_dispatch(pwp->tq, pmcs_worker,
		    pwp, DDI_NOSLEEP);
	}
}
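/*
 * Recover a device from a non-operational state.  This walks the PHY
 * tree depth-first (recursing on children before handling each PHY) and,
 * for every PHY whose target is waiting on recovery, runs the four-step
 * sequence spelled out below: move the device to IN_RECOVERY, hard-reset
 * the PHY, abort all outstanding I/O, then return the device to
 * OPERATIONAL.  A failure at any step is handed to
 * pmcs_handle_ds_recovery_error(), which either schedules another
 * attempt or declares the PHY dead.
 */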
void
pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp)
{
	boolean_t reschedule = B_FALSE;
	uint8_t ds, tgt_dev_state;
	int rc;
	pmcs_xscsi_t *tgt;
	pmcs_phy_t *pptr, *pnext, *pchild;

	/*
	 * First time, check to see if we're already performing recovery
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		if (pwp->ds_err_recovering) {
			mutex_exit(&pwp->lock);
			SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
			return;
		}

		pwp->ds_err_recovering = 1;
		pptr = pwp->root_phys;
		mutex_exit(&pwp->lock);
	} else {
		pptr = phyp;
	}

	while (pptr) {
		/*
		 * Since ds_err_recovering is set, we can be assured these
		 * PHYs won't disappear on us while we do this.
		 */
		pmcs_lock_phy(pptr);
		pchild = pptr->children;
		pnext = pptr->sibling;
		pmcs_unlock_phy(pptr);

		if (pchild) {
			pmcs_dev_state_recovery(pwp, pchild);
		}

		tgt = NULL;
		pmcs_lock_phy(pptr);

		if (pptr->dead || !pptr->valid_device_id) {
			goto next_phy;
		}

		if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target,
			    "%s: No DS recovery on PHY %s, iport not active",
			    __func__, pptr->path);
			goto next_phy;
		}

		tgt = pptr->target;

		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
			if (tgt->recover_wait == 0) {
				goto next_phy;
			}
			tgt_dev_state = tgt->dev_state;
		} else {
			tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;
		}
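		/*
		 * Throttle: if recovery already ran on this PHY within
		 * the last PMCS_DS_RECOVERY_INTERVAL, skip it this pass
		 * and, if the target is still waiting, reschedule another
		 * pass instead.
		 */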
		if (pptr->prev_recovery) {
			if (ddi_get_lbolt() - pptr->prev_recovery <
			    drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
				    "%s: DS recovery on PHY %s "
				    "re-invoked too soon. Skipping...",
				    __func__, pptr->path);
				if ((tgt) && (tgt->recover_wait)) {
					reschedule = B_TRUE;
				}
				goto next_phy;
			}
		}
		pptr->prev_recovery = ddi_get_lbolt();

		/*
		 * Step 1: Put the device into the IN_RECOVERY state
		 */
		rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: pmcs_get_dev_state on PHY %s "
			    "failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "pmcs_get_dev_state");

			goto next_phy;
		}

		/* If the chip says it's operational, we're done */
		if (ds == PMCS_DEVICE_STATE_OPERATIONAL) {
			pmcs_ds_operational(pptr, tgt);
			goto next_phy;
		}

		if ((tgt_dev_state == ds) &&
		    (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
		} else {
			if (tgt != NULL) {
				tgt->dev_state = ds;
			}
			tgt_dev_state = ds;
			ds = PMCS_DEVICE_STATE_IN_RECOVERY;
			rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt);
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: pmcs_send_err_recovery_cmd "
			    "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)",
			    __func__, rc, (void *)tgt, ds, tgt_dev_state);

			if (rc) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_send_err_recovery_cmd to PHY %s "
				    "failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
				    __func__, "pmcs_send_err_recovery_cmd");

				goto next_phy;
			}
		}

		/*
		 * Step 2: Perform a hard reset on the PHY.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Issue HARD_RESET to PHY %s", __func__,
		    pptr->path);
		/*
		 * Must release statlock here because pmcs_reset_phy
		 * will drop and reacquire the PHY lock.
		 */
		if (tgt != NULL) {
			mutex_exit(&tgt->statlock);
		}
		rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (rc) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: HARD_RESET to PHY %s failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "HARD_RESET");

			goto next_phy;
		}

		/*
		 * Step 3: Abort all I/Os to the device
		 */
		if (pptr->abort_all_start) {
			while (pptr->abort_all_start) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Waiting for outstanding ABORT_ALL on "
				    "PHY 0x%p", __func__, (void *)pptr);
				cv_wait(&pptr->abort_all_cv, &pptr->phy_lock);
			}
		} else {
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1);
			if (tgt != NULL) {
				mutex_enter(&tgt->statlock);
			}
			if (rc != 0) {
				pptr->abort_pending = 1;
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_abort to PHY %s failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt,
				    pwp, __func__, "pmcs_abort");

				goto next_phy;
			}
		}

		/*
		 * Step 4: Set the device back to OPERATIONAL state
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state",
		    __func__, (void *)pptr, (void *)tgt);
		rc = pmcs_set_dev_state(pwp, pptr, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (rc == 0) {
			pmcs_ds_operational(pptr, tgt);
		} else {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Failed to SET tgt 0x%p to OPERATIONAL state",
			    __func__, (void *)tgt);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "SET tgt to OPERATIONAL state");

			goto next_phy;
		}

next_phy:
		if (tgt) {
			mutex_exit(&tgt->statlock);
		}
		pmcs_unlock_phy(pptr);
		pptr = pnext;
	}

	/*
	 * Only clear ds_err_recovering if we're exiting for good and not
	 * just unwinding from recursion
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		pwp->ds_err_recovering = 0;
		mutex_exit(&pwp->lock);
	}

	if (reschedule) {
		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
	}
}
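/*
 * Request a device-state transition on behalf of recovery.  Only two
 * transitions are driven to the chip: into IN_RECOVERY (a no-op if the
 * target is already there), and from IN_RECOVERY back to OPERATIONAL.
 * A request for NON_OPERATIONAL doesn't touch the chip at all; it just
 * records the state and kicks discovery so the device is reconfigured.
 */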
/*
 * Called with target's statlock held (if target is non-NULL) and PHY
 * lock held.
 */
int
pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt)
{
	int rc = -1;
	uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;

	if (tgt != NULL) {
		ASSERT(mutex_owned(&tgt->statlock));
		if (tgt->recovering) {
			return (0);
		}

		tgt->recovering = 1;
		tgt_dev_state = tgt->dev_state;
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt,
		    "%s: PHY is NULL", __func__);
		/* Fall into no_action so tgt->recovering gets cleared */
		goto no_action;
	}

	ASSERT(mutex_owned(&phyp->phy_lock));

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
	    "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state);

	switch (dev_state) {
	case PMCS_DEVICE_STATE_IN_RECOVERY:
		if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
			rc = 0;	/* This is not an error */
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_IN_RECOVERY);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY",
			    __func__, (void *)tgt);
		}

		break;

	case PMCS_DEVICE_STATE_OPERATIONAL:
		if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p not ready to go OPERATIONAL",
			    __func__, (void *)tgt);
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (tgt != NULL) {
			tgt->reset_success = 1;
		}
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL",
			    __func__, (void *)tgt);
			if (tgt != NULL) {
				tgt->reset_success = 0;
			}
		}

		break;

	case PMCS_DEVICE_STATE_NON_OPERATIONAL:
		PHY_CHANGED(pwp, phyp);
		RESTART_DISCOVERY(pwp);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Device at %s is non-operational",
		    __func__, phyp->path);
		if (tgt != NULL) {
			tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL;
		}
		rc = 0;

		break;

	default:
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Invalid state requested (%d)", __func__,
		    dev_state);
		break;
	}

no_action:
	if (tgt != NULL) {
		tgt->recovering = 0;
	}
	return (rc);
}
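/*
 * SSP event recovery runs in three stages: pmcs_start_ssp_event_recovery()
 * is called at command completion time and only flags work for the
 * recovery task; pmcs_ssp_event_recovery() runs from the worker taskq
 * and scans the targets' active queues for flagged commands; and
 * pmcs_tgt_event_recovery() issues the task management functions
 * (QUERY TASK/ABORT TASK) for each such command, escalating to full
 * device state recovery if those fail.
 */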
/*
 * Start SSP event recovery.  The recovery operation must be scheduled
 * onto a worker thread because it involves sending multiple commands to
 * the device, which should not be done in interrupt context.  If this is
 * the failure of a recovery command itself, let the recovery thread deal
 * with it.  Called with the work lock held.
 */
void
pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb,
    size_t amt)
{
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event = LE_32(iomb[2]);
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cb_t callback;
	uint32_t tag;

	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			if (pptr) {
				pmcs_dec_phy_ref_count(pptr);
			}
			pptr = NULL;
			pwrk->phy = NULL;
		}
		mutex_exit(&tgt->statlock);
	}

	if (pptr == NULL) {
		/*
		 * No target, need to run RE-DISCOVERY here.
		 */
		if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) {
			pwrk->state = PMCS_WORK_STATE_INTR;
		}
		/*
		 * Although we cannot mark the PHY to force an abort nor
		 * mark it as changed, killing the target will take care
		 * of aborting commands for the device.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: No valid target for event processing. Reconfigure.",
		    __func__);
		pmcs_pwork(pwp, pwrk);
		RESTART_DISCOVERY(pwp);
		return;
	} else {
		/* We have a phy pointer, so we'll need to lock it */
		mutex_exit(&pwrk->lock);
		pmcs_lock_phy(pptr);
		mutex_enter(&pwrk->lock);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) {
			if ((tgt != NULL) && (tgt->dev_state !=
			    PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Device at %s is non-operational",
				    __func__, pptr->path);
				tgt->dev_state =
				    PMCS_DEVICE_STATE_NON_OPERATIONAL;
			}
			pptr->abort_pending = 1;
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE);
			RESTART_DISCOVERY(pwp);
			return;
		}

		/*
		 * If this command was run in WAIT mode, it is a failing
		 * recovery command.  If so, just wake up the recovery
		 * thread waiting for command completion.
		 */
		tag = PMCS_TAG_TYPE(pwrk->htag);
		if (tag == PMCS_TAG_TYPE_WAIT) {
			pwrk->htag |= PMCS_TAG_DONE;
			if (pwrk->arg && amt) {
				(void) memcpy(pwrk->arg, iomb, amt);
			}
			cv_signal(&pwrk->sleep_cv);
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		if (tgt == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, NULL,
			    "%s: Not scheduling SSP event recovery for NULL tgt"
			    " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk,
			    pwrk->htag);
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		/*
		 * If the SSP event was an OPEN_RETRY_TIMEOUT, we don't want
		 * to go through the recovery (abort/LU reset) process.
		 * Simply complete the command and return it as STATUS_BUSY.
		 * This will cause the target driver to simply retry.
		 */
		if (event == PMCOUT_STATUS_IO_XFER_OPEN_RETRY_TIMEOUT) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: Got OPEN_RETRY_TIMEOUT event (htag 0x%08x)",
			    __func__, pwrk->htag);

			mutex_exit(&tgt->statlock);
			/* Note: work remains locked for the callback */
			pmcs_unlock_phy(pptr);
			pwrk->ssp_event = event;
			callback = (pmcs_cb_t)pwrk->ptr;
			(*callback)(pwp, pwrk, iomb);
			return;
		}

		/*
		 * To recover from primary failures, we need to schedule
		 * event recovery handling.
		 */
		tgt->event_recovery = 1;
		mutex_exit(&tgt->statlock);
		pwrk->ssp_event = event;
		mutex_exit(&pwrk->lock);
		pmcs_unlock_phy(pptr);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Scheduling SSP event recovery for tgt(0x%p) "
		    "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk,
		    pwrk->htag);
		SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY);
	}

	/* Work cannot be completed until event recovery is completed. */
}
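/*
 * For BREAK, PHY_NOT_READY and ACK/NAK-timeout events the command may or
 * may not still be queued on the device, so QUERY TASK is issued first;
 * if the device reports the task complete, only the host-side copy needs
 * to be aborted on the chip.  For all other events, ABORT TASK is sent
 * before the on-chip abort.
 */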
/*
 * SSP target event recovery.
 * phy->lock should be held upon entry.
 * pwrk->lock should be held upon entry and gets released by this routine.
 * tgt->statlock should not be held.
 */
void
pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk)
{
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cmd_t *sp = pwrk->arg;
	pmcs_lun_t *lun = sp->cmd_lun;
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event;
	uint32_t htag;
	uint32_t status;
	int rv;

	ASSERT(pwrk->arg != NULL);
	ASSERT(pwrk->xp != NULL);
	pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
	    "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
	htag = pwrk->htag;
	event = pwrk->ssp_event;
	pwrk->ssp_event = 0xffffffff;

	mutex_exit(&pwrk->lock);

	if (event == PMCOUT_STATUS_XFER_ERR_BREAK ||
	    event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY ||
	    event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
		/* The command may still be pending on the device */
		rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag,
		    lun->lun_num, &status);
		if (rv != 0) {
			goto out;
		}
		if (status == SAS_RSP_TMF_COMPLETE) {
			/* The command is NOT pending on the device */
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: No pending command for tgt 0x%p",
			    __func__, (void *)tgt);
			/* Nothing more to do, just abort it on chip */
			htag = 0;
		}
	}
	/*
	 * All other events leave the command pending in the host.
	 * Send an abort task, then abort it on the chip.
	 */
	if (htag != 0) {
		if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag,
		    lun->lun_num, &status))
			goto out;
	}
	(void) pmcs_abort(pwp, pptr, htag, 0, 1);
	/*
	 * The abort either took care of work completion or put the
	 * device into a recovery state.
	 */
	return;
out:
	/* The abort failed; do full device recovery */
	mutex_enter(&pwrk->lock);
	tgt = pwrk->xp;
	mutex_exit(&pwrk->lock);
	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		pmcs_start_dev_state_recovery(tgt, pptr);
		mutex_exit(&tgt->statlock);
	}
}
/*
 * SSP event recovery task.
 */
void
pmcs_ssp_event_recovery(pmcs_hw_t *pwp)
{
	int idx;
	pmcs_xscsi_t *tgt;
	pmcs_cmd_t *cp;
	pmcwork_t *pwrk;
	pmcs_phy_t *pphy;
	int er_flag;
	uint32_t idxpwrk;

restart:
	for (idx = 0; idx < pwp->max_dev; idx++) {
		mutex_enter(&pwp->lock);
		tgt = pwp->targets[idx];
		mutex_exit(&pwp->lock);
		if (tgt == NULL) {
			continue;
		}

		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			mutex_exit(&tgt->statlock);
			continue;
		}
		pphy = tgt->phy;
		er_flag = tgt->event_recovery;
		mutex_exit(&tgt->statlock);

		if ((pphy == NULL) || (er_flag == 0)) {
			continue;
		}

		pmcs_lock_phy(pphy);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: found target(0x%p)", __func__, (void *) tgt);

		/* Check which commands expect recovery */
		mutex_enter(&tgt->aqlock);
		STAILQ_FOREACH(cp, &tgt->aq, cmd_next) {
			idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag);
			pwrk = &pwp->work[idxpwrk];
			mutex_enter(&pwrk->lock);
			if (pwrk->htag != cp->cmd_tag) {
				/*
				 * The aq may contain TMF commands, so we
				 * may not find a work structure with this
				 * htag.
				 */
				mutex_exit(&pwrk->lock);
				continue;
			}
			if (!PMCS_COMMAND_DONE(pwrk) &&
			    (pwrk->ssp_event != 0) &&
			    (pwrk->ssp_event != PMCS_REC_EVENT)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: pwrk(%p) htag(0x%x)",
				    __func__, (void *) pwrk, cp->cmd_tag);
				mutex_exit(&tgt->aqlock);
				/*
				 * pwrk->lock gets dropped in
				 * pmcs_tgt_event_recovery()
				 */
				pmcs_tgt_event_recovery(pwp, pwrk);
				pmcs_unlock_phy(pphy);
				/* All bets are off on tgt/aq now, restart */
				goto restart;
			}
			mutex_exit(&pwrk->lock);
		}
		mutex_exit(&tgt->aqlock);
		mutex_enter(&tgt->statlock);
		tgt->event_recovery = 0;
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: end of SSP event recovery for target(0x%p)",
		    __func__, (void *) tgt);
		mutex_exit(&tgt->statlock);
		pmcs_unlock_phy(pphy);
	}
	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
	    "%s: end of SSP event recovery for pwp(0x%p)", __func__,
	    (void *) pwp);
}

void
pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
{
	ASSERT(mutex_owned(&xp->statlock));
	ASSERT(xp->pwp != NULL);

	if (xp->recover_wait == 0) {
		pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
		    __func__, (void *)xp, (void *)phyp, phyp->path);
		xp->recover_wait = 1;

		/*
		 * Rather than waiting for the watchdog timer, we'll
		 * kick it right now.
		 */
		SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY);
		(void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
		    DDI_NOSLEEP);
	}
}
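/*
 * A PHY is declared dead here under one of two conditions: the per-PHY
 * retry count exceeds PMCS_MAX_DS_RECOVERY_RETRIES, or too many
 * "successful" recoveries have accumulated (ds_prev_good_recoveries,
 * maintained by pmcs_ds_operational()), which indicates a flapping
 * device that keeps cycling through recovery.
 */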
/*
 * Increment the PHY ds error retry count.
 * If there have been too many retries, mark the PHY dead and restart
 * discovery; otherwise schedule ds recovery.
 */
static void
pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt,
    pmcs_hw_t *pwp, const char *func_name, char *reason_string)
{
	ASSERT(mutex_owned(&phyp->phy_lock));
	ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock));

	phyp->ds_recovery_retries++;

	if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt,
		    "%s: retry limit reached after %s to PHY %s failed",
		    func_name, reason_string, phyp->path);
		if (tgt != NULL) {
			tgt->recover_wait = 0;
		}
		/*
		 * Mark the PHY as dead, flag it and its parent as changed,
		 * then restart discovery
		 */
		phyp->dead = 1;
		PHY_CHANGED(pwp, phyp);
		if (phyp->parent)
			PHY_CHANGED(pwp, phyp->parent);
		RESTART_DISCOVERY(pwp);
	} else if ((phyp->ds_prev_good_recoveries >
	    PMCS_MAX_DS_RECOVERY_RETRIES) &&
	    (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)
	    < ddi_get_lbolt())) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of "
		    "successful recoveries reached, declaring PHY %s dead",
		    __func__, phyp->path);
		if (tgt != NULL) {
			tgt->recover_wait = 0;
		}
		/*
		 * Mark the PHY as dead, flag it and its parent as changed,
		 * then restart discovery
		 */
		phyp->dead = 1;
		PHY_CHANGED(pwp, phyp);
		if (phyp->parent)
			PHY_CHANGED(pwp, phyp->parent);
		RESTART_DISCOVERY(pwp);
	} else {
		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
	}
}