/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * PM8001 device state recovery routines
 */

#include <sys/scsi/adapters/pmcs/pmcs.h>

/*
 * Forward declarations
 */
static void pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt);
static void pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt, pmcs_hw_t *pwp, const char *func_name,
    char *reason_string);

/*
 * Get device state. Called with statlock and PHY lock held.
 */
static int
pmcs_get_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t *ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG3, phyp, xp, "%s: tgt(0x%p)", __func__,
	    (void *)xp);

	if (xp != NULL) {
		ASSERT(mutex_owned(&xp->statlock));
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}
	ASSERT(mutex_owned(&phyp->phy_lock));

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;

	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
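	/*
	 * Build the GET_DEVICE_STATE IOMB: opcode, work-structure tag,
	 * and the device ID for this PHY.
	 */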
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_GET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	CLEAN_MESSAGE(msg, 3);

	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

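	/*
	 * Drop the statlock (if we have a target) and the PHY lock while
	 * we wait for the command to complete; both are reacquired below.
	 */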
	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);

	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
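	/*
	 * A status of zero in msg[2] indicates success; the retrieved
	 * device state is in the low byte of msg[4].
	 */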
	if (LE_32(msg[2]) == 0) {
		*ds = (uint8_t)(LE_32(msg[4]));
		if (xp == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x", __func__, *ds);
		} else if (*ds != xp->dev_state) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
			    "%s: retrieved_ds=0x%x, target_ds=0x%x", __func__,
			    *ds, xp->dev_state);
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}

/*
 * Set device state. Called with target's statlock and PHY lock held.
 */
static int
pmcs_set_dev_state(pmcs_hw_t *pwp, pmcs_phy_t *phyp, pmcs_xscsi_t *xp,
    uint8_t ds)
{
	uint32_t htag, *ptr, msg[PMCS_MSG_SIZE];
	int result;
	uint8_t pds, nds;
	struct pmcwork *pwrk;

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
	    "%s: ds: 0x%x tgt: 0x%p phy: 0x%p", __func__, ds, (void *)xp,
	    (void *)phyp);

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, xp,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}

	pwrk = pmcs_gwork(pwp, PMCS_TAG_TYPE_WAIT, phyp);
	if (pwrk == NULL) {
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nowrk, __func__);
		return (-1);
	}
	if (phyp->valid_device_id == 0) {
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Invalid DeviceID", __func__);
		return (-1);
	}
	pwrk->arg = msg;
	pwrk->dtype = phyp->dtype;
	htag = pwrk->htag;
	msg[0] = LE_32(PMCS_HIPRI(pwp, PMCS_OQ_GENERAL,
	    PMCIN_SET_DEVICE_STATE));
	msg[1] = LE_32(pwrk->htag);
	msg[2] = LE_32(phyp->device_id);
	msg[3] = LE_32(ds);
	CLEAN_MESSAGE(msg, 4);

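	/* Queue the SET_DEVICE_STATE IOMB on the PMCS_IQ_OTHER inbound queue. */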
	mutex_enter(&pwp->iqp_lock[PMCS_IQ_OTHER]);
	ptr = GET_IQ_ENTRY(pwp, PMCS_IQ_OTHER);
	if (ptr == NULL) {
		mutex_exit(&pwp->iqp_lock[PMCS_IQ_OTHER]);
		pmcs_pwork(pwp, pwrk);
		pmcs_prt(pwp, PMCS_PRT_ERR, phyp, xp, pmcs_nomsg, __func__);
		return (-1);
	}
	COPY_MESSAGE(ptr, msg, PMCS_MSG_SIZE);
	pwrk->state = PMCS_WORK_STATE_ONCHIP;
	INC_IQ_ENTRY(pwp, PMCS_IQ_OTHER);

	if (xp != NULL) {
		mutex_exit(&xp->statlock);
	}
	pmcs_unlock_phy(phyp);
	WAIT_FOR(pwrk, 1000, result);
	pmcs_pwork(pwp, pwrk);
	pmcs_lock_phy(phyp);
	if (xp != NULL) {
		mutex_enter(&xp->statlock);
	}

	if (result) {
		pmcs_timed_out(pwp, htag, __func__);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd timed out, returning", __func__);
		return (-1);
	}
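	/*
	 * On success, the firmware returns the previous and new device
	 * states packed into msg[4]: previous in the high nibble, new
	 * in the low nibble.
	 */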
	if (LE_32(msg[2]) == 0) {
		pds = (uint8_t)(LE_32(msg[4]) >> 4);
		nds = (uint8_t)(LE_32(msg[4]) & 0x0000000f);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: previous_ds=0x%x, new_ds=0x%x", __func__, pds, nds);
		if (xp != NULL) {
			xp->dev_state = nds;
		}
		return (0);
	} else {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: cmd failed Status(0x%x), returning", __func__,
		    LE_32(msg[2]));
		return (-1);
	}
}

static void
pmcs_ds_operational(pmcs_phy_t *pptr, pmcs_xscsi_t *tgt)
{
	pmcs_hw_t *pwp;

	ASSERT(pptr);
	pwp = pptr->pwp;

	if (tgt != NULL) {
		tgt->recover_wait = 0;
	}
	pptr->ds_recovery_retries = 0;

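	/*
	 * Track the rate of successful recoveries: restart the count if
	 * this is the first good recovery or the previous one fell
	 * outside the PMCS_MAX_DS_RECOVERY_TIME window; otherwise bump
	 * the count of good recoveries within the window.
	 */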
	if ((pptr->ds_prev_good_recoveries == 0) ||
	    (ddi_get_lbolt() - pptr->last_good_recovery >
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME))) {
		pptr->last_good_recovery = ddi_get_lbolt();
		pptr->ds_prev_good_recoveries = 1;
	} else if (ddi_get_lbolt() < pptr->last_good_recovery +
	    drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)) {
		pptr->ds_prev_good_recoveries++;
	} else {
		pmcs_handle_ds_recovery_error(pptr, tgt, pwp, __func__,
		    "Max recovery attempts reached. Declaring PHY dead");
	}

	/* Don't bother to run the work queues if the PHY is dead */
	if (!pptr->dead) {
		SCHEDULE_WORK(pwp, PMCS_WORK_RUN_QUEUES);
		(void) ddi_taskq_dispatch(pwp->tq, pmcs_worker,
		    pwp, DDI_NOSLEEP);
	}
}

void
pmcs_dev_state_recovery(pmcs_hw_t *pwp, pmcs_phy_t *phyp)
{
	boolean_t reschedule = B_FALSE;
	uint8_t ds, tgt_dev_state;
	int rc;
	pmcs_xscsi_t *tgt;
	pmcs_phy_t *pptr, *pnext, *pchild;

	/*
	 * First time through (non-recursive call), check whether we're
	 * already performing recovery.
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		if (pwp->ds_err_recovering) {
			mutex_exit(&pwp->lock);
			SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
			return;
		}

		pwp->ds_err_recovering = 1;
		pptr = pwp->root_phys;
		mutex_exit(&pwp->lock);
	} else {
		pptr = phyp;
	}

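	/*
	 * Walk the PHY tree depth-first, recursing into each PHY's
	 * children before attempting recovery on the PHY itself.
	 */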
	while (pptr) {
		/*
		 * Since ds_err_recovering is set, we can be assured these
		 * PHYs won't disappear on us while we do this.
		 */
		pmcs_lock_phy(pptr);
		pchild = pptr->children;
		pnext = pptr->sibling;
		pmcs_unlock_phy(pptr);

		if (pchild) {
			pmcs_dev_state_recovery(pwp, pchild);
		}

		tgt = NULL;
		pmcs_lock_phy(pptr);

		if (pptr->dead || !pptr->valid_device_id) {
			goto next_phy;
		}

		if (pptr->iport && (pptr->iport->ua_state != UA_ACTIVE)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, pptr->target,
			    "%s: No DS recovery on PHY %s, iport not active",
			    __func__, pptr->path);
			goto next_phy;
		}

		tgt = pptr->target;

		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
			if (tgt->recover_wait == 0) {
				goto next_phy;
			}
			tgt_dev_state = tgt->dev_state;
		} else {
			tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;
		}

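		/*
		 * Throttle recovery: if we ran recovery on this PHY within
		 * the last PMCS_DS_RECOVERY_INTERVAL, skip it for now and
		 * reschedule if the target is still waiting.
		 */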
		if (pptr->prev_recovery) {
			if (ddi_get_lbolt() - pptr->prev_recovery <
			    drv_usectohz(PMCS_DS_RECOVERY_INTERVAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG2, pptr, tgt,
				    "%s: DS recovery on PHY %s "
				    "re-invoked too soon. Skipping...",
				    __func__, pptr->path);
				if ((tgt) && (tgt->recover_wait)) {
					reschedule = B_TRUE;
				}
				goto next_phy;
			}
		}
		pptr->prev_recovery = ddi_get_lbolt();

		/*
		 * Step 1: Put the device into the IN_RECOVERY state
		 */
		rc = pmcs_get_dev_state(pwp, pptr, tgt, &ds);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: pmcs_get_dev_state on PHY %s "
			    "failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "pmcs_get_dev_state");

			goto next_phy;
		}

		/* If the chip says it's operational, we're done */
		if (ds == PMCS_DEVICE_STATE_OPERATIONAL) {
			pmcs_ds_operational(pptr, tgt);
			goto next_phy;
		}

		if ((tgt_dev_state == ds) &&
		    (ds == PMCS_DEVICE_STATE_IN_RECOVERY)) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
		} else {
			if (tgt != NULL) {
				tgt->dev_state = ds;
			}
			tgt_dev_state = ds;
			ds = PMCS_DEVICE_STATE_IN_RECOVERY;
			rc = pmcs_send_err_recovery_cmd(pwp, ds, pptr, tgt);
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: pmcs_send_err_recovery_cmd "
			    "result(%d) tgt(0x%p) ds(0x%x) tgt->ds(0x%x)",
			    __func__, rc, (void *)tgt, ds, tgt_dev_state);

			if (rc) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_send_err_recovery_cmd to PHY %s "
				    "failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
				    __func__, "pmcs_send_err_recovery_cmd");

				goto next_phy;
			}
		}

		/*
		 * Step 2: Perform a hard reset on the PHY.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Issue HARD_RESET to PHY %s", __func__,
		    pptr->path);
		/*
		 * Must release statlock here because pmcs_reset_phy
		 * will drop and reacquire the PHY lock.
		 */
		if (tgt != NULL) {
			mutex_exit(&tgt->statlock);
		}
		rc = pmcs_reset_phy(pwp, pptr, PMCS_PHYOP_HARD_RESET);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (rc) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: HARD_RESET to PHY %s failed (rc=%d)",
			    __func__, pptr->path, rc);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "HARD_RESET");

			goto next_phy;
		}

		/*
		 * Step 3: Abort all I/Os to the device
		 */
		if (pptr->abort_all_start) {
			while (pptr->abort_all_start) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Waiting for outstanding ABORT_ALL on "
				    "PHY 0x%p", __func__, (void *)pptr);
				cv_wait(&pptr->abort_all_cv, &pptr->phy_lock);
			}
		} else {
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			rc = pmcs_abort(pwp, pptr, pptr->device_id, 1, 1);
			if (tgt != NULL) {
				mutex_enter(&tgt->statlock);
			}
			if (rc != 0) {
				pptr->abort_pending = 1;
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: pmcs_abort to PHY %s failed (rc=%d)",
				    __func__, pptr->path, rc);

				pmcs_handle_ds_recovery_error(pptr, tgt,
				    pwp, __func__, "pmcs_abort");

				goto next_phy;
			}
		}

		/*
		 * Step 4: Set the device back to OPERATIONAL state
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
		    "%s: Set PHY/tgt 0x%p/0x%p to OPERATIONAL state",
		    __func__, (void *)pptr, (void *)tgt);
		rc = pmcs_set_dev_state(pwp, pptr, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (rc == 0) {
			pmcs_ds_operational(pptr, tgt);
		} else {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, pptr, tgt,
			    "%s: Failed to SET tgt 0x%p to OPERATIONAL state",
			    __func__, (void *)tgt);

			pmcs_handle_ds_recovery_error(pptr, tgt, pwp,
			    __func__, "SET tgt to OPERATIONAL state");

			goto next_phy;
		}

next_phy:
		if (tgt) {
			mutex_exit(&tgt->statlock);
		}
		pmcs_unlock_phy(pptr);
		pptr = pnext;
	}

	/*
	 * Only clear ds_err_recovering if we're exiting for good and not
	 * just unwinding from recursion
	 */
	if (phyp == NULL) {
		mutex_enter(&pwp->lock);
		pwp->ds_err_recovering = 0;
		mutex_exit(&pwp->lock);
	}

	if (reschedule) {
		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
	}
}

/*
 * Called with target's statlock held (if target is non-NULL) and PHY lock held.
 */
int
pmcs_send_err_recovery_cmd(pmcs_hw_t *pwp, uint8_t dev_state, pmcs_phy_t *phyp,
    pmcs_xscsi_t *tgt)
{
	int rc = -1;
	uint8_t tgt_dev_state = PMCS_DEVICE_STATE_NOT_AVAILABLE;

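	/*
	 * If the target is already being recovered, just return; the
	 * recovering flag keeps this routine from running re-entrantly
	 * on the same target.
	 */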
	if (tgt != NULL) {
		ASSERT(mutex_owned(&tgt->statlock));
		if (tgt->recovering) {
			return (0);
		}

		tgt->recovering = 1;
		tgt_dev_state = tgt->dev_state;
	}

	if (phyp == NULL) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, NULL, tgt,
		    "%s: PHY is NULL", __func__);
		return (-1);
	}

	ASSERT(mutex_owned(&phyp->phy_lock));

	pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
	    "%s: ds: 0x%x, tgt ds(0x%x)", __func__, dev_state, tgt_dev_state);

	switch (dev_state) {
	case PMCS_DEVICE_STATE_IN_RECOVERY:
		if (tgt_dev_state == PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p already IN_RECOVERY", __func__,
			    (void *)tgt);
			rc = 0; /* This is not an error */
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_IN_RECOVERY);
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(1): Failed to set tgt(0x%p) to IN_RECOVERY",
			    __func__, (void *)tgt);
		}

		break;

	case PMCS_DEVICE_STATE_OPERATIONAL:
		if (tgt_dev_state != PMCS_DEVICE_STATE_IN_RECOVERY) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s: Target 0x%p not ready to go OPERATIONAL",
			    __func__, (void *)tgt);
			goto no_action;
		}

		rc = pmcs_set_dev_state(pwp, phyp, tgt,
		    PMCS_DEVICE_STATE_OPERATIONAL);
		if (tgt != NULL) {
			tgt->reset_success = 1;
		}
		if (rc != 0) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
			    "%s(2): Failed to SET tgt(0x%p) to OPERATIONAL",
			    __func__, (void *)tgt);
			if (tgt != NULL) {
				tgt->reset_success = 0;
			}
		}

		break;

	case PMCS_DEVICE_STATE_NON_OPERATIONAL:
		PHY_CHANGED(pwp, phyp);
		RESTART_DISCOVERY(pwp);
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Device at %s is non-operational",
		    __func__, phyp->path);
		if (tgt != NULL) {
			tgt->dev_state = PMCS_DEVICE_STATE_NON_OPERATIONAL;
		}
		rc = 0;

		break;

	default:
		pmcs_prt(pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, tgt,
		    "%s: Invalid state requested (%d)", __func__,
		    dev_state);
		break;

	}

no_action:
	if (tgt != NULL) {
		tgt->recovering = 0;
	}
	return (rc);
}

/*
 * Start SSP event recovery. We have to schedule the recovery operation
 * because it involves sending multiple commands to the device, which we
 * should not do in interrupt context.
 * If this is the failure of a recovery command, let the recovery thread
 * deal with it.
 * Called with the work lock held.
 */
void
pmcs_start_ssp_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk, uint32_t *iomb,
    size_t amt)
{
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event = LE_32(iomb[2]);
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cb_t callback;
	uint32_t tag;

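	/*
	 * If the target is no longer assigned, drop our reference on the
	 * PHY and proceed as if we had no PHY at all.
	 */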
	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			if (pptr) {
				pmcs_dec_phy_ref_count(pptr);
			}
			pptr = NULL;
			pwrk->phy = NULL;
		}
		mutex_exit(&tgt->statlock);
	}

	if (pptr == NULL) {
		/*
		 * No valid PHY/target; we need to run RE-DISCOVERY here.
		 */
		if (pwrk->state != PMCS_WORK_STATE_TIMED_OUT) {
			pwrk->state = PMCS_WORK_STATE_INTR;
		}
		/*
		 * Although we can neither mark the PHY to force an abort
		 * nor mark it as changed, killing the target will take
		 * care of aborting commands for the device.
		 */
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: No valid target for event processing. Reconfigure.",
		    __func__);
		pmcs_pwork(pwp, pwrk);
		RESTART_DISCOVERY(pwp);
		return;
	} else {
		/* We have a PHY pointer; we'll need to lock it */
		mutex_exit(&pwrk->lock);
		pmcs_lock_phy(pptr);
		mutex_enter(&pwrk->lock);
		if (tgt != NULL) {
			mutex_enter(&tgt->statlock);
		}
		if (event == PMCOUT_STATUS_OPEN_CNX_ERROR_IT_NEXUS_LOSS) {
			if ((tgt != NULL) && (tgt->dev_state !=
			    PMCS_DEVICE_STATE_NON_OPERATIONAL)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
				    "%s: Device at %s is non-operational",
				    __func__, pptr->path);
				tgt->dev_state =
				    PMCS_DEVICE_STATE_NON_OPERATIONAL;
			}
			pptr->abort_pending = 1;
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			SCHEDULE_WORK(pwp, PMCS_WORK_ABORT_HANDLE);
			RESTART_DISCOVERY(pwp);
			return;
		}

		/*
		 * If this command was run in WAIT mode, it is a failing
		 * recovery command; just wake up the recovery thread
		 * waiting for the command's completion.
		 */
		tag = PMCS_TAG_TYPE(pwrk->htag);
		if (tag == PMCS_TAG_TYPE_WAIT) {
			pwrk->htag |= PMCS_TAG_DONE;
			if (pwrk->arg && amt) {
				(void) memcpy(pwrk->arg, iomb, amt);
			}
			cv_signal(&pwrk->sleep_cv);
			if (tgt != NULL) {
				mutex_exit(&tgt->statlock);
			}
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		if (tgt == NULL) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG1, pptr, NULL,
			    "%s: Not scheduling SSP event recovery for NULL tgt"
			    " pwrk(%p) tag(0x%x)", __func__, (void *)pwrk,
			    pwrk->htag);
			mutex_exit(&pwrk->lock);
			pmcs_unlock_phy(pptr);
			return;
		}

		/*
		 * If the SSP event was an OPEN_RETRY_TIMEOUT, we don't want
		 * to go through the recovery (abort/LU reset) process.
		 * Instead, complete the command and return it as STATUS_BUSY,
		 * which causes the target driver to retry.
		 */
		if (event == PMCOUT_STATUS_IO_XFER_OPEN_RETRY_TIMEOUT) {
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: Got OPEN_RETRY_TIMEOUT event (htag 0x%08x)",
			    __func__, pwrk->htag);

			mutex_exit(&tgt->statlock);
			/* Note: work remains locked for the callback */
			pmcs_unlock_phy(pptr);
			pwrk->ssp_event = event;
			callback = (pmcs_cb_t)pwrk->ptr;
			(*callback)(pwp, pwrk, iomb);
			return;
		}

		/*
		 * To recover from the remaining failures, we need to
		 * schedule event recovery handling.
		 */
		tgt->event_recovery = 1;
		mutex_exit(&tgt->statlock);
		pwrk->ssp_event = event;
		mutex_exit(&pwrk->lock);
		pmcs_unlock_phy(pptr);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
		    "%s: Scheduling SSP event recovery for tgt(0x%p) "
		    "pwrk(%p) tag(0x%x)", __func__, (void *)tgt, (void *)pwrk,
		    pwrk->htag);
		SCHEDULE_WORK(pwp, PMCS_WORK_SSP_EVT_RECOVERY);
	}

	/* Work cannot be completed until event recovery is completed. */
}

/*
 * SSP target event recovery
 * phy->lock should be held upon entry.
 * pwrk->lock should be held upon entry and gets released by this routine.
 * tgt->statlock should not be held.
 */
void
pmcs_tgt_event_recovery(pmcs_hw_t *pwp, pmcwork_t *pwrk)
{
	pmcs_phy_t *pptr = pwrk->phy;
	pmcs_cmd_t *sp = pwrk->arg;
	pmcs_lun_t *lun = sp->cmd_lun;
	pmcs_xscsi_t *tgt = pwrk->xp;
	uint32_t event;
	uint32_t htag;
	uint32_t status;
	int rv;

	ASSERT(pwrk->arg != NULL);
	ASSERT(pwrk->xp != NULL);
	pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
	    "%s: event recovery for target 0x%p", __func__, (void *)pwrk->xp);
	htag = pwrk->htag;
	event = pwrk->ssp_event;
	pwrk->ssp_event = 0xffffffff;

	mutex_exit(&pwrk->lock);

	if (event == PMCOUT_STATUS_XFER_ERR_BREAK ||
	    event == PMCOUT_STATUS_XFER_ERR_PHY_NOT_READY ||
	    event == PMCOUT_STATUS_XFER_ERROR_CMD_ISSUE_ACK_NAK_TIMEOUT) {
		/* The command may still be pending on the device */
		rv = pmcs_ssp_tmf(pwp, pptr, SAS_QUERY_TASK, htag,
		    lun->lun_num, &status);
		if (rv != 0) {
			goto out;
		}
		if (status == SAS_RSP_TMF_COMPLETE) {
			/* Command NOT pending on the device */
			pmcs_prt(pwp, PMCS_PRT_DEBUG, pptr, tgt,
			    "%s: No pending command for tgt 0x%p",
			    __func__, (void *)tgt);
			/* Nothing more to do, just abort it on chip */
			htag = 0;
		}
	}
	/*
	 * All other events left the command pending in the host.
	 * Send ABORT TASK and abort it on the chip.
	 */
	if (htag != 0) {
		if (pmcs_ssp_tmf(pwp, pptr, SAS_ABORT_TASK, htag,
		    lun->lun_num, &status))
			goto out;
	}
	(void) pmcs_abort(pwp, pptr, htag, 0, 1);
	/*
	 * The abort either took care of work completion or put the device
	 * into a recovery state.
	 */
	return;
out:
	/* Abort failed, do full device recovery */
	mutex_enter(&pwrk->lock);
	tgt = pwrk->xp;
	mutex_exit(&pwrk->lock);
	if (tgt != NULL) {
		mutex_enter(&tgt->statlock);
		pmcs_start_dev_state_recovery(tgt, pptr);
		mutex_exit(&tgt->statlock);
	}
}

/*
 * SSP event recovery task.
 */
void
pmcs_ssp_event_recovery(pmcs_hw_t *pwp)
{
	int idx;
	pmcs_xscsi_t *tgt;
	pmcs_cmd_t *cp;
	pmcwork_t *pwrk;
	pmcs_phy_t *pphy;
	int er_flag;
	uint32_t idxpwrk;

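	/*
	 * Scan all targets for those flagged for event recovery.  The scan
	 * restarts from the top whenever a command is handed off to
	 * pmcs_tgt_event_recovery(), since the target's active queue may
	 * have changed underneath us.
	 */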
restart:
	for (idx = 0; idx < pwp->max_dev; idx++) {
		mutex_enter(&pwp->lock);
		tgt = pwp->targets[idx];
		mutex_exit(&pwp->lock);
		if (tgt == NULL) {
			continue;
		}

		mutex_enter(&tgt->statlock);
		if (!tgt->assigned) {
			mutex_exit(&tgt->statlock);
			continue;
		}
		pphy = tgt->phy;
		er_flag = tgt->event_recovery;
		mutex_exit(&tgt->statlock);

		if ((pphy == NULL) || (er_flag == 0)) {
			continue;
		}

		pmcs_lock_phy(pphy);
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: found target(0x%p)", __func__, (void *) tgt);

		/* Check which commands expect recovery */
		mutex_enter(&tgt->aqlock);
		STAILQ_FOREACH(cp, &tgt->aq, cmd_next) {
			idxpwrk = PMCS_TAG_INDEX(cp->cmd_tag);
			pwrk = &pwp->work[idxpwrk];
			mutex_enter(&pwrk->lock);
			if (pwrk->htag != cp->cmd_tag) {
				/*
				 * The aq may contain TMF commands, so we
				 * may not find a work structure with a
				 * matching htag.
				 */
				mutex_exit(&pwrk->lock);
				continue;
			}
			if (!PMCS_COMMAND_DONE(pwrk) &&
			    (pwrk->ssp_event != 0) &&
			    (pwrk->ssp_event != PMCS_REC_EVENT)) {
				pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
				    "%s: pwrk(%p) htag(0x%x)",
				    __func__, (void *) pwrk, cp->cmd_tag);
				mutex_exit(&tgt->aqlock);
				/*
				 * pwrk->lock gets dropped in
				 * pmcs_tgt_event_recovery()
				 */
				pmcs_tgt_event_recovery(pwp, pwrk);
				pmcs_unlock_phy(pphy);
				/* All bets are off on tgt/aq now, restart */
				goto restart;
			}
			mutex_exit(&pwrk->lock);
		}
		mutex_exit(&tgt->aqlock);
		mutex_enter(&tgt->statlock);
		tgt->event_recovery = 0;
		pmcs_prt(pwp, PMCS_PRT_DEBUG, pphy, tgt,
		    "%s: end of SSP event recovery for target(0x%p)",
		    __func__, (void *) tgt);
		mutex_exit(&tgt->statlock);
		pmcs_unlock_phy(pphy);
	}
	pmcs_prt(pwp, PMCS_PRT_DEBUG, NULL, NULL,
	    "%s: end of SSP event recovery for pwp(0x%p)", __func__,
	    (void *) pwp);
}

void
pmcs_start_dev_state_recovery(pmcs_xscsi_t *xp, pmcs_phy_t *phyp)
{
	ASSERT(mutex_owned(&xp->statlock));
	ASSERT(xp->pwp != NULL);

	if (xp->recover_wait == 0) {
		pmcs_prt(xp->pwp, PMCS_PRT_DEBUG_DEV_STATE, phyp, xp,
		    "%s: Start ds_recovery for tgt 0x%p/PHY 0x%p (%s)",
		    __func__, (void *)xp, (void *)phyp, phyp->path);
		xp->recover_wait = 1;

		/*
		 * Rather than waiting for the watchdog timer, we'll
		 * kick it right now.
		 */
		SCHEDULE_WORK(xp->pwp, PMCS_WORK_DS_ERR_RECOVERY);
		(void) ddi_taskq_dispatch(xp->pwp->tq, pmcs_worker, xp->pwp,
		    DDI_NOSLEEP);
	}
}

/*
 * Increment the phy ds error retry count.
 * If too many retries, mark phy dead and restart discovery;
 * otherwise schedule ds recovery.
 */
static void
pmcs_handle_ds_recovery_error(pmcs_phy_t *phyp, pmcs_xscsi_t *tgt,
    pmcs_hw_t *pwp, const char *func_name, char *reason_string)
{
	ASSERT(mutex_owned(&phyp->phy_lock));
	ASSERT((tgt == NULL) || mutex_owned(&tgt->statlock));

	phyp->ds_recovery_retries++;

	if (phyp->ds_recovery_retries > PMCS_MAX_DS_RECOVERY_RETRIES) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt,
		    "%s: retry limit reached after %s to PHY %s failed",
		    func_name, reason_string, phyp->path);
		if (tgt != NULL) {
			tgt->recover_wait = 0;
		}
		/*
		 * Mark the PHY as dead, mark it and its parent as changed,
		 * then restart discovery
		 */
		phyp->dead = 1;
		PHY_CHANGED(pwp, phyp);
		if (phyp->parent)
			PHY_CHANGED(pwp, phyp->parent);
		RESTART_DISCOVERY(pwp);
	} else if ((phyp->ds_prev_good_recoveries >
	    PMCS_MAX_DS_RECOVERY_RETRIES) &&
	    (phyp->last_good_recovery + drv_usectohz(PMCS_MAX_DS_RECOVERY_TIME)
	    < ddi_get_lbolt())) {
		pmcs_prt(pwp, PMCS_PRT_DEBUG, phyp, tgt, "%s: max number of "
		    "successful recoveries reached, declaring PHY %s dead",
		    __func__, phyp->path);
		if (tgt != NULL) {
			tgt->recover_wait = 0;
		}
		/*
		 * Mark the PHY as dead and its parent as changed,
		 * then restart discovery
		 */
		phyp->dead = 1;
		PHY_CHANGED(pwp, phyp);
		if (phyp->parent)
			PHY_CHANGED(pwp, phyp->parent);
		RESTART_DISCOVERY(pwp);
	} else {
		SCHEDULE_WORK(pwp, PMCS_WORK_DS_ERR_RECOVERY);
	}
}
975