/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2017, Joyent, Inc.
 */

#include <sys/scsi/adapters/smrt/smrt.h>

/*
 * Discovery, Resets, Periodics, and Events
 * ----------------------------------------
 *
 * Discovery is the act of figuring out what logical and physical volumes exist
 * under the controller. Discovery happens in response to the following events:
 *
 * o iports for virtual and physical devices being attached
 * o Controller event notifications indicating potential topology changes
 * o After a reset of the controller, before we can perform I/O again
 *
 * Because we have to perform discovery after a reset, which can happen during
 * panic(), that also means that discovery may be run in panic context. We
 * also need to emphasize the need for discovery to happen after a controller
 * reset. Once a reset is initiated, we cannot be certain about the addresses
 * of any of the existing targets until the reset has completed. The driver
 * performs I/Os to addresses that the controller provides. The controller
 * specification says that these addresses may change after a controller reset.
 *
 * Unfortunately, all of this combined means that making sure we can correctly
 * run discovery is somewhat complicated. In non-panic contexts, discovery is
 * always run from a taskq. We'll kick off the discovery in the taskq if
 * nothing is pending at that time. The state is managed by bits in the
 * smrt_status member of the smrt_t. There are four bits at this time:
 *
 *	SMRT_CTLR_DISCOVERY_REQUESTED	This flag indicates that something has
 *					requested that a discovery be performed.
 *					If no flags are set when this is set,
 *					then we will kick off discovery. All
 *					discovery requests are initiated via the
 *					smrt_discover_request() function.
 *
 *	SMRT_CTLR_DISCOVERY_RUNNING	This flag is set at the start of us
 *					running a discovery. It is removed when
 *					discovery finishes.
 *
 *	SMRT_CTLR_DISCOVERY_PERIODIC	This flag is set in a number of
 *					circumstances, which will be described
 *					in a subsequent section. This indicates
 *					that the periodic must kick off the
 *					discovery process.
 *
 *	SMRT_CTLR_DISCOVERY_REQUIRED	This flag indicates that at some point a
 *					controller reset occurred and we need to
 *					have a successful discovery to finish
 *					the act of resetting and allowing I/O to
 *					continue.
 *
 * In general, a request to discover kicks off the taskq to discover entries, if
 * it hasn't already been requested or started. This also allows us to coalesce
 * multiple requests, if needed. Note that if a request comes in when a
 * discovery is ongoing, we do not kick off discovery again. Instead, we set
 * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the
 * initial pass has completed.
 *
 * When a discovery starts, the first thing it does is clear the
 * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any
 * additional requests for discovery that come in after this has started likely
 * indicate that we've missed something. As such, when the discovery process
 * finishes, if it sees the REQUESTED flag, then it will need to set the
 * PERIODIC flag. The PERIODIC flag is used to indicate that we should run
 * discovery again, but not kick it off immediately. Instead, it should be
 * driven by the normal periodic behavior.
 *
 * If for some reason the act of discovery fails, or we fail to dispatch
 * discovery due to a transient error, then we will flag PERIODIC so that the
 * periodic tick will try and run things again.
 *
 * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set
 * after a reset occurs. The reset thread will be blocked on this.
 * Importantly, none of the code in the discovery path can ask for a controller
 * reset at this time. If at the end of a discovery, this flag is set, then we
 * will signal the reset thread that it should check on its status by
 * broadcasting on the smrt_cv_finishq. At that point, the reset thread will
 * continue.
 *
 * Panic Context
 * -------------
 *
 * All of this talk of threads and taskqs is well and good, but as an HBA
 * driver, we have a serious responsibility to try and deal with panic sanely.
 * In panic context, we will directly call the discovery functions and not poll
 * for them to occur.
 *
 * However, because our discovery relies on the target maps, which aren't safe
 * for panic context at this time, we have to take a different approach. We
 * leverage the fact that we have a generation number stored with every
 * discovery. If we try to do an I/O to a device where the generation doesn't
 * match, then we know that it disappeared and should not be used. We also
 * sanity check the model, serial numbers, and WWNs to make sure that these are
 * the same devices. If they are, then we'll end up updating the address
 * structures.
 *
 * Now, it is possible that when we were panicking, we had a thread that was in
 * the process of running a discovery or even resetting the system. Once we're
 * in panic, those threads aren't running, so if they didn't end up producing a
 * new view of the world that the SCSI framework is using, then it shouldn't
 * really matter, as we won't have updated the list of devices.
Importantly,
 * once we're in that context, we're not going to be attaching or detaching
 * targets. If we get a request for one of these targets which has disappeared,
 * we're going to have to end up giving up.
 *
 * Request Attributes
 * ------------------
 *
 * The CISS specification allows for three different kinds of attributes that
 * describe how requests are queued to the controller. These are:
 *
 *	HEAD OF QUEUE		The request should go to the head of the
 *				controller queue. This is used for resets and
 *				aborts to ensure that they're not blocked behind
 *				additional I/O.
 *
 *	SIMPLE			This queues the request for normal processing.
 *				Commands queued this way are not special with
 *				respect to one another. We use this for all I/O
 *				and discovery commands.
 *
 *	ORDERED			This attribute is used to indicate that commands
 *				should be submitted and processed in some order.
 *				This is used primarily for the event
 *				notification bits so we can ensure that at the
 *				return of a cancellation of the event
 *				notification, that any outstanding request has
 *				been honored.
 */

/*
 * Forward declarations for file-local routines that are referenced before
 * their definitions: smrt_ctlr_versions() by smrt_ctlr_init(), and
 * smrt_discover() as the taskq callback dispatched from smrt_periodic().
 */
static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *);
static void smrt_discover(void *);

/*
 * The maximum number of seconds to wait for the controller to come online.
 * smrt_cfgtbl_flush() uses this value as the number of one-second polls it
 * makes while waiting for the controller to acknowledge a Configuration Table
 * change.
 */
unsigned smrt_ciss_init_time = 90;

/*
 * A tunable that determines the number of events per tick that we'll process
 * via asynchronous event notification. If this rate is very high, then we will
 * not submit the event and it will be picked up at the next tick of the
 * periodic.
 */
uint_t smrt_event_intervention_threshold = 1000;

/*
 * Converts a LUN Address to a BMIC Identifier.
The BMIC Identifier is used 163 * when performing various physical commands and generally should stay the same 164 * for a given device across inserts and removals; however, not across 165 * controller resets. These are calculated based on what the CISS specification 166 * calls the 'Level 2' target and bus, which don't have a real meaning in the 167 * SAS world otherwise. 168 */ 169 uint16_t 170 smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr) 171 { 172 uint16_t id; 173 174 id = (paddr->Target[1].PeripDev.Bus - 1) << 8; 175 id += paddr->Target[1].PeripDev.Dev; 176 177 return (id); 178 } 179 180 void 181 smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus, 182 unsigned target) 183 { 184 lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR : 185 PERIPHERIAL_DEV_ADDR; 186 187 lun->PhysDev.TargetId = target; 188 lun->PhysDev.Bus = bus; 189 190 bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target)); 191 } 192 193 /* 194 * According to the CISS Specification, the controller is always addressed in 195 * Mask Perhiperhal mode with a bus and target ID of zero. This is used by 196 * commands that need to write to the controller itself, which is generally 197 * discovery and other commands. 
198 */ 199 void 200 smrt_write_controller_lun_addr(LUNAddr_t *lun) 201 { 202 smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0); 203 } 204 205 void 206 smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs) 207 { 208 switch (type) { 209 case CISS_MSG_ABORT: 210 case CISS_MSG_RESET: 211 case CISS_MSG_NOP: 212 break; 213 214 default: 215 panic("unknown message type"); 216 } 217 218 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG; 219 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE; 220 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; 221 smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs); 222 smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN; 223 smcm->smcm_va_cmd->Request.CDB[0] = type; 224 } 225 226 void 227 smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag) 228 { 229 smrt_tag_t cisstag; 230 231 /* 232 * When aborting a particular command, the request is addressed 233 * to the controller. 234 */ 235 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, 236 B_TRUE, 0, 0); 237 238 smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); 239 240 /* 241 * Abort a single command. 242 */ 243 smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK; 244 245 /* 246 * The CISS Specification says that the tag value for a task-level 247 * abort should be in the CDB in bytes 4-11. 248 */ 249 bzero(&cisstag, sizeof (cisstag)); 250 cisstag.tag_value = tag; 251 bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4], 252 sizeof (cisstag)); 253 } 254 255 void 256 smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr) 257 { 258 /* 259 * When aborting all tasks for a particular Logical Volume, 260 * the command is addressed not to the controller but to 261 * the Volume itself. 262 */ 263 smcm->smcm_va_cmd->Header.LUN = *addr; 264 265 smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); 266 267 /* 268 * Abort all commands for a particular Logical Volume. 
269 */ 270 smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET; 271 } 272 273 void 274 smrt_write_message_event_notify(smrt_command_t *smcm) 275 { 276 smrt_event_notify_req_t senr; 277 278 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); 279 280 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; 281 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; 282 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; 283 smcm->smcm_va_cmd->Request.Timeout = 0; 284 smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); 285 286 bzero(&senr, sizeof (senr)); 287 senr.senr_opcode = CISS_SCMD_READ; 288 senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT; 289 senr.senr_flags = BE_32(0); 290 senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); 291 292 bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], 293 MIN(CISS_CDBLEN, sizeof (senr))); 294 } 295 296 void 297 smrt_write_message_cancel_event_notify(smrt_command_t *smcm) 298 { 299 smrt_event_notify_req_t senr; 300 301 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); 302 303 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; 304 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; 305 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE; 306 smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT); 307 smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); 308 309 bzero(&senr, sizeof (senr)); 310 senr.senr_opcode = CISS_SCMD_WRITE; 311 senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL; 312 senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); 313 314 bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], 315 MIN(CISS_CDBLEN, sizeof (senr))); 316 } 317 318 void 319 smrt_write_message_reset_ctlr(smrt_command_t *smcm) 320 { 321 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, 322 B_TRUE, 0, 0); 323 324 smrt_write_message_common(smcm, CISS_MSG_RESET, 0); 325 326 smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR; 327 } 328 329 void 330 
smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs) 331 { 332 /* 333 * No-op messages are always sent to the controller. 334 */ 335 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, 336 B_TRUE, 0, 0); 337 338 smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs); 339 } 340 341 /* 342 * This routine is executed regularly by ddi_periodic_add(9F). It checks the 343 * health of the controller and looks for submitted commands that have timed 344 * out. 345 */ 346 void 347 smrt_periodic(void *arg) 348 { 349 smrt_t *smrt = arg; 350 351 mutex_enter(&smrt->smrt_mutex); 352 353 /* 354 * Before we even check if the controller is running to process 355 * everything else, we must first check if we had a request to kick off 356 * discovery. We do this before the check if the controller is running, 357 * as this may be required to finish a discovery. 358 */ 359 if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 && 360 (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 && 361 (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) { 362 if (ddi_taskq_dispatch(smrt->smrt_discover_taskq, 363 smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { 364 smrt->smrt_stats.smrts_discovery_tq_errors++; 365 } else { 366 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC; 367 } 368 } 369 370 if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { 371 /* 372 * The device is currently not active, e.g. due to an 373 * in-progress controller reset. 374 */ 375 mutex_exit(&smrt->smrt_mutex); 376 return; 377 } 378 379 /* 380 * Check on the health of the controller firmware. Note that if the 381 * controller has locked up, this routine will panic the system. 382 */ 383 smrt_lockup_check(smrt); 384 385 /* 386 * Reset the event notification threshold counter. 387 */ 388 smrt->smrt_event_count = 0; 389 390 /* 391 * Check inflight commands to see if they have timed out. 
392 */ 393 for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight); 394 smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) { 395 if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { 396 /* 397 * Polled commands are timed out by the polling 398 * routine. 399 */ 400 continue; 401 } 402 403 if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { 404 /* 405 * This command has been aborted; either it will 406 * complete or the controller will be reset. 407 */ 408 continue; 409 } 410 411 if (list_link_active(&smcm->smcm_link_abort)) { 412 /* 413 * Already on the abort queue. 414 */ 415 continue; 416 } 417 418 if (smcm->smcm_expiry == 0) { 419 /* 420 * This command has no expiry time. 421 */ 422 continue; 423 } 424 425 if (gethrtime() > smcm->smcm_expiry) { 426 list_insert_tail(&smrt->smrt_abortq, smcm); 427 smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT; 428 } 429 } 430 431 /* 432 * Process the abort queue. 433 */ 434 (void) smrt_process_abortq(smrt); 435 436 /* 437 * Check if we have an outstanding event intervention request. Note, 438 * the command in question should always be in a state such that it is 439 * usable by the system here. The command is always prepared again by 440 * the normal event notification path, even if a reset has occurred. 441 * The reset will be processed before we'd ever consider running an 442 * event again. Note, if we fail to submit this, then we leave this for 443 * the next occurrence of the periodic. 
444 */ 445 if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { 446 smrt->smrt_stats.smrts_events_intervened++; 447 448 if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) { 449 smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; 450 } 451 } 452 453 mutex_exit(&smrt->smrt_mutex); 454 } 455 456 int 457 smrt_retrieve(smrt_t *smrt) 458 { 459 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 460 461 switch (smrt->smrt_ctlr_mode) { 462 case SMRT_CTLR_MODE_SIMPLE: 463 smrt_retrieve_simple(smrt); 464 return (DDI_SUCCESS); 465 466 case SMRT_CTLR_MODE_UNKNOWN: 467 break; 468 } 469 470 panic("unknown controller mode"); 471 /* LINTED: E_FUNC_NO_RET_VAL */ 472 } 473 474 /* 475 * Grab a new tag number for this command. We aim to avoid reusing tag numbers 476 * as much as possible, so as to avoid spurious double completion from the 477 * controller. 478 */ 479 static void 480 smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm) 481 { 482 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 483 484 /* 485 * Loop until we find a tag that is not in use. The tag space is 486 * very large (~30 bits) and the maximum number of inflight commands 487 * is comparatively small (~1024 in current controllers). 488 */ 489 for (;;) { 490 uint32_t new_tag = smrt->smrt_next_tag; 491 492 if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) { 493 smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; 494 } 495 496 if (smrt_lookup_inflight(smrt, new_tag) != NULL) { 497 /* 498 * This tag is already used on an inflight command. 499 * Choose another. 500 */ 501 continue; 502 } 503 504 /* 505 * Set the tag for the command and also write it into the 506 * appropriate part of the request block. 507 */ 508 smcm->smcm_tag = new_tag; 509 smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag; 510 return; 511 } 512 } 513 514 /* 515 * Submit a command to the controller. 
516 */ 517 int 518 smrt_submit(smrt_t *smrt, smrt_command_t *smcm) 519 { 520 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 521 VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT); 522 523 /* 524 * Anything that asks us to ignore the running state of the controller 525 * must be wired up to poll for completion. 526 */ 527 if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) { 528 VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); 529 } 530 531 /* 532 * If the controller is currently being reset, do not allow command 533 * submission. However, if this is one of the commands needed to finish 534 * reset, as indicated on the command structure, allow it. 535 */ 536 if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) && 537 !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) { 538 return (EIO); 539 } 540 541 /* 542 * Do not allow submission of more concurrent commands than the 543 * controller supports. 544 */ 545 if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) { 546 return (EAGAIN); 547 } 548 549 /* 550 * Synchronise the Command Block DMA resources to ensure that the 551 * device has a consistent view before we pass it the command. 552 */ 553 if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, 554 DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) { 555 dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure"); 556 return (EIO); 557 } 558 559 /* 560 * Ensure that this command is not re-used without issuing a new 561 * tag number and performing any appropriate cleanup. 562 */ 563 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED)); 564 smcm->smcm_status |= SMRT_CMD_STATUS_USED; 565 566 /* 567 * Assign a tag that is not currently in use 568 */ 569 smrt_set_new_tag(smrt, smcm); 570 571 /* 572 * Insert this command into the inflight AVL. 
573 */ 574 avl_index_t where; 575 if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) { 576 dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x", 577 smcm->smcm_tag); 578 } 579 avl_insert(&smrt->smrt_inflight, smcm, where); 580 if (smrt->smrt_stats.smrts_max_inflight < 581 avl_numnodes(&smrt->smrt_inflight)) { 582 smrt->smrt_stats.smrts_max_inflight = 583 avl_numnodes(&smrt->smrt_inflight); 584 } 585 586 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); 587 smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; 588 589 smcm->smcm_time_submit = gethrtime(); 590 591 switch (smrt->smrt_ctlr_mode) { 592 case SMRT_CTLR_MODE_SIMPLE: 593 smrt_submit_simple(smrt, smcm); 594 return (0); 595 596 case SMRT_CTLR_MODE_UNKNOWN: 597 break; 598 } 599 panic("unknown controller mode"); 600 /* LINTED: E_FUNC_NO_RET_VAL */ 601 } 602 603 static void 604 smrt_process_finishq_sync(smrt_command_t *smcm) 605 { 606 smrt_t *smrt = smcm->smcm_ctlr; 607 608 if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, 609 DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) { 610 dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure"); 611 } 612 } 613 614 static void 615 smrt_process_finishq_one(smrt_command_t *smcm) 616 { 617 smrt_t *smrt = smcm->smcm_ctlr; 618 619 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE)); 620 smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE; 621 622 switch (smcm->smcm_type) { 623 case SMRT_CMDTYPE_INTERNAL: 624 cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq); 625 return; 626 627 case SMRT_CMDTYPE_SCSA: 628 smrt_hba_complete(smcm); 629 return; 630 631 case SMRT_CMDTYPE_EVENT: 632 smrt_event_complete(smcm); 633 return; 634 635 case SMRT_CMDTYPE_ABORTQ: 636 /* 637 * Abort messages sent as part of abort queue processing 638 * do not require any completion activity. 
639 */ 640 mutex_exit(&smrt->smrt_mutex); 641 smrt_command_free(smcm); 642 mutex_enter(&smrt->smrt_mutex); 643 return; 644 645 case SMRT_CMDTYPE_PREINIT: 646 dev_err(smrt->smrt_dip, CE_PANIC, "preinit command " 647 "completed after initialisation"); 648 return; 649 } 650 651 panic("unknown command type"); 652 } 653 654 /* 655 * Process commands in the completion queue. 656 */ 657 void 658 smrt_process_finishq(smrt_t *smrt) 659 { 660 smrt_command_t *smcm; 661 662 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 663 664 while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) { 665 /* 666 * Synchronise the Command Block before we read from it or 667 * free it, to ensure that any writes from the controller are 668 * visible. 669 */ 670 smrt_process_finishq_sync(smcm); 671 672 /* 673 * Check if this command was in line to be aborted. 674 */ 675 if (list_link_active(&smcm->smcm_link_abort)) { 676 /* 677 * This command was in line, but the controller 678 * subsequently completed the command before we 679 * were able to do so. 680 */ 681 list_remove(&smrt->smrt_abortq, smcm); 682 smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT; 683 } 684 685 /* 686 * Check if this command has been abandoned by the original 687 * submitter. If it has, free it now to avoid a leak. 688 */ 689 if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) { 690 mutex_exit(&smrt->smrt_mutex); 691 smrt_command_free(smcm); 692 mutex_enter(&smrt->smrt_mutex); 693 continue; 694 } 695 696 if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { 697 /* 698 * This command will be picked up and processed 699 * by "smrt_poll_for()" once the CV is triggered 700 * at the end of processing. 701 */ 702 smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; 703 continue; 704 } 705 706 smrt_process_finishq_one(smcm); 707 } 708 709 cv_broadcast(&smrt->smrt_cv_finishq); 710 } 711 712 /* 713 * Process commands in the abort queue. 
 *
 * For each queued command that is still inflight and has not already had an
 * abort sent, allocate a fresh SMRT_CMDTYPE_ABORTQ command, fill it in as a
 * single-task abort and submit it.  The caller must hold smrt_mutex; it is
 * dropped and re-acquired around every command allocation and free, so the
 * abort queue may change between iterations.
 */
void
smrt_process_abortq(smrt_t *smrt)
{
	smrt_command_t *smcm;
	smrt_command_t *abort_smcm = NULL;

	VERIFY(MUTEX_HELD(&smrt->smrt_mutex));

	if (list_is_empty(&smrt->smrt_abortq)) {
		goto out;
	}

another:
	/*
	 * Allocate the abort command without holding the mutex.
	 */
	mutex_exit(&smrt->smrt_mutex);
	if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ,
	    KM_NOSLEEP)) == NULL) {
		/*
		 * No resources available to send abort messages.  We will
		 * try again the next time around.
		 */
		mutex_enter(&smrt->smrt_mutex);
		goto out;
	}
	mutex_enter(&smrt->smrt_mutex);

	while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) {
		if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
			/*
			 * This message is not currently inflight, so
			 * no abort is needed.
			 */
			continue;
		}

		if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
			/*
			 * An abort message has already been sent for
			 * this command.
			 */
			continue;
		}

		/*
		 * Send an abort message for the command.
		 */
		smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
		if (smrt_submit(smrt, abort_smcm) != 0) {
			/*
			 * The command could not be submitted to the
			 * controller.  Put it back in the abort queue
			 * and give up for now.  The unused abort command
			 * is freed at "out".
			 */
			list_insert_head(&smrt->smrt_abortq, smcm);
			goto out;
		}
		smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;

		/*
		 * Record some debugging information about the abort we
		 * sent:
		 */
		smcm->smcm_abort_time = gethrtime();
		smcm->smcm_abort_tag = abort_smcm->smcm_tag;

		/*
		 * The abort message was sent.  Forget our reference to it
		 * (SMRT_CMDTYPE_ABORTQ commands are freed when they
		 * complete, by smrt_process_finishq_one()) and allocate
		 * another command for the next entry in the queue.
		 */
		abort_smcm = NULL;
		goto another;
	}

out:
	cv_broadcast(&smrt->smrt_cv_finishq);
	if (abort_smcm != NULL) {
		/*
		 * We allocated an abort command that was never successfully
		 * submitted; free it without holding the mutex.
		 */
		mutex_exit(&smrt->smrt_mutex);
		smrt_command_free(abort_smcm);
		mutex_enter(&smrt->smrt_mutex);
	}
}

/*
 * Wait for a polled command to complete.  The caller must hold smrt_mutex and
 * the command must have SMRT_CMD_STATUS_POLLED set.
 *
 * Returns 0 once the command has completed and its completion handling has
 * been run, or ETIMEDOUT if the command has an expiry time and that time
 * passes first.
 */
int
smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm)
{
	VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
	VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);

	while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) {
		if (smcm->smcm_expiry != 0) {
			/*
			 * This command has an expiry time.  Check to see
			 * if it has already passed:
			 */
			if (smcm->smcm_expiry < gethrtime()) {
				return (ETIMEDOUT);
			}
		}

		if (ddi_in_panic()) {
			/*
			 * When the system is panicking, there are no
			 * interrupts or other threads.  Drive the polling loop
			 * on our own, but with a small delay to avoid
			 * aggravating the controller while we're trying to
			 * dump.
			 */
			(void) smrt_retrieve(smrt);
			smrt_process_finishq(smrt);
			drv_usecwait(100);
			continue;
		}

		/*
		 * Wait for command completion to return through the regular
		 * interrupt handling path.
		 */
		if (smcm->smcm_expiry == 0) {
			cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
		} else {
			/*
			 * Wait only until the expiry time for this command.
			 * The return value is ignored; the loop re-checks
			 * both completion and expiry on every wakeup.
			 */
			(void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq,
			    &smrt->smrt_mutex, smcm->smcm_expiry);
		}
	}

	/*
	 * Fire the completion callback for this command.  The callback
	 * is responsible for freeing the command, so it may not be
	 * referenced again once this call returns.
	 */
	smrt_process_finishq_one(smcm);

	return (0);
}

/*
 * Enable or disable controller interrupts by clearing or setting the
 * interrupt-disable bit for the active transport mode in the Interrupt Mask
 * Register.  Panics if the controller mode is unknown.
 */
void
smrt_intr_set(smrt_t *smrt, boolean_t enabled)
{
	/*
	 * Read the Interrupt Mask Register.
	 */
	uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK);

	switch (smrt->smrt_ctlr_mode) {
	case SMRT_CTLR_MODE_SIMPLE:
		if (enabled) {
			imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
		} else {
			imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
		}
		smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr);
		return;

	case SMRT_CTLR_MODE_UNKNOWN:
		break;
	}
	panic("unknown controller mode");
}

/*
 * Signal to the controller that we have updated the Configuration Table by
 * writing to the Inbound Doorbell Register.  The controller will, after some
 * number of seconds, acknowledge this by clearing the bit.
 *
 * If successful, return DDI_SUCCESS.  If the controller takes too long to
 * acknowledge (more than smrt_ciss_init_time one-second polls), return
 * DDI_FAILURE.
 */
int
smrt_cfgtbl_flush(smrt_t *smrt)
{
	/*
	 * Read the current value of the Inbound Doorbell Register.
	 */
	uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);

	/*
	 * Signal the Configuration Table change to the controller.
	 */
	idr |= CISS_IDR_BIT_CFGTBL_CHANGE;
	smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr);

	/*
	 * Wait for the controller to acknowledge the change.
	 */
	for (unsigned i = 0; i < smrt_ciss_init_time; i++) {
		idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);

		if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) {
			return (DDI_SUCCESS);
		}

		/*
		 * Wait for one second before trying again.
		 */
		delay(drv_usectohz(1000000));
	}

	dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller "
	    "configuration completed");
	return (DDI_FAILURE);
}

/*
 * Check whether the controller advertises support for the transport method
 * "xport".  Only CISS_CFGTBL_XPORT_SIMPLE may be passed.  Returns DDI_SUCCESS
 * if supported; otherwise logs a warning and returns DDI_FAILURE.
 */
int
smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport)
{
	VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);

	/*
	 * Read the current value of the "Supported Transport Methods" field in
	 * the Configuration Table.
	 */
	uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->TransportSupport);

	/*
	 * Check that the desired transport method is supported by the
	 * controller:
	 */
	if ((xport_active & xport) == 0) {
		dev_err(smrt->smrt_dip, CE_WARN, "controller does not support "
		    "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
		    "simple" : "performant");
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Request the transport method "xport" by writing it to the TransportRequest
 * field of the Configuration Table.  Only CISS_CFGTBL_XPORT_SIMPLE may be
 * passed.
 */
void
smrt_cfgtbl_transport_set(smrt_t *smrt, int xport)
{
	VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);

	ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest,
	    xport);
}

/*
 * Confirm that the transport method "xport" is now active and that the
 * controller reports itself ready for commands.  Only
 * CISS_CFGTBL_XPORT_SIMPLE may be passed.  Returns DDI_SUCCESS if both hold;
 * otherwise logs a warning and returns DDI_FAILURE.
 */
int
smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport)
{
	VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);

	/*
	 * Read the current value of the TransportActive field in the
	 * Configuration Table.
	 */
	uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->TransportActive);

	/*
	 * Check that the desired transport method is now active:
	 */
	if ((xport_active & xport) == 0) {
		dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport "
		    "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
		    "simple" : "performant");
		return (DDI_FAILURE);
	}

	/*
	 * Ensure that the controller is now ready to accept commands.
	 */
	if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) {
		dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to "
		    "accept commands");
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Read the MaxSGElements field from the Configuration Table.
 */
uint32_t
smrt_ctlr_get_maxsgelements(smrt_t *smrt)
{
	return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements));
}

/*
 * Read the CmdsOutMax field from the Configuration Table.
 */
uint32_t
smrt_ctlr_get_cmdsoutmax(smrt_t *smrt)
{
	return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax));
}

/*
 * Read the HostDrvrSupport field from the Configuration Table.
 */
static uint32_t
smrt_ctlr_get_hostdrvsup(smrt_t *smrt)
{
	return (ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->HostDrvrSupport));
}

/*
 * Bring the controller up: wait for it to report ready, verify the
 * Configuration Table signature, initialise the transport method, cache the
 * feature bitfields and the initial heartbeat, and read the controller
 * version information.  Returns DDI_SUCCESS, or a failure code from the step
 * that failed.
 */
int
smrt_ctlr_init(smrt_t *smrt)
{
	uint8_t signature[4] = { 'C', 'I', 'S', 'S' };
	int e;

	if ((e = smrt_ctlr_wait_for_state(smrt,
	    SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) {
		return (e);
	}

	/*
	 * The configuration table contains an ASCII signature ("CISS") which
	 * should be checked as we initialise the controller.
	 * See: "9.1 Configuration Table" in CISS Specification.
	 */
	for (unsigned i = 0; i < 4; i++) {
		if (ddi_get8(smrt->smrt_ct_handle,
		    &smrt->smrt_ct->Signature[i]) != signature[i]) {
			dev_err(smrt->smrt_dip, CE_WARN, "invalid signature "
			    "detected");
			return (DDI_FAILURE);
		}
	}

	/*
	 * Initialise an appropriate Transport Method.  For now, this driver
	 * only supports the "Simple" method.
	 */
	if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) {
		return (e);
	}

	/*
	 * Save some common feature support bitfields.
	 */
	smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt);
	smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->BusTypes);

	/*
	 * Read initial controller heartbeat value and mark the current
	 * reading time.
	 */
	smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->HeartBeat);
	smrt->smrt_last_heartbeat_time = gethrtime();

	/*
	 * Determine the firmware version of the controller so that we can
	 * select which type of interrupts to use.
	 */
	if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT,
	    &smrt->smrt_versions)) != 0) {
		dev_err(smrt->smrt_dip, CE_WARN, "could not identify "
		    "controller (%d)", e);
		return (DDI_FAILURE);
	}

	dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s",
	    smrt->smrt_versions.smrtv_firmware_rev);

	return (DDI_SUCCESS);
}

/*
 * Mark the controller as no longer running and perform the teardown for the
 * active transport mode.  Nothing further is done if the mode is still
 * SMRT_CTLR_MODE_UNKNOWN.
 */
void
smrt_ctlr_teardown(smrt_t *smrt)
{
	smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;

	switch (smrt->smrt_ctlr_mode) {
	case SMRT_CTLR_MODE_SIMPLE:
		smrt_ctlr_teardown_simple(smrt);
		return;

	case SMRT_CTLR_MODE_UNKNOWN:
		return;
	}

	panic("unknown controller mode");
}

/*
 * Poll the Scratchpad Register until the controller is in the requested
 * state: SMRT_WAIT_STATE_READY waits for the initialised signature to appear,
 * SMRT_WAIT_STATE_UNREADY waits for it to go away.  Returns DDI_SUCCESS when
 * the state is observed, or DDI_FAILURE if it is not reached in time.
 */
int
smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state)
{
	/* Poll every 100 milliseconds, for SMRT_WAIT_DELAY_SECONDS in total. */
	unsigned wait_usec = 100 * 1000;
	unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec;

	VERIFY(state == SMRT_WAIT_STATE_READY ||
	    state == SMRT_WAIT_STATE_UNREADY);

	/*
	 * Read from the Scratchpad Register until the expected ready signature
	 * is detected.  This behaviour is not described in the CISS
	 * specification.
	 *
	 * If the device is not in the desired state immediately, wait for
	 * wait_usec microseconds (100 milliseconds) and try again.  If the
	 * device has not reached the desired state after
	 * SMRT_WAIT_DELAY_SECONDS seconds of such waits, give up.
	 */
	for (unsigned i = 0; i < wait_count; i++) {
		uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);

		switch (state) {
		case SMRT_WAIT_STATE_READY:
			if (spr == CISS_SCRATCHPAD_INITIALISED) {
				return (DDI_SUCCESS);
			}
			break;

		case SMRT_WAIT_STATE_UNREADY:
			if (spr != CISS_SCRATCHPAD_INITIALISED) {
				return (DDI_SUCCESS);
			}
			break;
		}

		if (ddi_in_panic()) {
			/*
			 * There is no sleep for the panicking, so we
			 * must spin wait:
			 */
			drv_usecwait(wait_usec);
		} else {
			/*
			 * Sleep for wait_usec microseconds (100
			 * milliseconds) and try again.
			 */
			delay(drv_usectohz(wait_usec));
		}
	}

	dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller "
	    "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ?
	    "ready": "unready");
	return (DDI_FAILURE);
}

void
smrt_lockup_check(smrt_t *smrt)
{
	/*
	 * Read the current controller heartbeat value.
	 */
	uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle,
	    &smrt->smrt_ct->HeartBeat);

	VERIFY(MUTEX_HELD(&smrt->smrt_mutex));

	/*
	 * Check to see if the value is the same as last time we looked:
	 */
	if (heartbeat != smrt->smrt_last_heartbeat) {
		/*
		 * The heartbeat value has changed, which suggests that the
		 * firmware in the controller has not yet come to a complete
		 * stop.  Record the new value, as well as the current time.
		 */
		smrt->smrt_last_heartbeat = heartbeat;
		smrt->smrt_last_heartbeat_time = gethrtime();
		return;
	}

	/*
	 * The controller _might_ have been able to signal to us that is
	 * has locked up.  This is a truly unfathomable state of affairs:
	 * If the firmware can tell it has flown off the rails, why not
	 * simply reset the controller?
1174 */ 1175 uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS); 1176 uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); 1177 if ((odr & CISS_ODR_BIT_LOCKUP) != 0) { 1178 dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " 1179 "reported a critical fault (odr %08x spr %08x)", 1180 odr, spr); 1181 } 1182 1183 if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) { 1184 dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " 1185 "stopped responding (odr %08x spr %08x)", 1186 odr, spr); 1187 } 1188 } 1189 1190 /* 1191 * Probe the controller with the IDENTIFY CONTROLLER request. This is a BMIC 1192 * command, so it must be submitted to the controller and we must poll for its 1193 * completion. This functionality is only presently used during controller 1194 * initialisation, so it uses the special pre-initialisation path for command 1195 * allocation and submission. 1196 */ 1197 static int 1198 smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout, 1199 smrt_identify_controller_t *resp) 1200 { 1201 smrt_command_t *smcm; 1202 smrt_identify_controller_req_t smicr; 1203 int r; 1204 size_t sz; 1205 1206 /* 1207 * Allocate a command with a data buffer; the controller will fill it 1208 * with identification information. There is some suggestion in the 1209 * firmware-level specification that the buffer length should be a 1210 * multiple of 512 bytes for some controllers, so we round up. 
1211 */ 1212 sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t); 1213 if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) { 1214 return (ENOMEM); 1215 } 1216 1217 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); 1218 1219 smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr); 1220 smcm->smcm_va_cmd->Request.Timeout = timeout; 1221 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; 1222 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; 1223 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; 1224 1225 /* 1226 * Construct the IDENTIFY CONTROLLER request CDB. Note that any 1227 * reserved fields in the request must be filled with zeroes. 1228 */ 1229 bzero(&smicr, sizeof (smicr)); 1230 smicr.smicr_opcode = CISS_SCMD_BMIC_READ; 1231 smicr.smicr_lun = 0; 1232 smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER; 1233 bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0], 1234 MIN(CISS_CDBLEN, sizeof (smicr))); 1235 1236 /* 1237 * Send the command to the device and poll for its completion. 1238 */ 1239 smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; 1240 smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; 1241 if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) { 1242 VERIFY3S(r, ==, ETIMEDOUT); 1243 VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); 1244 1245 /* 1246 * This command timed out, but the driver is not presently 1247 * initialised to the point where we can try to abort it. 1248 * The command was created with the PREINIT type, so it 1249 * does not appear in the global command tracking list. 1250 * In order to avoid problems with DMA from the controller, 1251 * we have to leak the command allocation. 1252 */ 1253 smcm = NULL; 1254 goto out; 1255 } 1256 1257 if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { 1258 /* 1259 * The controller was reset while we were trying to identify 1260 * it. Report failure. 
1261 */ 1262 r = EIO; 1263 goto out; 1264 } 1265 1266 if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { 1267 ErrorInfo_t *ei = smcm->smcm_va_err; 1268 1269 if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { 1270 dev_err(smrt->smrt_dip, CE_WARN, "identify " 1271 "controller error: status 0x%x", 1272 ei->CommandStatus); 1273 r = EIO; 1274 goto out; 1275 } 1276 } 1277 1278 if (resp != NULL) { 1279 /* 1280 * Copy the identify response out for the caller. 1281 */ 1282 bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp)); 1283 } 1284 1285 r = 0; 1286 1287 out: 1288 if (smcm != NULL) { 1289 smrt_command_free(smcm); 1290 } 1291 return (r); 1292 } 1293 1294 /* 1295 * The firmware versions in an IDENTIFY CONTROLLER response generally take 1296 * the form of a four byte ASCII string containing a dotted decimal version 1297 * number; e.g., "8.00". 1298 * 1299 * This function sanitises the firmware version, replacing unexpected 1300 * values with a question mark. 1301 */ 1302 static void 1303 smrt_copy_firmware_version(uint8_t *src, char *dst) 1304 { 1305 for (unsigned i = 0; i < 4; i++) { 1306 /* 1307 * Make sure that this is a 7-bit clean ASCII value. 1308 */ 1309 char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?'; 1310 1311 if (isalnum(c) || c == '.' || c == ' ') { 1312 dst[i] = c; 1313 } else { 1314 dst[i] = '?'; 1315 } 1316 } 1317 dst[4] = '\0'; 1318 } 1319 1320 /* 1321 * Using an IDENTIFY CONTROLLER request, determine firmware and controller 1322 * version details. See the comments for "smrt_ctlr_identify()" for more 1323 * details about calling context. 
1324 */ 1325 static int 1326 smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv) 1327 { 1328 smrt_identify_controller_t smic; 1329 int r; 1330 1331 if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) { 1332 return (r); 1333 } 1334 1335 smrtv->smrtv_hardware_version = smic.smic_hardware_version; 1336 smrt_copy_firmware_version(smic.smic_firmware_rev, 1337 smrtv->smrtv_firmware_rev); 1338 smrt_copy_firmware_version(smic.smic_recovery_rev, 1339 smrtv->smrtv_recovery_rev); 1340 smrt_copy_firmware_version(smic.smic_bootblock_rev, 1341 smrtv->smrtv_bootblock_rev); 1342 1343 return (0); 1344 } 1345 1346 int 1347 smrt_ctlr_reset(smrt_t *smrt) 1348 { 1349 smrt_command_t *smcm, *smcm_nop; 1350 int r; 1351 1352 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 1353 1354 if (ddi_in_panic()) { 1355 goto skip_check; 1356 } 1357 1358 if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { 1359 /* 1360 * Don't pile on. One reset is enough. Wait until 1361 * it's complete, and then return success. 1362 */ 1363 while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { 1364 cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); 1365 } 1366 return (0); 1367 } 1368 smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING; 1369 smrt->smrt_last_reset_start = gethrtime(); 1370 smrt->smrt_stats.smrts_ctlr_resets++; 1371 1372 skip_check: 1373 /* 1374 * Allocate two commands: one for the soft reset message, which we 1375 * cannot free until the controller has reset; and one for the ping we 1376 * will use to determine when it is once again functional. 
1377 */ 1378 mutex_exit(&smrt->smrt_mutex); 1379 if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, 1380 KM_NOSLEEP)) == NULL) { 1381 mutex_enter(&smrt->smrt_mutex); 1382 return (ENOMEM); 1383 } 1384 if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, 1385 KM_NOSLEEP)) == NULL) { 1386 smrt_command_free(smcm); 1387 mutex_enter(&smrt->smrt_mutex); 1388 return (ENOMEM); 1389 } 1390 mutex_enter(&smrt->smrt_mutex); 1391 1392 /* 1393 * Send a soft reset command to the controller. If this command 1394 * succeeds, there will likely be no completion notification. Instead, 1395 * the device should become unavailable for some period of time and 1396 * then become available again. Once available again, we know the soft 1397 * reset has completed and should abort all in-flight commands. 1398 */ 1399 smrt_write_message_reset_ctlr(smcm); 1400 1401 /* 1402 * Disable interrupts now. 1403 */ 1404 smrt_intr_set(smrt, B_FALSE); 1405 1406 dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset"); 1407 smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; 1408 if ((r = smrt_submit(smrt, smcm)) != 0) { 1409 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1410 "submit failed (%d)", r); 1411 } 1412 1413 /* 1414 * Mark every currently inflight command as being reset, including the 1415 * soft reset command we just sent. Once we confirm the reset works, 1416 * we can safely report that these commands have failed. 1417 */ 1418 for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); 1419 t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) { 1420 t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT; 1421 } 1422 1423 /* 1424 * Now that we have submitted our soft reset command, prevent 1425 * the rest of the driver from interacting with the controller. 
1426 */ 1427 smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; 1428 1429 /* 1430 * We do not expect a completion from the controller for our soft 1431 * reset command, but we also cannot remove it from the inflight 1432 * list until we know the controller has actually reset. To do 1433 * otherwise would potentially allow the controller to scribble 1434 * on the memory we were using. 1435 */ 1436 smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; 1437 1438 if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) != 1439 DDI_SUCCESS) { 1440 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1441 "controller did not become unready"); 1442 } 1443 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready"); 1444 1445 if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) != 1446 DDI_SUCCESS) { 1447 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1448 "controller did not come become ready"); 1449 } 1450 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready"); 1451 1452 /* 1453 * In at least the Smart Array P420i, the controller can take 30-45 1454 * seconds after the scratchpad register shows it as being available 1455 * before it is ready to receive commands. In order to avoid hitting 1456 * it too early with our post-reset ping, we will sleep for 10 seconds 1457 * here. 
1458 */ 1459 if (ddi_in_panic()) { 1460 drv_usecwait(10 * MICROSEC); 1461 } else { 1462 delay(drv_usectohz(10 * MICROSEC)); 1463 } 1464 1465 smrt_ctlr_teardown(smrt); 1466 if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { 1467 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1468 "controller transport could not be configured"); 1469 } 1470 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured"); 1471 1472 smrt_write_message_nop(smcm_nop, 0); 1473 smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED | 1474 SMRT_CMD_IGNORE_RUNNING; 1475 if ((r = smrt_submit(smrt, smcm_nop)) != 0) { 1476 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1477 "ping could not be submitted (%d)", r); 1478 } 1479 1480 /* 1481 * Interrupts are still masked at this stage. Poll manually in 1482 * a way that will not trigger regular finish queue processing: 1483 */ 1484 VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT); 1485 for (unsigned i = 0; i < 600; i++) { 1486 smrt_retrieve_simple(smrt); 1487 1488 if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { 1489 /* 1490 * Remove the ping command from the finish queue and 1491 * process it manually. This processing must mirror 1492 * what would have been done in smrt_process_finishq(). 
1493 */ 1494 VERIFY(list_link_active(&smcm_nop->smcm_link_finish)); 1495 list_remove(&smrt->smrt_finishq, smcm_nop); 1496 smrt_process_finishq_sync(smcm_nop); 1497 smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; 1498 smrt_process_finishq_one(smcm_nop); 1499 break; 1500 } 1501 1502 if (ddi_in_panic()) { 1503 drv_usecwait(100 * 1000); 1504 } else { 1505 delay(drv_usectohz(100 * 1000)); 1506 } 1507 } 1508 1509 if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) { 1510 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " 1511 "ping did not complete"); 1512 } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) { 1513 dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed " 1514 "in error (status %u)", 1515 (unsigned)smcm_nop->smcm_va_err->CommandStatus); 1516 } else { 1517 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed"); 1518 } 1519 1520 /* 1521 * Now that the controller is working again, we can abort any 1522 * commands that were inflight during the reset. 1523 */ 1524 smrt_command_t *nt; 1525 for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); 1526 t != NULL; t = nt) { 1527 nt = AVL_NEXT(&smrt->smrt_inflight, t); 1528 1529 if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { 1530 avl_remove(&smrt->smrt_inflight, t); 1531 t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; 1532 1533 list_insert_tail(&smrt->smrt_finishq, t); 1534 } 1535 } 1536 1537 /* 1538 * Quiesce our discovery thread. Note, because 1539 * SMRT_CTLR_STATUS_RESTARTING is set, nothing can cause it to be 1540 * enabled again. 1541 */ 1542 if (!ddi_in_panic()) { 1543 mutex_exit(&smrt->smrt_mutex); 1544 ddi_taskq_wait(smrt->smrt_discover_taskq); 1545 mutex_enter(&smrt->smrt_mutex); 1546 } 1547 1548 /* 1549 * Re-enable interrupts. Now, we must kick off a discovery to make sure 1550 * that the system is in a sane state and that we can perform I/O. 
1551 */ 1552 smrt_intr_set(smrt, B_TRUE); 1553 smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING; 1554 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED; 1555 1556 /* 1557 * Attempt a discovery to make sure that the drivers sees a realistic 1558 * view of the world. If we're not in panic context, spin for the 1559 * asynchronous process to complete, otherwise we're in panic context 1560 * and this is going to happen regardless if we want it to or not. 1561 * Before we kick off the request to run discovery, we reset the 1562 * discovery request flags as we know that nothing else can consider 1563 * running discovery and we don't want to delay until the next smrt 1564 * periodic tick if we can avoid it. In panic context, if this failed, 1565 * then we won't make it back. 1566 */ 1567 VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING); 1568 smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK); 1569 smrt_discover(smrt); 1570 if (!ddi_in_panic()) { 1571 while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { 1572 cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); 1573 } 1574 } 1575 1576 smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING; 1577 smrt->smrt_last_reset_finish = gethrtime(); 1578 1579 /* 1580 * Wake anybody that was waiting for the reset to complete. 1581 */ 1582 cv_broadcast(&smrt->smrt_cv_finishq); 1583 1584 /* 1585 * Process the completion queue one last time before we let go 1586 * of the mutex. 
1587 */ 1588 smrt_process_finishq(smrt); 1589 1590 mutex_exit(&smrt->smrt_mutex); 1591 smrt_command_free(smcm_nop); 1592 mutex_enter(&smrt->smrt_mutex); 1593 return (0); 1594 } 1595 1596 int 1597 smrt_event_init(smrt_t *smrt) 1598 { 1599 int ret; 1600 smrt_command_t *event, *cancel; 1601 1602 event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP); 1603 if (event == NULL) 1604 return (ENOMEM); 1605 if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN, 1606 KM_NOSLEEP) != 0) { 1607 smrt_command_free(event); 1608 return (ENOMEM); 1609 } 1610 smrt_write_message_event_notify(event); 1611 1612 cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP); 1613 if (cancel == NULL) { 1614 smrt_command_free(event); 1615 return (ENOMEM); 1616 } 1617 if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN, 1618 KM_NOSLEEP) != 0) { 1619 smrt_command_free(event); 1620 smrt_command_free(cancel); 1621 return (ENOMEM); 1622 } 1623 smrt_write_message_cancel_event_notify(cancel); 1624 1625 cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL); 1626 1627 mutex_enter(&smrt->smrt_mutex); 1628 if ((ret = smrt_submit(smrt, event)) != 0) { 1629 mutex_exit(&smrt->smrt_mutex); 1630 smrt_command_free(event); 1631 smrt_command_free(cancel); 1632 return (ret); 1633 } 1634 1635 smrt->smrt_event_cmd = event; 1636 smrt->smrt_event_cancel_cmd = cancel; 1637 mutex_exit(&smrt->smrt_mutex); 1638 1639 return (0); 1640 } 1641 1642 void 1643 smrt_event_complete(smrt_command_t *smcm) 1644 { 1645 smrt_event_notify_t *sen; 1646 boolean_t log, rescan; 1647 1648 boolean_t intervene = B_FALSE; 1649 smrt_t *smrt = smcm->smcm_ctlr; 1650 1651 VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); 1652 VERIFY3P(smcm, ==, smrt->smrt_event_cmd); 1653 VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION); 1654 1655 smrt->smrt_stats.smrts_events_received++; 1656 1657 if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { 1658 cv_signal(&smrt->smrt_event_queue); 1659 return; 
1660 } 1661 1662 if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { 1663 intervene = B_TRUE; 1664 goto clean; 1665 } 1666 1667 /* 1668 * The event notification command failed for some reason. Attempt to 1669 * drive on and try again at the next intervention period. Because this 1670 * may represent a programmer error (though it's hard to know), we wait 1671 * until the next intervention period and don't panic. 1672 */ 1673 if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { 1674 ErrorInfo_t *ei = smcm->smcm_va_err; 1675 intervene = B_TRUE; 1676 1677 smrt->smrt_stats.smrts_events_errors++; 1678 dev_err(smrt->smrt_dip, CE_WARN, "!event notification request " 1679 "error: status 0x%x", ei->CommandStatus); 1680 goto clean; 1681 } 1682 1683 sen = smcm->smcm_internal->smcmi_va; 1684 log = rescan = B_FALSE; 1685 switch (sen->sen_class) { 1686 case SMRT_EVENT_CLASS_PROTOCOL: 1687 /* 1688 * Most of the event protocol class events aren't really 1689 * actionable. However, subclass 1 indicates errors. Today, 1690 * the only error is an event overflow. If there's an event 1691 * overflow, then we must assume that we need to rescan. 1692 */ 1693 if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) { 1694 rescan = B_TRUE; 1695 } 1696 break; 1697 case SMRT_EVENT_CLASS_HOTPLUG: 1698 /* 1699 * We want to log all hotplug events. However we only need to 1700 * scan these if the subclass indicates the event is for a disk. 1701 */ 1702 log = B_TRUE; 1703 if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) { 1704 rescan = B_TRUE; 1705 } 1706 break; 1707 case SMRT_EVENT_CLASS_HWERROR: 1708 case SMRT_EVENT_CLASS_ENVIRONMENT: 1709 log = B_TRUE; 1710 break; 1711 case SMRT_EVENT_CLASS_PHYS: 1712 log = B_TRUE; 1713 /* 1714 * This subclass indicates some change for physical drives. As 1715 * such, this should trigger a rescan. 
1716 */ 1717 if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) { 1718 rescan = B_TRUE; 1719 } 1720 break; 1721 case SMRT_EVENT_CLASS_LOGVOL: 1722 rescan = B_TRUE; 1723 log = B_TRUE; 1724 break; 1725 default: 1726 /* 1727 * While there are other classes of events, it's hard to say how 1728 * actionable they are for the moment. If we revamp this such 1729 * that it becomes an ireport based system, then we should just 1730 * always log these. We opt not to at the moment to try and be 1731 * kind to the system log. 1732 */ 1733 break; 1734 } 1735 1736 /* 1737 * Ideally, this would be an ireport that we could pass onto 1738 * administrators; however, since we don't have any way to generate 1739 * that, we provide a subset of the event information. 1740 */ 1741 if (log) { 1742 const char *rmsg; 1743 if (rescan == B_TRUE) { 1744 rmsg = "rescanning"; 1745 } else { 1746 rmsg = "not rescanning"; 1747 } 1748 if (sen->sen_message[0] != '\0') { 1749 sen->sen_message[sizeof (sen->sen_message) - 1] = '\0'; 1750 dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " 1751 "class/sub-class/detail %x, %x, %x: %s; %s devices", 1752 sen->sen_class, sen->sen_subclass, sen->sen_detail, 1753 sen->sen_message, rmsg); 1754 } else { 1755 dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " 1756 "class/sub-class/detail %x, %x, %x; %s devices", 1757 sen->sen_class, sen->sen_subclass, sen->sen_detail, 1758 rmsg); 1759 } 1760 } 1761 1762 if (rescan) 1763 smrt_discover_request(smrt); 1764 1765 clean: 1766 mutex_exit(&smrt->smrt_mutex); 1767 smrt_command_reuse(smcm); 1768 bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN); 1769 mutex_enter(&smrt->smrt_mutex); 1770 1771 /* 1772 * Make sure we're not _now_ detaching or resetting. 
1773 */ 1774 if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { 1775 cv_signal(&smrt->smrt_event_queue); 1776 return; 1777 } 1778 1779 if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 || 1780 intervene == B_TRUE) { 1781 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; 1782 return; 1783 } 1784 1785 /* 1786 * Check out command count per tick. If it's too high, leave it for 1787 * intervention to solve. Likely there is some serious driver or 1788 * firmware error going on. 1789 */ 1790 smrt->smrt_event_count++; 1791 if (smrt->smrt_event_count > smrt_event_intervention_threshold) { 1792 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; 1793 return; 1794 } 1795 1796 if (smrt_submit(smrt, smcm) != 0) { 1797 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; 1798 } 1799 } 1800 1801 void 1802 smrt_event_fini(smrt_t *smrt) 1803 { 1804 int ret; 1805 smrt_command_t *event, *cancel; 1806 mutex_enter(&smrt->smrt_mutex); 1807 1808 /* 1809 * If intervention has been requested, there is nothing for us to do. We 1810 * clear the flag so nothing else accidentally sees this and takes 1811 * action. We also don't need to bother sending a cancellation request, 1812 * as there is no outstanding event. 1813 */ 1814 if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { 1815 smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; 1816 goto free; 1817 } 1818 1819 /* 1820 * Submit a cancel request for the event notification queue. Because we 1821 * submit both the cancel event and the regular notification event as an 1822 * ordered command, we know that by the time this completes, that the 1823 * existing one will have completed. 1824 */ 1825 smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED; 1826 if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) { 1827 /* 1828 * This is unfortunate. We've failed to submit the command. At 1829 * this point all we can do is reset the device. If the reset 1830 * succeeds, we're done and we can clear all the memory. 
If it 1831 * fails, then all we can do is just leak the command and scream 1832 * to the system, sorry. 1833 */ 1834 if (smrt_ctlr_reset(smrt) != 0) { 1835 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " 1836 "device after failure to submit cancellation " 1837 "(%d), abandoning smrt_command_t at address %p", 1838 ret, smrt->smrt_event_cmd); 1839 smrt->smrt_event_cmd = NULL; 1840 goto free; 1841 } 1842 } 1843 1844 smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() + 1845 SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC; 1846 if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) { 1847 VERIFY3S(ret, ==, ETIMEDOUT); 1848 VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status & 1849 SMRT_CMD_STATUS_POLL_COMPLETE); 1850 1851 /* 1852 * The command timed out. All we can do is hope a reset will 1853 * work. 1854 */ 1855 if (smrt_ctlr_reset(smrt) != 0) { 1856 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " 1857 "device after failure to poll for async " 1858 "cancellation command abandoning smrt_command_t " 1859 "event command at address %p and cancellation " 1860 "command at %p", smrt->smrt_event_cmd, 1861 smrt->smrt_event_cancel_cmd); 1862 smrt->smrt_event_cmd = NULL; 1863 smrt->smrt_event_cancel_cmd = NULL; 1864 goto free; 1865 } 1866 1867 } 1868 1869 /* 1870 * Well, in the end, it's results that count. 1871 */ 1872 if (smrt->smrt_event_cancel_cmd->smcm_status & 1873 SMRT_CMD_STATUS_RESET_SENT) { 1874 goto free; 1875 } 1876 1877 if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) { 1878 ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err; 1879 1880 /* 1881 * This can return a CISS_CMD_TARGET_STATUS entry when the 1882 * controller doesn't think a command is outstanding. It is 1883 * possible we raced, so don't think too much about that case. 1884 * Anything else leaves us between a rock and a hard place, the 1885 * only way out is a reset. 
1886 */ 1887 if (ei->CommandStatus != CISS_CMD_TARGET_STATUS && 1888 smrt_ctlr_reset(smrt) != 0) { 1889 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " 1890 "device after receiving an error on the async " 1891 "cancellation command (%d); abandoning " 1892 "smrt_command_t event command at address %p and " 1893 "cancellation command at %p", ei->CommandStatus, 1894 smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd); 1895 smrt->smrt_event_cmd = NULL; 1896 smrt->smrt_event_cancel_cmd = NULL; 1897 goto free; 1898 } 1899 } 1900 1901 free: 1902 event = smrt->smrt_event_cmd; 1903 smrt->smrt_event_cmd = NULL; 1904 cancel = smrt->smrt_event_cancel_cmd; 1905 smrt->smrt_event_cancel_cmd = NULL; 1906 mutex_exit(&smrt->smrt_mutex); 1907 if (event != NULL) 1908 smrt_command_free(event); 1909 if (cancel != NULL) 1910 smrt_command_free(cancel); 1911 cv_destroy(&smrt->smrt_event_queue); 1912 } 1913 1914 /* 1915 * We've been asked to do a discovery in panic context. This would have 1916 * occurred because there was a device reset. Because we can't rely on the 1917 * target maps, all we can do at the moment is go over all the active targets 1918 * and note which ones no longer exist. If this target was required to dump, 1919 * then the dump code will encounter a fatal error. If not, then we should 1920 * count ourselves surprisingly lucky. 1921 */ 1922 static void 1923 smrt_discover_panic_check(smrt_t *smrt) 1924 { 1925 smrt_target_t *smtg; 1926 1927 ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); 1928 for (smtg = list_head(&smrt->smrt_targets); smtg != NULL; 1929 smtg = list_next(&smrt->smrt_targets, smtg)) { 1930 uint64_t gen; 1931 1932 if (smtg->smtg_physical) { 1933 smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; 1934 /* 1935 * Don't worry about drives that aren't visible. 
1936 */ 1937 if (!smpt->smpt_visible) 1938 continue; 1939 gen = smpt->smpt_gen; 1940 } else { 1941 smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; 1942 gen = smlv->smlv_gen; 1943 } 1944 1945 if (gen != smrt->smrt_discover_gen) { 1946 dev_err(smrt->smrt_dip, CE_WARN, "target %s " 1947 "disappeared during post-panic discovery", 1948 scsi_device_unit_address(smtg->smtg_scsi_dev)); 1949 smtg->smtg_gone = B_TRUE; 1950 } 1951 } 1952 } 1953 1954 static void 1955 smrt_discover(void *arg) 1956 { 1957 int log = 0, phys = 0; 1958 smrt_t *smrt = arg; 1959 uint64_t gen; 1960 boolean_t runphys, runvirt; 1961 1962 mutex_enter(&smrt->smrt_mutex); 1963 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING; 1964 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED; 1965 1966 smrt->smrt_discover_gen++; 1967 gen = smrt->smrt_discover_gen; 1968 runphys = smrt->smrt_phys_tgtmap != NULL; 1969 runvirt = smrt->smrt_virt_tgtmap != NULL; 1970 mutex_exit(&smrt->smrt_mutex); 1971 if (runphys) 1972 phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); 1973 if (runvirt) 1974 log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); 1975 mutex_enter(&smrt->smrt_mutex); 1976 1977 if (phys != 0 || log != 0) { 1978 if (!ddi_in_panic()) { 1979 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; 1980 } else { 1981 panic("smrt_t %p failed to perform discovery after " 1982 "a reset in panic context, unable to continue. 
" 1983 "logvol: %d, phys: %d", smrt, log, phys); 1984 } 1985 } else { 1986 if (!ddi_in_panic() && 1987 smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { 1988 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED; 1989 cv_broadcast(&smrt->smrt_cv_finishq); 1990 } 1991 1992 if (ddi_in_panic()) { 1993 smrt_discover_panic_check(smrt); 1994 } 1995 } 1996 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING; 1997 if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED) 1998 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; 1999 mutex_exit(&smrt->smrt_mutex); 2000 } 2001 2002 /* 2003 * Request discovery, which is always run via a taskq. 2004 */ 2005 void 2006 smrt_discover_request(smrt_t *smrt) 2007 { 2008 boolean_t run; 2009 ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); 2010 2011 if (ddi_in_panic()) { 2012 smrt_discover(smrt); 2013 return; 2014 } 2015 2016 run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0; 2017 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED; 2018 if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq, 2019 smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { 2020 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; 2021 smrt->smrt_stats.smrts_discovery_tq_errors++; 2022 } 2023 } 2024