1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright (c) 2017, Joyent, Inc.
14 */
15
16 #include <sys/scsi/adapters/smrt/smrt.h>
17
18 /*
19 * Discovery, Resets, Periodics, and Events
20 * ----------------------------------------
21 *
22 * Discovery is the act of figuring out what logical and physical volumes exist
23 * under the controller. Discovery happens in response to the following events:
24 *
25 * o iports for virtual and physical devices being attached
26 * o Controller event notifications indicating potential topology changes
27 * o After a reset of the controller, before we can perform I/O again
28 *
29 * Because we have to perform discovery after a reset, which can happen during
30 * panic(), that also means that discovery may be run in panic context. We
31 * also need to emphasize the need for discovery to happen after a controller
32 * reset. Once a reset is initiated, we cannot be certain about the addresses
33 * of any of the existing targets until the reset has completed. The driver
34 * performs I/Os to addresses that the controller provides. The controller
35 * specification says that these addresses may change after a controller reset.
36 *
37 * Unfortunately, all of this combined means that making sure we can correctly
38 * run discovery is somewhat complicated. In non-panic contexts, discovery is
39 * always run from a taskq. We'll kick off the discovery in the taskq if
40 * nothing is pending at that time. The state is managed by bits in the
41 * smrt_status member of the smrt_t. There are four bits at this time:
42 *
43 * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has
44 * requested that a discovery be performed.
45 * If no flags are set when this is set,
46 * then we will kick off discovery. All
47 * discovery requests are initiated via the
48 * smrt_discover_request() function.
49 *
50 * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us
51 * running a discovery. It is removed when
52 * discovery finishes.
53 *
54 * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of
55 * circumstances, which will be described
56 * in a subsequent section. This indicates
57 * that the periodic must kick off the
58 * discovery process.
59 *
60 * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a
61 * controller reset occurred and we need to
62 * have a successful discovery to finish
63 * the act of resetting and allowing I/O to
64 * continue.
65 *
66 * In general, a request to discover kicks off the taskq to discover entries, if
67 * it hasn't already been requested or started. This also allows us to coalesce
68 * multiple requests, if needed. Note that if a request comes in when a
69 * discovery is ongoing, we do not kick off discovery again. Instead, we set
70 * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the
71 * initial pass has completed.
72 *
73 * When a discovery starts, the first thing it does is clear the
74 * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any
75 * additional requests for discovery that come in after this has started likely
76 * indicate that we've missed something. As such, when the discovery process
77 * finishes, if it sees the REQUESTED flag, then it will need to set the
78 * PERIODIC flag. The PERIODIC flag is used to indicate that we should run
79 * discovery again, but not kick it off immediately. Instead, it should be
80 * driven by the normal periodic behavior.
81 *
82 * If for some reason the act of discovery fails, or we fail to dispatch
83 * discovery due to a transient error, then we will flag PERIODIC so that the
84 * periodic tick will try and run things again.
85 *
86 * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set
87 * after a reset occurs. The reset thread will be blocked on this.
88 * Importantly, none of the code in the discovery path can ask for a controller
89 * reset at this time. If at the end of a discovery, this flag is set, then we
90 * will signal the reset thread that it should check on its status by
91 * broadcasting on the smrt_cv_finishq. At that point, the reset thread will
92 * continue.
93 *
94 * Panic Context
95 * -------------
96 *
97 * All of this talk of threads and taskqs is well and good, but as an HBA
98 * driver, we have a serious responsibility to try and deal with panic sanely.
99 * In panic context, we will directly call the discovery functions and not poll
100 * for them to occur.
101 *
102 * However, because our discovery relies on the target maps, which aren't safe
103 * for panic context at this time, we have to take a different approach. We
104 * leverage the fact that we have a generation number stored with every
105 * discovery. If we try to do an I/O to a device where the generation doesn't
106 * match, then we know that it disappeared and should not be used. We also
107 * sanity check the model, serial numbers, and WWNs to make sure that these are
108 * the same devices. If they are, then we'll end up updating the address
109 * structures.
110 *
111 * Now, it is possible that when we were panicking, we had a thread that was in
112 * the process of running a discovery or even resetting the system. Once we're
113 * in panic, those threads aren't running, so if they didn't end up producing a
114 * new view of the world that the SCSI framework is using, then it shouldn't
115 * really matter, as we won't have updated the list of devices. Importantly,
116 * once we're in that context, we're not going to be attaching or detaching
117 * targets. If we get a request for one of these targets which has disappeared,
118 * we're going to have to end up giving up.
119 *
120 * Request Attributes
121 * ------------------
122 *
123 * The CISS specification allows for three different kinds of attributes that
124 * describe how requests are queued to the controller. These are:
125 *
126 * HEAD OF QUEUE The request should go to the head of the
127 * controller queue. This is used for resets and
128 * aborts to ensure that they're not blocked behind
129 * additional I/O.
130 *
131 * SIMPLE This queues the request for normal processing.
132 * Commands queued this way are not special with
133 * respect to one another. We use this for all I/O
134 * and discovery commands.
135 *
136 * ORDERED This attribute is used to indicate that commands
137 * should be submitted and processed in some order.
138 * This is used primarily for the event
139 * notification bits so we can ensure that at the
140 * return of a cancellation of the event
141 * notification, that any outstanding request has
142 * been honored.
143 */
144
145 static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *);
146 static void smrt_discover(void *);
147
/*
 * Upper bound, in seconds, on how long we will wait for the controller to
 * come online.  smrt_cfgtbl_flush() uses this value as the number of
 * one-second polls it performs while waiting for the controller to
 * acknowledge a Configuration Table change.
 */
unsigned int smrt_ciss_init_time = 90;
152
153 /*
154 * A tunable that determines the number of events per tick that we'll process
155 * via asynchronous event notification. If this rate is very high, then we will
156 * not submit the event and it will be picked up at the next tick of the
157 * periodic.
158 */
159 uint_t smrt_event_intervention_threshold = 1000;
160
161 /*
162 * Converts a LUN Address to a BMIC Identifier. The BMIC Identifier is used
163 * when performing various physical commands and generally should stay the same
164 * for a given device across inserts and removals; however, not across
165 * controller resets. These are calculated based on what the CISS specification
166 * calls the 'Level 2' target and bus, which don't have a real meaning in the
167 * SAS world otherwise.
168 */
169 uint16_t
smrt_lun_addr_to_bmic(PhysDevAddr_t * paddr)170 smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr)
171 {
172 uint16_t id;
173
174 id = (paddr->Target[1].PeripDev.Bus - 1) << 8;
175 id += paddr->Target[1].PeripDev.Dev;
176
177 return (id);
178 }
179
180 void
smrt_write_lun_addr_phys(LUNAddr_t * lun,boolean_t masked,unsigned bus,unsigned target)181 smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus,
182 unsigned target)
183 {
184 lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR :
185 PERIPHERIAL_DEV_ADDR;
186
187 lun->PhysDev.TargetId = target;
188 lun->PhysDev.Bus = bus;
189
190 bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target));
191 }
192
193 /*
194 * According to the CISS Specification, the controller is always addressed in
195 * Mask Perhiperhal mode with a bus and target ID of zero. This is used by
196 * commands that need to write to the controller itself, which is generally
197 * discovery and other commands.
198 */
199 void
smrt_write_controller_lun_addr(LUNAddr_t * lun)200 smrt_write_controller_lun_addr(LUNAddr_t *lun)
201 {
202 smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0);
203 }
204
205 void
smrt_write_message_common(smrt_command_t * smcm,uint8_t type,int timeout_secs)206 smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs)
207 {
208 switch (type) {
209 case CISS_MSG_ABORT:
210 case CISS_MSG_RESET:
211 case CISS_MSG_NOP:
212 break;
213
214 default:
215 panic("unknown message type");
216 }
217
218 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG;
219 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE;
220 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE;
221 smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs);
222 smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN;
223 smcm->smcm_va_cmd->Request.CDB[0] = type;
224 }
225
226 void
smrt_write_message_abort_one(smrt_command_t * smcm,uint32_t tag)227 smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag)
228 {
229 smrt_tag_t cisstag;
230
231 /*
232 * When aborting a particular command, the request is addressed
233 * to the controller.
234 */
235 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
236 B_TRUE, 0, 0);
237
238 smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
239
240 /*
241 * Abort a single command.
242 */
243 smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK;
244
245 /*
246 * The CISS Specification says that the tag value for a task-level
247 * abort should be in the CDB in bytes 4-11.
248 */
249 bzero(&cisstag, sizeof (cisstag));
250 cisstag.tag_value = tag;
251 bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4],
252 sizeof (cisstag));
253 }
254
255 void
smrt_write_message_abort_all(smrt_command_t * smcm,LUNAddr_t * addr)256 smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr)
257 {
258 /*
259 * When aborting all tasks for a particular Logical Volume,
260 * the command is addressed not to the controller but to
261 * the Volume itself.
262 */
263 smcm->smcm_va_cmd->Header.LUN = *addr;
264
265 smrt_write_message_common(smcm, CISS_MSG_ABORT, 0);
266
267 /*
268 * Abort all commands for a particular Logical Volume.
269 */
270 smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET;
271 }
272
273 void
smrt_write_message_event_notify(smrt_command_t * smcm)274 smrt_write_message_event_notify(smrt_command_t *smcm)
275 {
276 smrt_event_notify_req_t senr;
277
278 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
279
280 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
281 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
282 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
283 smcm->smcm_va_cmd->Request.Timeout = 0;
284 smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
285
286 bzero(&senr, sizeof (senr));
287 senr.senr_opcode = CISS_SCMD_READ;
288 senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT;
289 senr.senr_flags = BE_32(0);
290 senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
291
292 bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
293 MIN(CISS_CDBLEN, sizeof (senr)));
294 }
295
296 void
smrt_write_message_cancel_event_notify(smrt_command_t * smcm)297 smrt_write_message_cancel_event_notify(smrt_command_t *smcm)
298 {
299 smrt_event_notify_req_t senr;
300
301 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
302
303 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
304 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED;
305 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE;
306 smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT);
307 smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr);
308
309 bzero(&senr, sizeof (senr));
310 senr.senr_opcode = CISS_SCMD_WRITE;
311 senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL;
312 senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN);
313
314 bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0],
315 MIN(CISS_CDBLEN, sizeof (senr)));
316 }
317
318 void
smrt_write_message_reset_ctlr(smrt_command_t * smcm)319 smrt_write_message_reset_ctlr(smrt_command_t *smcm)
320 {
321 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
322 B_TRUE, 0, 0);
323
324 smrt_write_message_common(smcm, CISS_MSG_RESET, 0);
325
326 smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR;
327 }
328
329 void
smrt_write_message_nop(smrt_command_t * smcm,int timeout_secs)330 smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs)
331 {
332 /*
333 * No-op messages are always sent to the controller.
334 */
335 smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN,
336 B_TRUE, 0, 0);
337
338 smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs);
339 }
340
341 /*
342 * This routine is executed regularly by ddi_periodic_add(9F). It checks the
343 * health of the controller and looks for submitted commands that have timed
344 * out.
345 */
346 void
smrt_periodic(void * arg)347 smrt_periodic(void *arg)
348 {
349 smrt_t *smrt = arg;
350
351 mutex_enter(&smrt->smrt_mutex);
352
353 /*
354 * Before we even check if the controller is running to process
355 * everything else, we must first check if we had a request to kick off
356 * discovery. We do this before the check if the controller is running,
357 * as this may be required to finish a discovery.
358 */
359 if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 &&
360 (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 &&
361 (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) {
362 if (ddi_taskq_dispatch(smrt->smrt_discover_taskq,
363 smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
364 smrt->smrt_stats.smrts_discovery_tq_errors++;
365 } else {
366 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC;
367 }
368 }
369
370 if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) {
371 /*
372 * The device is currently not active, e.g. due to an
373 * in-progress controller reset.
374 */
375 mutex_exit(&smrt->smrt_mutex);
376 return;
377 }
378
379 /*
380 * Check on the health of the controller firmware. Note that if the
381 * controller has locked up, this routine will panic the system.
382 */
383 smrt_lockup_check(smrt);
384
385 /*
386 * Reset the event notification threshold counter.
387 */
388 smrt->smrt_event_count = 0;
389
390 /*
391 * Check inflight commands to see if they have timed out.
392 */
393 for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight);
394 smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) {
395 if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
396 /*
397 * Polled commands are timed out by the polling
398 * routine.
399 */
400 continue;
401 }
402
403 if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
404 /*
405 * This command has been aborted; either it will
406 * complete or the controller will be reset.
407 */
408 continue;
409 }
410
411 if (list_link_active(&smcm->smcm_link_abort)) {
412 /*
413 * Already on the abort queue.
414 */
415 continue;
416 }
417
418 if (smcm->smcm_expiry == 0) {
419 /*
420 * This command has no expiry time.
421 */
422 continue;
423 }
424
425 if (gethrtime() > smcm->smcm_expiry) {
426 list_insert_tail(&smrt->smrt_abortq, smcm);
427 smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT;
428 }
429 }
430
431 /*
432 * Process the abort queue.
433 */
434 (void) smrt_process_abortq(smrt);
435
436 /*
437 * Check if we have an outstanding event intervention request. Note,
438 * the command in question should always be in a state such that it is
439 * usable by the system here. The command is always prepared again by
440 * the normal event notification path, even if a reset has occurred.
441 * The reset will be processed before we'd ever consider running an
442 * event again. Note, if we fail to submit this, then we leave this for
443 * the next occurrence of the periodic.
444 */
445 if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
446 smrt->smrt_stats.smrts_events_intervened++;
447
448 if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) {
449 smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
450 }
451 }
452
453 mutex_exit(&smrt->smrt_mutex);
454 }
455
456 int
smrt_retrieve(smrt_t * smrt)457 smrt_retrieve(smrt_t *smrt)
458 {
459 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
460
461 switch (smrt->smrt_ctlr_mode) {
462 case SMRT_CTLR_MODE_SIMPLE:
463 smrt_retrieve_simple(smrt);
464 return (DDI_SUCCESS);
465
466 case SMRT_CTLR_MODE_UNKNOWN:
467 break;
468 }
469
470 panic("unknown controller mode");
471 /* LINTED: E_FUNC_NO_RET_VAL */
472 }
473
474 /*
475 * Grab a new tag number for this command. We aim to avoid reusing tag numbers
476 * as much as possible, so as to avoid spurious double completion from the
477 * controller.
478 */
479 static void
smrt_set_new_tag(smrt_t * smrt,smrt_command_t * smcm)480 smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm)
481 {
482 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
483
484 /*
485 * Loop until we find a tag that is not in use. The tag space is
486 * very large (~30 bits) and the maximum number of inflight commands
487 * is comparatively small (~1024 in current controllers).
488 */
489 for (;;) {
490 uint32_t new_tag = smrt->smrt_next_tag;
491
492 if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) {
493 smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER;
494 }
495
496 if (smrt_lookup_inflight(smrt, new_tag) != NULL) {
497 /*
498 * This tag is already used on an inflight command.
499 * Choose another.
500 */
501 continue;
502 }
503
504 /*
505 * Set the tag for the command and also write it into the
506 * appropriate part of the request block.
507 */
508 smcm->smcm_tag = new_tag;
509 smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag;
510 return;
511 }
512 }
513
514 /*
515 * Submit a command to the controller.
516 */
517 int
smrt_submit(smrt_t * smrt,smrt_command_t * smcm)518 smrt_submit(smrt_t *smrt, smrt_command_t *smcm)
519 {
520 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
521 VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT);
522
523 /*
524 * Anything that asks us to ignore the running state of the controller
525 * must be wired up to poll for completion.
526 */
527 if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) {
528 VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
529 }
530
531 /*
532 * If the controller is currently being reset, do not allow command
533 * submission. However, if this is one of the commands needed to finish
534 * reset, as indicated on the command structure, allow it.
535 */
536 if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) &&
537 !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) {
538 return (EIO);
539 }
540
541 /*
542 * Do not allow submission of more concurrent commands than the
543 * controller supports.
544 */
545 if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) {
546 return (EAGAIN);
547 }
548
549 /*
550 * Synchronise the Command Block DMA resources to ensure that the
551 * device has a consistent view before we pass it the command.
552 */
553 if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
554 DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) {
555 dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure");
556 return (EIO);
557 }
558
559 /*
560 * Ensure that this command is not re-used without issuing a new
561 * tag number and performing any appropriate cleanup.
562 */
563 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED));
564 smcm->smcm_status |= SMRT_CMD_STATUS_USED;
565
566 /*
567 * Assign a tag that is not currently in use
568 */
569 smrt_set_new_tag(smrt, smcm);
570
571 /*
572 * Insert this command into the inflight AVL.
573 */
574 avl_index_t where;
575 if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) {
576 dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x",
577 smcm->smcm_tag);
578 }
579 avl_insert(&smrt->smrt_inflight, smcm, where);
580 if (smrt->smrt_stats.smrts_max_inflight <
581 avl_numnodes(&smrt->smrt_inflight)) {
582 smrt->smrt_stats.smrts_max_inflight =
583 avl_numnodes(&smrt->smrt_inflight);
584 }
585
586 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT));
587 smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT;
588
589 smcm->smcm_time_submit = gethrtime();
590
591 switch (smrt->smrt_ctlr_mode) {
592 case SMRT_CTLR_MODE_SIMPLE:
593 smrt_submit_simple(smrt, smcm);
594 return (0);
595
596 case SMRT_CTLR_MODE_UNKNOWN:
597 break;
598 }
599 panic("unknown controller mode");
600 /* LINTED: E_FUNC_NO_RET_VAL */
601 }
602
603 static void
smrt_process_finishq_sync(smrt_command_t * smcm)604 smrt_process_finishq_sync(smrt_command_t *smcm)
605 {
606 smrt_t *smrt = smcm->smcm_ctlr;
607
608 if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0,
609 DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) {
610 dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure");
611 }
612 }
613
614 static void
smrt_process_finishq_one(smrt_command_t * smcm)615 smrt_process_finishq_one(smrt_command_t *smcm)
616 {
617 smrt_t *smrt = smcm->smcm_ctlr;
618
619 VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE));
620 smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE;
621
622 switch (smcm->smcm_type) {
623 case SMRT_CMDTYPE_INTERNAL:
624 cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq);
625 return;
626
627 case SMRT_CMDTYPE_SCSA:
628 smrt_hba_complete(smcm);
629 return;
630
631 case SMRT_CMDTYPE_EVENT:
632 smrt_event_complete(smcm);
633 return;
634
635 case SMRT_CMDTYPE_ABORTQ:
636 /*
637 * Abort messages sent as part of abort queue processing
638 * do not require any completion activity.
639 */
640 mutex_exit(&smrt->smrt_mutex);
641 smrt_command_free(smcm);
642 mutex_enter(&smrt->smrt_mutex);
643 return;
644
645 case SMRT_CMDTYPE_PREINIT:
646 dev_err(smrt->smrt_dip, CE_PANIC, "preinit command "
647 "completed after initialisation");
648 return;
649 }
650
651 panic("unknown command type");
652 }
653
654 /*
655 * Process commands in the completion queue.
656 */
657 void
smrt_process_finishq(smrt_t * smrt)658 smrt_process_finishq(smrt_t *smrt)
659 {
660 smrt_command_t *smcm;
661
662 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
663
664 while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) {
665 /*
666 * Synchronise the Command Block before we read from it or
667 * free it, to ensure that any writes from the controller are
668 * visible.
669 */
670 smrt_process_finishq_sync(smcm);
671
672 /*
673 * Check if this command was in line to be aborted.
674 */
675 if (list_link_active(&smcm->smcm_link_abort)) {
676 /*
677 * This command was in line, but the controller
678 * subsequently completed the command before we
679 * were able to do so.
680 */
681 list_remove(&smrt->smrt_abortq, smcm);
682 smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT;
683 }
684
685 /*
686 * Check if this command has been abandoned by the original
687 * submitter. If it has, free it now to avoid a leak.
688 */
689 if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) {
690 mutex_exit(&smrt->smrt_mutex);
691 smrt_command_free(smcm);
692 mutex_enter(&smrt->smrt_mutex);
693 continue;
694 }
695
696 if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) {
697 /*
698 * This command will be picked up and processed
699 * by "smrt_poll_for()" once the CV is triggered
700 * at the end of processing.
701 */
702 smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
703 continue;
704 }
705
706 smrt_process_finishq_one(smcm);
707 }
708
709 cv_broadcast(&smrt->smrt_cv_finishq);
710 }
711
712 /*
713 * Process commands in the abort queue.
714 */
715 void
smrt_process_abortq(smrt_t * smrt)716 smrt_process_abortq(smrt_t *smrt)
717 {
718 smrt_command_t *smcm;
719 smrt_command_t *abort_smcm = NULL;
720
721 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
722
723 if (list_is_empty(&smrt->smrt_abortq)) {
724 goto out;
725 }
726
727 another:
728 mutex_exit(&smrt->smrt_mutex);
729 if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ,
730 KM_NOSLEEP)) == NULL) {
731 /*
732 * No resources available to send abort messages. We will
733 * try again the next time around.
734 */
735 mutex_enter(&smrt->smrt_mutex);
736 goto out;
737 }
738 mutex_enter(&smrt->smrt_mutex);
739
740 while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) {
741 if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
742 /*
743 * This message is not currently inflight, so
744 * no abort is needed.
745 */
746 continue;
747 }
748
749 if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) {
750 /*
751 * An abort message has already been sent for
752 * this command.
753 */
754 continue;
755 }
756
757 /*
758 * Send an abort message for the command.
759 */
760 smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag);
761 if (smrt_submit(smrt, abort_smcm) != 0) {
762 /*
763 * The command could not be submitted to the
764 * controller. Put it back in the abort queue
765 * and give up for now.
766 */
767 list_insert_head(&smrt->smrt_abortq, smcm);
768 goto out;
769 }
770 smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT;
771
772 /*
773 * Record some debugging information about the abort we
774 * sent:
775 */
776 smcm->smcm_abort_time = gethrtime();
777 smcm->smcm_abort_tag = abort_smcm->smcm_tag;
778
779 /*
780 * The abort message was sent. Release it and
781 * allocate another command.
782 */
783 abort_smcm = NULL;
784 goto another;
785 }
786
787 out:
788 cv_broadcast(&smrt->smrt_cv_finishq);
789 if (abort_smcm != NULL) {
790 mutex_exit(&smrt->smrt_mutex);
791 smrt_command_free(abort_smcm);
792 mutex_enter(&smrt->smrt_mutex);
793 }
794 }
795
796 int
smrt_poll_for(smrt_t * smrt,smrt_command_t * smcm)797 smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm)
798 {
799 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
800 VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED);
801
802 while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) {
803 if (smcm->smcm_expiry != 0) {
804 /*
805 * This command has an expiry time. Check to see
806 * if it has already passed:
807 */
808 if (smcm->smcm_expiry < gethrtime()) {
809 return (ETIMEDOUT);
810 }
811 }
812
813 if (ddi_in_panic()) {
814 /*
815 * When the system is panicking, there are no
816 * interrupts or other threads. Drive the polling loop
817 * on our own, but with a small delay to avoid
818 * aggrevating the controller while we're trying to
819 * dump.
820 */
821 (void) smrt_retrieve(smrt);
822 smrt_process_finishq(smrt);
823 drv_usecwait(100);
824 continue;
825 }
826
827 /*
828 * Wait for command completion to return through the regular
829 * interrupt handling path.
830 */
831 if (smcm->smcm_expiry == 0) {
832 cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
833 } else {
834 /*
835 * Wait only until the expiry time for this command.
836 */
837 (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq,
838 &smrt->smrt_mutex, smcm->smcm_expiry);
839 }
840 }
841
842 /*
843 * Fire the completion callback for this command. The callback
844 * is responsible for freeing the command, so it may not be
845 * referenced again once this call returns.
846 */
847 smrt_process_finishq_one(smcm);
848
849 return (0);
850 }
851
852 void
smrt_intr_set(smrt_t * smrt,boolean_t enabled)853 smrt_intr_set(smrt_t *smrt, boolean_t enabled)
854 {
855 /*
856 * Read the Interrupt Mask Register.
857 */
858 uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK);
859
860 switch (smrt->smrt_ctlr_mode) {
861 case SMRT_CTLR_MODE_SIMPLE:
862 if (enabled) {
863 imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
864 } else {
865 imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE;
866 }
867 smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr);
868 return;
869
870 case SMRT_CTLR_MODE_UNKNOWN:
871 break;
872 }
873 panic("unknown controller mode");
874 }
875
876 /*
877 * Signal to the controller that we have updated the Configuration Table by
878 * writing to the Inbound Doorbell Register. The controller will, after some
879 * number of seconds, acknowledge this by clearing the bit.
880 *
881 * If successful, return DDI_SUCCESS. If the controller takes too long to
882 * acknowledge, return DDI_FAILURE.
883 */
884 int
smrt_cfgtbl_flush(smrt_t * smrt)885 smrt_cfgtbl_flush(smrt_t *smrt)
886 {
887 /*
888 * Read the current value of the Inbound Doorbell Register.
889 */
890 uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
891
892 /*
893 * Signal the Configuration Table change to the controller.
894 */
895 idr |= CISS_IDR_BIT_CFGTBL_CHANGE;
896 smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr);
897
898 /*
899 * Wait for the controller to acknowledge the change.
900 */
901 for (unsigned i = 0; i < smrt_ciss_init_time; i++) {
902 idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL);
903
904 if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) {
905 return (DDI_SUCCESS);
906 }
907
908 /*
909 * Wait for one second before trying again.
910 */
911 delay(drv_usectohz(1000000));
912 }
913
914 dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller "
915 "configuration completed");
916 return (DDI_FAILURE);
917 }
918
919 int
smrt_cfgtbl_transport_has_support(smrt_t * smrt,int xport)920 smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport)
921 {
922 VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
923
924 /*
925 * Read the current value of the "Supported Transport Methods" field in
926 * the Configuration Table.
927 */
928 uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
929 &smrt->smrt_ct->TransportSupport);
930
931 /*
932 * Check that the desired transport method is supported by the
933 * controller:
934 */
935 if ((xport_active & xport) == 0) {
936 dev_err(smrt->smrt_dip, CE_WARN, "controller does not support "
937 "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
938 "simple" : "performant");
939 return (DDI_FAILURE);
940 }
941
942 return (DDI_SUCCESS);
943 }
944
945 void
smrt_cfgtbl_transport_set(smrt_t * smrt,int xport)946 smrt_cfgtbl_transport_set(smrt_t *smrt, int xport)
947 {
948 VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
949
950 ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest,
951 xport);
952 }
953
954 int
smrt_cfgtbl_transport_confirm(smrt_t * smrt,int xport)955 smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport)
956 {
957 VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE);
958
959 /*
960 * Read the current value of the TransportActive field in the
961 * Configuration Table.
962 */
963 uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle,
964 &smrt->smrt_ct->TransportActive);
965
966 /*
967 * Check that the desired transport method is now active:
968 */
969 if ((xport_active & xport) == 0) {
970 dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport "
971 "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ?
972 "simple" : "performant");
973 return (DDI_FAILURE);
974 }
975
976 /*
977 * Ensure that the controller is now ready to accept commands.
978 */
979 if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) {
980 dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to "
981 "accept commands");
982 return (DDI_FAILURE);
983 }
984
985 return (DDI_SUCCESS);
986 }
987
988 uint32_t
smrt_ctlr_get_maxsgelements(smrt_t * smrt)989 smrt_ctlr_get_maxsgelements(smrt_t *smrt)
990 {
991 return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements));
992 }
993
994 uint32_t
smrt_ctlr_get_cmdsoutmax(smrt_t * smrt)995 smrt_ctlr_get_cmdsoutmax(smrt_t *smrt)
996 {
997 return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax));
998 }
999
1000 static uint32_t
smrt_ctlr_get_hostdrvsup(smrt_t * smrt)1001 smrt_ctlr_get_hostdrvsup(smrt_t *smrt)
1002 {
1003 return (ddi_get32(smrt->smrt_ct_handle,
1004 &smrt->smrt_ct->HostDrvrSupport));
1005 }
1006
1007 int
smrt_ctlr_init(smrt_t * smrt)1008 smrt_ctlr_init(smrt_t *smrt)
1009 {
1010 uint8_t signature[4] = { 'C', 'I', 'S', 'S' };
1011 int e;
1012
1013 if ((e = smrt_ctlr_wait_for_state(smrt,
1014 SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) {
1015 return (e);
1016 }
1017
1018 /*
1019 * The configuration table contains an ASCII signature ("CISS") which
1020 * should be checked as we initialise the controller.
1021 * See: "9.1 Configuration Table" in CISS Specification.
1022 */
1023 for (unsigned i = 0; i < 4; i++) {
1024 if (ddi_get8(smrt->smrt_ct_handle,
1025 &smrt->smrt_ct->Signature[i]) != signature[i]) {
1026 dev_err(smrt->smrt_dip, CE_WARN, "invalid signature "
1027 "detected");
1028 return (DDI_FAILURE);
1029 }
1030 }
1031
1032 /*
1033 * Initialise an appropriate Transport Method. For now, this driver
1034 * only supports the "Simple" method.
1035 */
1036 if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) {
1037 return (e);
1038 }
1039
1040 /*
1041 * Save some common feature support bitfields.
1042 */
1043 smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt);
1044 smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle,
1045 &smrt->smrt_ct->BusTypes);
1046
1047 /*
1048 * Read initial controller heartbeat value and mark the current
1049 * reading time.
1050 */
1051 smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle,
1052 &smrt->smrt_ct->HeartBeat);
1053 smrt->smrt_last_heartbeat_time = gethrtime();
1054
1055 /*
1056 * Determine the firmware version of the controller so that we can
1057 * select which type of interrupts to use.
1058 */
1059 if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT,
1060 &smrt->smrt_versions)) != 0) {
1061 dev_err(smrt->smrt_dip, CE_WARN, "could not identify "
1062 "controller (%d)", e);
1063 return (DDI_FAILURE);
1064 }
1065
1066 dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s",
1067 smrt->smrt_versions.smrtv_firmware_rev);
1068
1069 return (DDI_SUCCESS);
1070 }
1071
1072 void
smrt_ctlr_teardown(smrt_t * smrt)1073 smrt_ctlr_teardown(smrt_t *smrt)
1074 {
1075 smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
1076
1077 switch (smrt->smrt_ctlr_mode) {
1078 case SMRT_CTLR_MODE_SIMPLE:
1079 smrt_ctlr_teardown_simple(smrt);
1080 return;
1081
1082 case SMRT_CTLR_MODE_UNKNOWN:
1083 return;
1084 }
1085
1086 panic("unknown controller mode");
1087 }
1088
1089 int
smrt_ctlr_wait_for_state(smrt_t * smrt,smrt_wait_state_t state)1090 smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state)
1091 {
1092 unsigned wait_usec = 100 * 1000;
1093 unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec;
1094
1095 VERIFY(state == SMRT_WAIT_STATE_READY ||
1096 state == SMRT_WAIT_STATE_UNREADY);
1097
1098 /*
1099 * Read from the Scratchpad Register until the expected ready signature
1100 * is detected. This behaviour is not described in the CISS
1101 * specification.
1102 *
1103 * If the device is not in the desired state immediately, sleep for a
1104 * second and try again. If the device has not become ready in 300
1105 * seconds, give up.
1106 */
1107 for (unsigned i = 0; i < wait_count; i++) {
1108 uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
1109
1110 switch (state) {
1111 case SMRT_WAIT_STATE_READY:
1112 if (spr == CISS_SCRATCHPAD_INITIALISED) {
1113 return (DDI_SUCCESS);
1114 }
1115 break;
1116
1117 case SMRT_WAIT_STATE_UNREADY:
1118 if (spr != CISS_SCRATCHPAD_INITIALISED) {
1119 return (DDI_SUCCESS);
1120 }
1121 break;
1122 }
1123
1124 if (ddi_in_panic()) {
1125 /*
1126 * There is no sleep for the panicking, so we
1127 * must spin wait:
1128 */
1129 drv_usecwait(wait_usec);
1130 } else {
1131 /*
1132 * Wait for a quarter second and try again.
1133 */
1134 delay(drv_usectohz(wait_usec));
1135 }
1136 }
1137
1138 dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller "
1139 "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ?
1140 "ready": "unready");
1141 return (DDI_FAILURE);
1142 }
1143
1144 void
smrt_lockup_check(smrt_t * smrt)1145 smrt_lockup_check(smrt_t *smrt)
1146 {
1147 /*
1148 * Read the current controller heartbeat value.
1149 */
1150 uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle,
1151 &smrt->smrt_ct->HeartBeat);
1152
1153 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
1154
1155 /*
1156 * Check to see if the value is the same as last time we looked:
1157 */
1158 if (heartbeat != smrt->smrt_last_heartbeat) {
1159 /*
1160 * The heartbeat value has changed, which suggests that the
1161 * firmware in the controller has not yet come to a complete
1162 * stop. Record the new value, as well as the current time.
1163 */
1164 smrt->smrt_last_heartbeat = heartbeat;
1165 smrt->smrt_last_heartbeat_time = gethrtime();
1166 return;
1167 }
1168
1169 /*
1170 * The controller _might_ have been able to signal to us that is
1171 * has locked up. This is a truly unfathomable state of affairs:
1172 * If the firmware can tell it has flown off the rails, why not
1173 * simply reset the controller?
1174 */
1175 uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS);
1176 uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD);
1177 if ((odr & CISS_ODR_BIT_LOCKUP) != 0) {
1178 dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
1179 "reported a critical fault (odr %08x spr %08x)",
1180 odr, spr);
1181 }
1182
1183 if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) {
1184 dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has "
1185 "stopped responding (odr %08x spr %08x)",
1186 odr, spr);
1187 }
1188 }
1189
1190 /*
1191 * Probe the controller with the IDENTIFY CONTROLLER request. This is a BMIC
1192 * command, so it must be submitted to the controller and we must poll for its
1193 * completion. This functionality is only presently used during controller
1194 * initialisation, so it uses the special pre-initialisation path for command
1195 * allocation and submission.
1196 */
1197 static int
smrt_ctlr_identify(smrt_t * smrt,uint16_t timeout,smrt_identify_controller_t * resp)1198 smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout,
1199 smrt_identify_controller_t *resp)
1200 {
1201 smrt_command_t *smcm;
1202 smrt_identify_controller_req_t smicr;
1203 int r;
1204 size_t sz;
1205
1206 /*
1207 * Allocate a command with a data buffer; the controller will fill it
1208 * with identification information. There is some suggestion in the
1209 * firmware-level specification that the buffer length should be a
1210 * multiple of 512 bytes for some controllers, so we round up.
1211 */
1212 sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t);
1213 if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) {
1214 return (ENOMEM);
1215 }
1216
1217 smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN);
1218
1219 smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr);
1220 smcm->smcm_va_cmd->Request.Timeout = timeout;
1221 smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD;
1222 smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE;
1223 smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ;
1224
1225 /*
1226 * Construct the IDENTIFY CONTROLLER request CDB. Note that any
1227 * reserved fields in the request must be filled with zeroes.
1228 */
1229 bzero(&smicr, sizeof (smicr));
1230 smicr.smicr_opcode = CISS_SCMD_BMIC_READ;
1231 smicr.smicr_lun = 0;
1232 smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER;
1233 bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0],
1234 MIN(CISS_CDBLEN, sizeof (smicr)));
1235
1236 /*
1237 * Send the command to the device and poll for its completion.
1238 */
1239 smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
1240 smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
1241 if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) {
1242 VERIFY3S(r, ==, ETIMEDOUT);
1243 VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE);
1244
1245 /*
1246 * This command timed out, but the driver is not presently
1247 * initialised to the point where we can try to abort it.
1248 * The command was created with the PREINIT type, so it
1249 * does not appear in the global command tracking list.
1250 * In order to avoid problems with DMA from the controller,
1251 * we have to leak the command allocation.
1252 */
1253 smcm = NULL;
1254 goto out;
1255 }
1256
1257 if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
1258 /*
1259 * The controller was reset while we were trying to identify
1260 * it. Report failure.
1261 */
1262 r = EIO;
1263 goto out;
1264 }
1265
1266 if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
1267 ErrorInfo_t *ei = smcm->smcm_va_err;
1268
1269 if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) {
1270 dev_err(smrt->smrt_dip, CE_WARN, "identify "
1271 "controller error: status 0x%x",
1272 ei->CommandStatus);
1273 r = EIO;
1274 goto out;
1275 }
1276 }
1277
1278 if (resp != NULL) {
1279 /*
1280 * Copy the identify response out for the caller.
1281 */
1282 bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp));
1283 }
1284
1285 r = 0;
1286
1287 out:
1288 if (smcm != NULL) {
1289 smrt_command_free(smcm);
1290 }
1291 return (r);
1292 }
1293
/*
 * Copy a four byte firmware version field into a NUL-terminated string.
 *
 * Version fields in an IDENTIFY CONTROLLER response are generally four
 * ASCII characters forming a dotted decimal number; e.g., "8.00".  Each
 * source byte is kept only if it is a 7-bit value that is alphanumeric, a
 * period, or a space; anything else is replaced with a question mark.
 *
 * "src" must reference at least four bytes and "dst" must have room for
 * five (four characters plus the terminator).
 */
static void
smrt_copy_firmware_version(uint8_t *src, char *dst)
{
	unsigned pos;

	for (pos = 0; pos < 4; pos++) {
		uint8_t raw = src[pos];
		char out = '?';

		/*
		 * Only 7-bit clean values are candidates for being copied
		 * through.
		 */
		if (raw <= 0x7f) {
			char c = (char)(raw & 0x7f);

			if (isalnum(c) || c == '.' || c == ' ') {
				out = c;
			}
		}

		dst[pos] = out;
	}

	dst[4] = '\0';
}
1319
1320 /*
1321 * Using an IDENTIFY CONTROLLER request, determine firmware and controller
1322 * version details. See the comments for "smrt_ctlr_identify()" for more
1323 * details about calling context.
1324 */
1325 static int
smrt_ctlr_versions(smrt_t * smrt,uint16_t timeout,smrt_versions_t * smrtv)1326 smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv)
1327 {
1328 smrt_identify_controller_t smic;
1329 int r;
1330
1331 if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) {
1332 return (r);
1333 }
1334
1335 smrtv->smrtv_hardware_version = smic.smic_hardware_version;
1336 smrt_copy_firmware_version(smic.smic_firmware_rev,
1337 smrtv->smrtv_firmware_rev);
1338 smrt_copy_firmware_version(smic.smic_recovery_rev,
1339 smrtv->smrtv_recovery_rev);
1340 smrt_copy_firmware_version(smic.smic_bootblock_rev,
1341 smrtv->smrtv_bootblock_rev);
1342
1343 return (0);
1344 }
1345
1346 int
smrt_ctlr_reset(smrt_t * smrt)1347 smrt_ctlr_reset(smrt_t *smrt)
1348 {
1349 smrt_command_t *smcm, *smcm_nop;
1350 int r;
1351
1352 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
1353
1354 if (ddi_in_panic()) {
1355 goto skip_check;
1356 }
1357
1358 if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
1359 /*
1360 * Don't pile on. One reset is enough. Wait until
1361 * it's complete, and then return success.
1362 */
1363 while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
1364 cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
1365 }
1366 return (0);
1367 }
1368 smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING;
1369 smrt->smrt_last_reset_start = gethrtime();
1370 smrt->smrt_stats.smrts_ctlr_resets++;
1371
1372 skip_check:
1373 /*
1374 * Allocate two commands: one for the soft reset message, which we
1375 * cannot free until the controller has reset; and one for the ping we
1376 * will use to determine when it is once again functional.
1377 */
1378 mutex_exit(&smrt->smrt_mutex);
1379 if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
1380 KM_NOSLEEP)) == NULL) {
1381 mutex_enter(&smrt->smrt_mutex);
1382 return (ENOMEM);
1383 }
1384 if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
1385 KM_NOSLEEP)) == NULL) {
1386 smrt_command_free(smcm);
1387 mutex_enter(&smrt->smrt_mutex);
1388 return (ENOMEM);
1389 }
1390 mutex_enter(&smrt->smrt_mutex);
1391
1392 /*
1393 * Send a soft reset command to the controller. If this command
1394 * succeeds, there will likely be no completion notification. Instead,
1395 * the device should become unavailable for some period of time and
1396 * then become available again. Once available again, we know the soft
1397 * reset has completed and should abort all in-flight commands.
1398 */
1399 smrt_write_message_reset_ctlr(smcm);
1400
1401 /*
1402 * Disable interrupts now.
1403 */
1404 smrt_intr_set(smrt, B_FALSE);
1405
1406 dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset");
1407 smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
1408 if ((r = smrt_submit(smrt, smcm)) != 0) {
1409 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1410 "submit failed (%d)", r);
1411 }
1412
1413 /*
1414 * Mark every currently inflight command as being reset, including the
1415 * soft reset command we just sent. Once we confirm the reset works,
1416 * we can safely report that these commands have failed.
1417 */
1418 for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
1419 t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) {
1420 t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT;
1421 }
1422
1423 /*
1424 * Now that we have submitted our soft reset command, prevent
1425 * the rest of the driver from interacting with the controller.
1426 */
1427 smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING;
1428
1429 /*
1430 * We do not expect a completion from the controller for our soft
1431 * reset command, but we also cannot remove it from the inflight
1432 * list until we know the controller has actually reset. To do
1433 * otherwise would potentially allow the controller to scribble
1434 * on the memory we were using.
1435 */
1436 smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED;
1437
1438 if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) !=
1439 DDI_SUCCESS) {
1440 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1441 "controller did not become unready");
1442 }
1443 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready");
1444
1445 if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) !=
1446 DDI_SUCCESS) {
1447 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1448 "controller did not come become ready");
1449 }
1450 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready");
1451
1452 /*
1453 * In at least the Smart Array P420i, the controller can take 30-45
1454 * seconds after the scratchpad register shows it as being available
1455 * before it is ready to receive commands. In order to avoid hitting
1456 * it too early with our post-reset ping, we will sleep for 10 seconds
1457 * here.
1458 */
1459 if (ddi_in_panic()) {
1460 drv_usecwait(10 * MICROSEC);
1461 } else {
1462 delay(drv_usectohz(10 * MICROSEC));
1463 }
1464
1465 smrt_ctlr_teardown(smrt);
1466 if (smrt_ctlr_init(smrt) != DDI_SUCCESS) {
1467 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1468 "controller transport could not be configured");
1469 }
1470 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured");
1471
1472 smrt_write_message_nop(smcm_nop, 0);
1473 smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED |
1474 SMRT_CMD_IGNORE_RUNNING;
1475 if ((r = smrt_submit(smrt, smcm_nop)) != 0) {
1476 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1477 "ping could not be submitted (%d)", r);
1478 }
1479
1480 /*
1481 * Interrupts are still masked at this stage. Poll manually in
1482 * a way that will not trigger regular finish queue processing:
1483 */
1484 VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT);
1485 for (unsigned i = 0; i < 600; i++) {
1486 smrt_retrieve_simple(smrt);
1487
1488 if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) {
1489 /*
1490 * Remove the ping command from the finish queue and
1491 * process it manually. This processing must mirror
1492 * what would have been done in smrt_process_finishq().
1493 */
1494 VERIFY(list_link_active(&smcm_nop->smcm_link_finish));
1495 list_remove(&smrt->smrt_finishq, smcm_nop);
1496 smrt_process_finishq_sync(smcm_nop);
1497 smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE;
1498 smrt_process_finishq_one(smcm_nop);
1499 break;
1500 }
1501
1502 if (ddi_in_panic()) {
1503 drv_usecwait(100 * 1000);
1504 } else {
1505 delay(drv_usectohz(100 * 1000));
1506 }
1507 }
1508
1509 if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) {
1510 dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: "
1511 "ping did not complete");
1512 } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) {
1513 dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed "
1514 "in error (status %u)",
1515 (unsigned)smcm_nop->smcm_va_err->CommandStatus);
1516 } else {
1517 dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed");
1518 }
1519
1520 /*
1521 * Now that the controller is working again, we can abort any
1522 * commands that were inflight during the reset.
1523 */
1524 smrt_command_t *nt;
1525 for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
1526 t != NULL; t = nt) {
1527 nt = AVL_NEXT(&smrt->smrt_inflight, t);
1528
1529 if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
1530 avl_remove(&smrt->smrt_inflight, t);
1531 t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
1532
1533 list_insert_tail(&smrt->smrt_finishq, t);
1534 }
1535 }
1536
1537 /*
1538 * Quiesce our discovery thread. Note, because
1539 * SMRT_CTLR_STATUS_RESTARTING is set, nothing can cause it to be
1540 * enabled again.
1541 */
1542 if (!ddi_in_panic()) {
1543 mutex_exit(&smrt->smrt_mutex);
1544 ddi_taskq_wait(smrt->smrt_discover_taskq);
1545 mutex_enter(&smrt->smrt_mutex);
1546 }
1547
1548 /*
1549 * Re-enable interrupts. Now, we must kick off a discovery to make sure
1550 * that the system is in a sane state and that we can perform I/O.
1551 */
1552 smrt_intr_set(smrt, B_TRUE);
1553 smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING;
1554 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED;
1555
1556 /*
1557 * Attempt a discovery to make sure that the drivers sees a realistic
1558 * view of the world. If we're not in panic context, spin for the
1559 * asynchronous process to complete, otherwise we're in panic context
1560 * and this is going to happen regardless if we want it to or not.
1561 * Before we kick off the request to run discovery, we reset the
1562 * discovery request flags as we know that nothing else can consider
1563 * running discovery and we don't want to delay until the next smrt
1564 * periodic tick if we can avoid it. In panic context, if this failed,
1565 * then we won't make it back.
1566 */
1567 VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING);
1568 smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK);
1569 smrt_discover(smrt);
1570 if (!ddi_in_panic()) {
1571 while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
1572 cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
1573 }
1574 }
1575
1576 smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
1577 smrt->smrt_last_reset_finish = gethrtime();
1578
1579 /*
1580 * Wake anybody that was waiting for the reset to complete.
1581 */
1582 cv_broadcast(&smrt->smrt_cv_finishq);
1583
1584 /*
1585 * Process the completion queue one last time before we let go
1586 * of the mutex.
1587 */
1588 smrt_process_finishq(smrt);
1589
1590 mutex_exit(&smrt->smrt_mutex);
1591 smrt_command_free(smcm_nop);
1592 mutex_enter(&smrt->smrt_mutex);
1593 return (0);
1594 }
1595
1596 int
smrt_event_init(smrt_t * smrt)1597 smrt_event_init(smrt_t *smrt)
1598 {
1599 int ret;
1600 smrt_command_t *event, *cancel;
1601
1602 event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP);
1603 if (event == NULL)
1604 return (ENOMEM);
1605 if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN,
1606 KM_NOSLEEP) != 0) {
1607 smrt_command_free(event);
1608 return (ENOMEM);
1609 }
1610 smrt_write_message_event_notify(event);
1611
1612 cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP);
1613 if (cancel == NULL) {
1614 smrt_command_free(event);
1615 return (ENOMEM);
1616 }
1617 if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN,
1618 KM_NOSLEEP) != 0) {
1619 smrt_command_free(event);
1620 smrt_command_free(cancel);
1621 return (ENOMEM);
1622 }
1623 smrt_write_message_cancel_event_notify(cancel);
1624
1625 cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL);
1626
1627 mutex_enter(&smrt->smrt_mutex);
1628 if ((ret = smrt_submit(smrt, event)) != 0) {
1629 mutex_exit(&smrt->smrt_mutex);
1630 smrt_command_free(event);
1631 smrt_command_free(cancel);
1632 return (ret);
1633 }
1634
1635 smrt->smrt_event_cmd = event;
1636 smrt->smrt_event_cancel_cmd = cancel;
1637 mutex_exit(&smrt->smrt_mutex);
1638
1639 return (0);
1640 }
1641
1642 void
smrt_event_complete(smrt_command_t * smcm)1643 smrt_event_complete(smrt_command_t *smcm)
1644 {
1645 smrt_event_notify_t *sen;
1646 boolean_t log, rescan;
1647
1648 boolean_t intervene = B_FALSE;
1649 smrt_t *smrt = smcm->smcm_ctlr;
1650
1651 VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
1652 VERIFY3P(smcm, ==, smrt->smrt_event_cmd);
1653 VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION);
1654
1655 smrt->smrt_stats.smrts_events_received++;
1656
1657 if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
1658 cv_signal(&smrt->smrt_event_queue);
1659 return;
1660 }
1661
1662 if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) {
1663 intervene = B_TRUE;
1664 goto clean;
1665 }
1666
1667 /*
1668 * The event notification command failed for some reason. Attempt to
1669 * drive on and try again at the next intervention period. Because this
1670 * may represent a programmer error (though it's hard to know), we wait
1671 * until the next intervention period and don't panic.
1672 */
1673 if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) {
1674 ErrorInfo_t *ei = smcm->smcm_va_err;
1675 intervene = B_TRUE;
1676
1677 smrt->smrt_stats.smrts_events_errors++;
1678 dev_err(smrt->smrt_dip, CE_WARN, "!event notification request "
1679 "error: status 0x%x", ei->CommandStatus);
1680 goto clean;
1681 }
1682
1683 sen = smcm->smcm_internal->smcmi_va;
1684 log = rescan = B_FALSE;
1685 switch (sen->sen_class) {
1686 case SMRT_EVENT_CLASS_PROTOCOL:
1687 /*
1688 * Most of the event protocol class events aren't really
1689 * actionable. However, subclass 1 indicates errors. Today,
1690 * the only error is an event overflow. If there's an event
1691 * overflow, then we must assume that we need to rescan.
1692 */
1693 if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) {
1694 rescan = B_TRUE;
1695 }
1696 break;
1697 case SMRT_EVENT_CLASS_HOTPLUG:
1698 /*
1699 * We want to log all hotplug events. However we only need to
1700 * scan these if the subclass indicates the event is for a disk.
1701 */
1702 log = B_TRUE;
1703 if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) {
1704 rescan = B_TRUE;
1705 }
1706 break;
1707 case SMRT_EVENT_CLASS_HWERROR:
1708 case SMRT_EVENT_CLASS_ENVIRONMENT:
1709 log = B_TRUE;
1710 break;
1711 case SMRT_EVENT_CLASS_PHYS:
1712 log = B_TRUE;
1713 /*
1714 * This subclass indicates some change for physical drives. As
1715 * such, this should trigger a rescan.
1716 */
1717 if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) {
1718 rescan = B_TRUE;
1719 }
1720 break;
1721 case SMRT_EVENT_CLASS_LOGVOL:
1722 rescan = B_TRUE;
1723 log = B_TRUE;
1724 break;
1725 default:
1726 /*
1727 * While there are other classes of events, it's hard to say how
1728 * actionable they are for the moment. If we revamp this such
1729 * that it becomes an ireport based system, then we should just
1730 * always log these. We opt not to at the moment to try and be
1731 * kind to the system log.
1732 */
1733 break;
1734 }
1735
1736 /*
1737 * Ideally, this would be an ireport that we could pass onto
1738 * administrators; however, since we don't have any way to generate
1739 * that, we provide a subset of the event information.
1740 */
1741 if (log) {
1742 const char *rmsg;
1743 if (rescan == B_TRUE) {
1744 rmsg = "rescanning";
1745 } else {
1746 rmsg = "not rescanning";
1747 }
1748 if (sen->sen_message[0] != '\0') {
1749 sen->sen_message[sizeof (sen->sen_message) - 1] = '\0';
1750 dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
1751 "class/sub-class/detail %x, %x, %x: %s; %s devices",
1752 sen->sen_class, sen->sen_subclass, sen->sen_detail,
1753 sen->sen_message, rmsg);
1754 } else {
1755 dev_err(smrt->smrt_dip, CE_NOTE, "!controller event "
1756 "class/sub-class/detail %x, %x, %x; %s devices",
1757 sen->sen_class, sen->sen_subclass, sen->sen_detail,
1758 rmsg);
1759 }
1760 }
1761
1762 if (rescan)
1763 smrt_discover_request(smrt);
1764
1765 clean:
1766 mutex_exit(&smrt->smrt_mutex);
1767 smrt_command_reuse(smcm);
1768 bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN);
1769 mutex_enter(&smrt->smrt_mutex);
1770
1771 /*
1772 * Make sure we're not _now_ detaching or resetting.
1773 */
1774 if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) {
1775 cv_signal(&smrt->smrt_event_queue);
1776 return;
1777 }
1778
1779 if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 ||
1780 intervene == B_TRUE) {
1781 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
1782 return;
1783 }
1784
1785 /*
1786 * Check out command count per tick. If it's too high, leave it for
1787 * intervention to solve. Likely there is some serious driver or
1788 * firmware error going on.
1789 */
1790 smrt->smrt_event_count++;
1791 if (smrt->smrt_event_count > smrt_event_intervention_threshold) {
1792 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
1793 return;
1794 }
1795
1796 if (smrt_submit(smrt, smcm) != 0) {
1797 smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
1798 }
1799 }
1800
1801 void
smrt_event_fini(smrt_t * smrt)1802 smrt_event_fini(smrt_t *smrt)
1803 {
1804 int ret;
1805 smrt_command_t *event, *cancel;
1806 mutex_enter(&smrt->smrt_mutex);
1807
1808 /*
1809 * If intervention has been requested, there is nothing for us to do. We
1810 * clear the flag so nothing else accidentally sees this and takes
1811 * action. We also don't need to bother sending a cancellation request,
1812 * as there is no outstanding event.
1813 */
1814 if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) {
1815 smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION;
1816 goto free;
1817 }
1818
1819 /*
1820 * Submit a cancel request for the event notification queue. Because we
1821 * submit both the cancel event and the regular notification event as an
1822 * ordered command, we know that by the time this completes, that the
1823 * existing one will have completed.
1824 */
1825 smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED;
1826 if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
1827 /*
1828 * This is unfortunate. We've failed to submit the command. At
1829 * this point all we can do is reset the device. If the reset
1830 * succeeds, we're done and we can clear all the memory. If it
1831 * fails, then all we can do is just leak the command and scream
1832 * to the system, sorry.
1833 */
1834 if (smrt_ctlr_reset(smrt) != 0) {
1835 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
1836 "device after failure to submit cancellation "
1837 "(%d), abandoning smrt_command_t at address %p",
1838 ret, smrt->smrt_event_cmd);
1839 smrt->smrt_event_cmd = NULL;
1840 goto free;
1841 }
1842 }
1843
1844 smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() +
1845 SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC;
1846 if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) {
1847 VERIFY3S(ret, ==, ETIMEDOUT);
1848 VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status &
1849 SMRT_CMD_STATUS_POLL_COMPLETE);
1850
1851 /*
1852 * The command timed out. All we can do is hope a reset will
1853 * work.
1854 */
1855 if (smrt_ctlr_reset(smrt) != 0) {
1856 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
1857 "device after failure to poll for async "
1858 "cancellation command abandoning smrt_command_t "
1859 "event command at address %p and cancellation "
1860 "command at %p", smrt->smrt_event_cmd,
1861 smrt->smrt_event_cancel_cmd);
1862 smrt->smrt_event_cmd = NULL;
1863 smrt->smrt_event_cancel_cmd = NULL;
1864 goto free;
1865 }
1866
1867 }
1868
1869 /*
1870 * Well, in the end, it's results that count.
1871 */
1872 if (smrt->smrt_event_cancel_cmd->smcm_status &
1873 SMRT_CMD_STATUS_RESET_SENT) {
1874 goto free;
1875 }
1876
1877 if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) {
1878 ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err;
1879
1880 /*
1881 * This can return a CISS_CMD_TARGET_STATUS entry when the
1882 * controller doesn't think a command is outstanding. It is
1883 * possible we raced, so don't think too much about that case.
1884 * Anything else leaves us between a rock and a hard place, the
1885 * only way out is a reset.
1886 */
1887 if (ei->CommandStatus != CISS_CMD_TARGET_STATUS &&
1888 smrt_ctlr_reset(smrt) != 0) {
1889 dev_err(smrt->smrt_dip, CE_WARN, "failed to reset "
1890 "device after receiving an error on the async "
1891 "cancellation command (%d); abandoning "
1892 "smrt_command_t event command at address %p and "
1893 "cancellation command at %p", ei->CommandStatus,
1894 smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd);
1895 smrt->smrt_event_cmd = NULL;
1896 smrt->smrt_event_cancel_cmd = NULL;
1897 goto free;
1898 }
1899 }
1900
1901 free:
1902 event = smrt->smrt_event_cmd;
1903 smrt->smrt_event_cmd = NULL;
1904 cancel = smrt->smrt_event_cancel_cmd;
1905 smrt->smrt_event_cancel_cmd = NULL;
1906 mutex_exit(&smrt->smrt_mutex);
1907 if (event != NULL)
1908 smrt_command_free(event);
1909 if (cancel != NULL)
1910 smrt_command_free(cancel);
1911 cv_destroy(&smrt->smrt_event_queue);
1912 }
1913
1914 /*
1915 * We've been asked to do a discovery in panic context. This would have
1916 * occurred because there was a device reset. Because we can't rely on the
1917 * target maps, all we can do at the moment is go over all the active targets
1918 * and note which ones no longer exist. If this target was required to dump,
1919 * then the dump code will encounter a fatal error. If not, then we should
1920 * count ourselves surprisingly lucky.
1921 */
1922 static void
smrt_discover_panic_check(smrt_t * smrt)1923 smrt_discover_panic_check(smrt_t *smrt)
1924 {
1925 smrt_target_t *smtg;
1926
1927 ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
1928 for (smtg = list_head(&smrt->smrt_targets); smtg != NULL;
1929 smtg = list_next(&smrt->smrt_targets, smtg)) {
1930 uint64_t gen;
1931
1932 if (smtg->smtg_physical) {
1933 smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys;
1934 /*
1935 * Don't worry about drives that aren't visible.
1936 */
1937 if (!smpt->smpt_visible)
1938 continue;
1939 gen = smpt->smpt_gen;
1940 } else {
1941 smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol;
1942 gen = smlv->smlv_gen;
1943 }
1944
1945 if (gen != smrt->smrt_discover_gen) {
1946 dev_err(smrt->smrt_dip, CE_WARN, "target %s "
1947 "disappeared during post-panic discovery",
1948 scsi_device_unit_address(smtg->smtg_scsi_dev));
1949 smtg->smtg_gone = B_TRUE;
1950 }
1951 }
1952 }
1953
1954 static void
smrt_discover(void * arg)1955 smrt_discover(void *arg)
1956 {
1957 int log = 0, phys = 0;
1958 smrt_t *smrt = arg;
1959 uint64_t gen;
1960 boolean_t runphys, runvirt;
1961
1962 mutex_enter(&smrt->smrt_mutex);
1963 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING;
1964 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED;
1965
1966 smrt->smrt_discover_gen++;
1967 gen = smrt->smrt_discover_gen;
1968 runphys = smrt->smrt_phys_tgtmap != NULL;
1969 runvirt = smrt->smrt_virt_tgtmap != NULL;
1970 mutex_exit(&smrt->smrt_mutex);
1971 if (runphys)
1972 phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
1973 if (runvirt)
1974 log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen);
1975 mutex_enter(&smrt->smrt_mutex);
1976
1977 if (phys != 0 || log != 0) {
1978 if (!ddi_in_panic()) {
1979 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
1980 } else {
1981 panic("smrt_t %p failed to perform discovery after "
1982 "a reset in panic context, unable to continue. "
1983 "logvol: %d, phys: %d", smrt, log, phys);
1984 }
1985 } else {
1986 if (!ddi_in_panic() &&
1987 smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
1988 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED;
1989 cv_broadcast(&smrt->smrt_cv_finishq);
1990 }
1991
1992 if (ddi_in_panic()) {
1993 smrt_discover_panic_check(smrt);
1994 }
1995 }
1996 smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING;
1997 if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED)
1998 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
1999 mutex_exit(&smrt->smrt_mutex);
2000 }
2001
2002 /*
2003 * Request discovery, which is always run via a taskq.
2004 */
2005 void
smrt_discover_request(smrt_t * smrt)2006 smrt_discover_request(smrt_t *smrt)
2007 {
2008 boolean_t run;
2009 ASSERT(MUTEX_HELD(&smrt->smrt_mutex));
2010
2011 if (ddi_in_panic()) {
2012 smrt_discover(smrt);
2013 return;
2014 }
2015
2016 run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0;
2017 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED;
2018 if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq,
2019 smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) {
2020 smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC;
2021 smrt->smrt_stats.smrts_discovery_tq_errors++;
2022 }
2023 }
2024