1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2012-2014 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/bus.h>
31 #include <sys/conf.h>
32 #include <sys/domainset.h>
33 #include <sys/proc.h>
34 #include <sys/sbuf.h>
35
36 #include <dev/pci/pcivar.h>
37
38 #include "nvme_private.h"
39
40 typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
41 #define DO_NOT_RETRY 1
42
43 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
44 struct nvme_request *req);
45 static void nvme_qpair_destroy(struct nvme_qpair *qpair);
46
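/*
 * Render the opcode name for an admin or I/O command into the caller's
 * buffer using a fixed-size sbuf; returns an empty string on overflow.
 */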
47 static const char *
48 get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
49 {
50 struct sbuf sb;
51
52 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
53 nvme_opcode_sbuf(admin, opc, &sb);
54 if (sbuf_finish(&sb) != 0)
55 return ("");
56 return (buf);
57 }
58
59 static void
60 nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
61 struct nvme_command *cmd)
62 {
63 char buf[64];
64
65 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
66 "cdw10:%08x cdw11:%08x\n",
67 get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
68 cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
69 le32toh(cmd->cdw11));
70 }
71
72 static void
73 nvme_io_qpair_print_command(struct nvme_qpair *qpair,
74 struct nvme_command *cmd)
75 {
76 char buf[64];
77
78 switch (cmd->opc) {
79 case NVME_OPC_WRITE:
80 case NVME_OPC_READ:
81 case NVME_OPC_WRITE_UNCORRECTABLE:
82 case NVME_OPC_COMPARE:
83 case NVME_OPC_WRITE_ZEROES:
84 case NVME_OPC_VERIFY:
85 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
86 "lba:%llu len:%d\n",
87 get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
88 qpair->id, cmd->cid, le32toh(cmd->nsid),
89 ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
90 (le32toh(cmd->cdw12) & 0xFFFF) + 1);
91 break;
92 default:
93 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
94 get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
95 qpair->id, cmd->cid, le32toh(cmd->nsid));
96 break;
97 }
98 }
99
100 void
101 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
102 {
103 if (qpair->id == 0)
104 nvme_admin_qpair_print_command(qpair, cmd);
105 else
106 nvme_io_qpair_print_command(qpair, cmd);
107 if (nvme_verbose_cmd_dump) {
108 nvme_printf(qpair->ctrlr,
109 "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
110 cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
111 (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
112 nvme_printf(qpair->ctrlr,
113 "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
114 cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
115 cmd->cdw15);
116 }
117 }
118
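/*
 * Like get_opcode_string, but renders the completion's status code text
 * instead of an opcode name.
 */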
119 static const char *
120 get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
121 {
122 struct sbuf sb;
123
124 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
125 nvme_sc_sbuf(cpl, &sb);
126 if (sbuf_finish(&sb) != 0)
127 return ("");
128 return (buf);
129 }
130
131 void
132 nvme_qpair_print_completion(struct nvme_qpair *qpair,
133 struct nvme_completion *cpl)
134 {
135 char buf[64];
136 uint8_t crd, m, dnr, p;
137
138 crd = NVME_STATUS_GET_CRD(cpl->status);
139 m = NVME_STATUS_GET_M(cpl->status);
140 dnr = NVME_STATUS_GET_DNR(cpl->status);
141 p = NVME_STATUS_GET_P(cpl->status);
142
143 nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
144 "sqid:%d cid:%d cdw0:%x\n",
145 get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
146 cpl->sqid, cpl->cid, cpl->cdw0);
147 }
148
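/*
 * Decide whether a failed command is worth retrying. Only a small set of
 * generic and path-related status codes qualify, and only when the
 * controller did not set the Do Not Retry bit.
 */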
149 static bool
150 nvme_completion_is_retry(const struct nvme_completion *cpl)
151 {
152 uint8_t sct, sc, dnr;
153
154 sct = NVME_STATUS_GET_SCT(cpl->status);
155 sc = NVME_STATUS_GET_SC(cpl->status);
156 dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */
157
158 /*
159 * TODO: spec is not clear how commands that are aborted due
160 * to TLER will be marked. So for now, it seems
161 * NAMESPACE_NOT_READY is the only case where we should
162 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
163 * set the DNR bit correctly since the driver controls that.
164 */
165 switch (sct) {
166 case NVME_SCT_GENERIC:
167 switch (sc) {
168 case NVME_SC_ABORTED_BY_REQUEST:
169 case NVME_SC_NAMESPACE_NOT_READY:
170 if (dnr)
171 return (0);
172 else
173 return (1);
174 case NVME_SC_INVALID_OPCODE:
175 case NVME_SC_INVALID_FIELD:
176 case NVME_SC_COMMAND_ID_CONFLICT:
177 case NVME_SC_DATA_TRANSFER_ERROR:
178 case NVME_SC_ABORTED_POWER_LOSS:
179 case NVME_SC_INTERNAL_DEVICE_ERROR:
180 case NVME_SC_ABORTED_SQ_DELETION:
181 case NVME_SC_ABORTED_FAILED_FUSED:
182 case NVME_SC_ABORTED_MISSING_FUSED:
183 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
184 case NVME_SC_COMMAND_SEQUENCE_ERROR:
185 case NVME_SC_LBA_OUT_OF_RANGE:
186 case NVME_SC_CAPACITY_EXCEEDED:
187 default:
188 return (0);
189 }
190 case NVME_SCT_COMMAND_SPECIFIC:
191 case NVME_SCT_MEDIA_ERROR:
192 return (0);
193 case NVME_SCT_PATH_RELATED:
194 switch (sc) {
195 case NVME_SC_INTERNAL_PATH_ERROR:
196 if (dnr)
197 return (0);
198 else
199 return (1);
200 default:
201 return (0);
202 }
203 case NVME_SCT_VENDOR_SPECIFIC:
204 default:
205 return (0);
206 }
207 }
208
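/*
 * Finish a tracker whose completion has arrived (or been synthesized):
 * decide whether to retry it, invoke the request callback if not, and
 * recycle the tracker, kicking off a queued request if one is waiting.
 */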
209 static void
210 nvme_qpair_complete_tracker(struct nvme_tracker *tr,
211 struct nvme_completion *cpl, error_print_t print_on_error)
212 {
213 struct nvme_qpair *qpair = tr->qpair;
214 struct nvme_request *req;
215 bool retry, error, retriable;
216
217 mtx_assert(&qpair->lock, MA_NOTOWNED);
218
219 req = tr->req;
220 error = nvme_completion_is_error(cpl);
221 retriable = nvme_completion_is_retry(cpl);
222 retry = error && retriable && req->retries < nvme_retry_count;
223 if (retry)
224 qpair->num_retries++;
225 if (error && req->retries >= nvme_retry_count && retriable)
226 qpair->num_failures++;
227
228 if (error && (print_on_error == ERROR_PRINT_ALL ||
229 (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
230 nvme_qpair_print_command(qpair, &req->cmd);
231 nvme_qpair_print_completion(qpair, cpl);
232 }
233
234 qpair->act_tr[cpl->cid] = NULL;
235
236 KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));
237
238 if (!retry) {
239 if (req->payload_valid) {
240 bus_dmamap_sync(qpair->dma_tag_payload,
241 tr->payload_dma_map,
242 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
243 }
244 if (req->cb_fn)
245 req->cb_fn(req->cb_arg, cpl);
246 }
247
248 mtx_lock(&qpair->lock);
249
250 if (retry) {
251 req->retries++;
252 nvme_qpair_submit_tracker(qpair, tr);
253 } else {
254 if (req->payload_valid) {
255 bus_dmamap_unload(qpair->dma_tag_payload,
256 tr->payload_dma_map);
257 }
258
259 nvme_free_request(req);
260 tr->req = NULL;
261
262 TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
263 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
264
265 /*
266 * If the controller is in the middle of resetting, don't
267 * try to submit queued requests here - let the reset logic
268 * handle that instead.
269 */
270 if (!STAILQ_EMPTY(&qpair->queued_req) &&
271 !qpair->ctrlr->is_resetting) {
272 req = STAILQ_FIRST(&qpair->queued_req);
273 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
274 _nvme_qpair_submit_request(qpair, req);
275 }
276 }
277
278 mtx_unlock(&qpair->lock);
279 }
280
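/* Build an artificial completion status word from the SCT/SC/DNR fields. */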
281 static uint32_t
282 nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
283 {
284 uint32_t status = 0;
285
286 status |= NVMEF(NVME_STATUS_SCT, sct);
287 status |= NVMEF(NVME_STATUS_SC, sc);
288 status |= NVMEF(NVME_STATUS_DNR, dnr);
289 /* M=0 : this is artificial so no data in error log page */
290 /* CRD=0 : this is artificial and no delayed retry support anyway */
291 /* P=0 : phase not checked */
292 return (status);
293 }
294
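/*
 * Fabricate a completion for a tracker the controller will never complete
 * (abort, reset, or failure paths) and run the normal completion path.
 */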
295 static void
296 nvme_qpair_manual_complete_tracker(
297 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
298 error_print_t print_on_error)
299 {
300 struct nvme_completion cpl;
301 struct nvme_qpair * qpair = tr->qpair;
302
303 mtx_assert(&qpair->lock, MA_NOTOWNED);
304
305 memset(&cpl, 0, sizeof(cpl));
306
307 cpl.sqid = qpair->id;
308 cpl.cid = tr->cid;
309 cpl.status = nvme_qpair_make_status(sct, sc, dnr);
310 nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
311 }
312
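/*
 * Fabricate a completion for a request that never reached the hardware
 * (no tracker was ever assigned), invoke its callback, and free it.
 */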
313 static void
314 nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
315 struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
316 error_print_t print_on_error)
317 {
318 struct nvme_completion cpl;
319 bool error;
320
321 memset(&cpl, 0, sizeof(cpl));
322 cpl.sqid = qpair->id;
323 cpl.status = nvme_qpair_make_status(sct, sc, dnr);
324 error = nvme_completion_is_error(&cpl);
325
326 if (error && print_on_error == ERROR_PRINT_ALL) {
327 nvme_qpair_print_command(qpair, &req->cmd);
328 nvme_qpair_print_completion(qpair, &cpl);
329 }
330
331 if (req->cb_fn)
332 req->cb_fn(req->cb_arg, &cpl);
333
334 nvme_free_request(req);
335 }
336
337 /* Locked version of completion processor */
338 static bool
339 _nvme_qpair_process_completions(struct nvme_qpair *qpair)
340 {
341 struct nvme_tracker *tr;
342 struct nvme_completion cpl;
343 bool done = false;
344 bool in_panic = dumping || SCHEDULER_STOPPED();
345
346 mtx_assert(&qpair->recovery, MA_OWNED);
347
348 /*
349 * The qpair is not enabled, likely because a controller reset is in
350 * progress. Ignore the interrupt - any I/O that was associated with
351 * this interrupt will get retried when the reset is complete. Any
352 * completions pending during startup will be processed as soon as
353 * initialization is complete and we start sending commands to the
354 * device.
355 */
356 if (qpair->recovery_state != RECOVERY_NONE) {
357 qpair->num_ignored++;
358 return (false);
359 }
360
361 /*
362 * Sanity check initialization. After we reset the hardware, the phase
363 * is defined to be 1. So if we get here with zero prior calls and the
364 * phase is 0, it means that we've lost a race between the
365 * initialization and the ISR running. With the phase wrong, we'll
366 * process a bunch of completions that aren't really completions leading
367 * to a KASSERT below.
368 */
369 KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
370 ("%s: Phase wrong for first interrupt call.",
371 device_get_nameunit(qpair->ctrlr->dev)));
372
373 qpair->num_intr_handler_calls++;
374
375 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
376 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
377 /*
378 * A panic can stop the CPU this routine is running on at any point. If
379 * we're called during a panic, complete the cq_head wrap protocol for
380 * the case where we are interrupted just after the increment at 1
381 * below, but before we can reset cq_head to zero at 2. Also cope with
382 * the case where we do the zero at 2, but may or may not have done the
383 * phase adjustment at step 3. The panic machinery flushes all pending
384 * memory writes, so we can make these strong ordering assumptions
385 * that would otherwise be unwise if we were racing in real time.
386 */
387 if (__predict_false(in_panic)) {
388 if (qpair->cq_head == qpair->num_entries) {
389 /*
390 * The increment at 1 happened, but the zeroing at 2 did not,
391 * so the phase negation at 3 (which follows the atomic_store_rel)
392 * cannot have happened either. Finish steps 2 and 3 here.
393 */
394 qpair->cq_head = 0;
395 qpair->phase = !qpair->phase;
396 } else if (qpair->cq_head == 0) {
397 /*
398 * In this case, we know that the zeroing at 2 happened, but we
399 * don't know whether the phase adjustment at 3 did. To resync,
400 * we look at the last completion entry and set the phase to the
401 * opposite of the phase recorded there. This gets us back in
402 * sync.
403 */
404 cpl = qpair->cpl[qpair->num_entries - 1];
405 nvme_completion_swapbytes(&cpl);
406 qpair->phase = !NVME_STATUS_GET_P(cpl.status);
407 }
408 }
409
410 while (1) {
411 uint16_t status;
412
413 /*
414 * We need to do this dance to avoid a race between the host and
415 * the device where the device overtakes the host while the host
416 * is reading this record, leaving the status field 'new' and
417 * the sqhd and cid fields potentially stale. If the phase
418 * doesn't match, that means status hasn't yet been updated and
419 * we'll get any pending changes next time. It also means that
420 * the phase must be the same the second time. We have to sync
421 * before reading to ensure any bouncing completes.
422 */
423 status = le16toh(qpair->cpl[qpair->cq_head].status);
424 if (NVME_STATUS_GET_P(status) != qpair->phase)
425 break;
426
427 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
428 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
429 cpl = qpair->cpl[qpair->cq_head];
430 nvme_completion_swapbytes(&cpl);
431
432 KASSERT(
433 NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
434 ("Phase unexpectedly inconsistent"));
435
436 if (cpl.cid < qpair->num_trackers)
437 tr = qpair->act_tr[cpl.cid];
438 else
439 tr = NULL;
440
441 done = true;
442 if (tr != NULL) {
443 nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
444 qpair->sq_head = cpl.sqhd;
445 } else if (!in_panic) {
446 /*
447 * A missing tracker is normally an error. However, a
448 * panic can stop the CPU this routine is running on
449 * after completing an I/O but before updating
450 * qpair->cq_head at 1 below. Later, we re-enter this
451 * routine to poll I/O associated with the kernel
452 * dump. We find that the tr has been set to null before
453 * calling the completion routine. If it hasn't
454 * completed (or it triggers a panic), then '1' below
455 * won't have updated cq_head. Rather than panic again,
456 * ignore this condition because it's not unexpected.
457 */
458 nvme_printf(qpair->ctrlr,
459 "cpl (cid = %u) does not map to outstanding cmd\n",
460 cpl.cid);
461 nvme_qpair_print_completion(qpair,
462 &qpair->cpl[qpair->cq_head]);
463 KASSERT(0, ("received completion for unknown cmd"));
464 }
465
466 /*
467 * There are a number of races with the following (see above) when
468 * the system panics. We compensate for each one of them by
469 * using the atomic store to force strong ordering (at least when
470 * viewed in the aftermath of a panic).
471 */
472 if (++qpair->cq_head == qpair->num_entries) { /* 1 */
473 atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */
474 qpair->phase = !qpair->phase; /* 3 */
475 }
476 }
477
478 if (done) {
479 bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
480 qpair->cq_hdbl_off, qpair->cq_head);
481 }
482
483 return (done);
484 }
485
486 bool
487 nvme_qpair_process_completions(struct nvme_qpair *qpair)
488 {
489 bool done = false;
490
491 /*
492 * Interlock with the reset / recovery code. This lock is usually
493 * uncontended; it makes sure that we drain out of the ISRs before we
494 * reset the card and prevents races with the recovery process called
495 * from a timeout context.
496 */
497 mtx_lock(&qpair->recovery);
498
499 if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
500 done = _nvme_qpair_process_completions(qpair);
501 else
502 qpair->num_recovery_nolock++; // XXX likely need to rename
503
504 mtx_unlock(&qpair->recovery);
505
506 return (done);
507 }
508
509 static void
510 nvme_qpair_msi_handler(void *arg)
511 {
512 struct nvme_qpair *qpair = arg;
513
514 nvme_qpair_process_completions(qpair);
515 }
516
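/*
 * Allocate and initialize a queue pair: DMA tags, submission/completion
 * ring memory, per-tracker PRP lists, and (when per-queue interrupts are
 * available) the interrupt handler. Returns 0 on success or ENOMEM after
 * tearing down any partial allocation.
 */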
517 int
518 nvme_qpair_construct(struct nvme_qpair *qpair,
519 uint32_t num_entries, uint32_t num_trackers,
520 struct nvme_controller *ctrlr)
521 {
522 struct nvme_tracker *tr;
523 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
524 uint64_t queuemem_phys, prpmem_phys, list_phys;
525 uint8_t *queuemem, *prpmem, *prp_list;
526 int i, err;
527
528 qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
529 qpair->num_entries = num_entries;
530 qpair->num_trackers = num_trackers;
531 qpair->ctrlr = ctrlr;
532
533 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
534 mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
535
536 callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
537 qpair->timer_armed = false;
538 qpair->recovery_state = RECOVERY_WAITING;
539
540 /* Note: NVMe PRP format is restricted to 4-byte alignment. */
541 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
542 4, ctrlr->page_size, BUS_SPACE_MAXADDR,
543 BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
544 howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
545 ctrlr->page_size, 0,
546 NULL, NULL, &qpair->dma_tag_payload);
547 if (err != 0) {
548 nvme_printf(ctrlr, "payload tag create failed %d\n", err);
549 goto out;
550 }
551
552 /*
553 * Each component must be page aligned, and individual PRP lists
554 * cannot cross a page boundary.
555 */
556 cmdsz = qpair->num_entries * sizeof(struct nvme_command);
557 cmdsz = roundup2(cmdsz, ctrlr->page_size);
558 cplsz = qpair->num_entries * sizeof(struct nvme_completion);
559 cplsz = roundup2(cplsz, ctrlr->page_size);
560 /*
561 * For commands requiring more than 2 PRP entries, one PRP will be
562 * embedded in the command (prp1), and the rest of the PRP entries
563 * will be in a list pointed to by the command (prp2).
564 */
565 prpsz = sizeof(uint64_t) *
566 howmany(ctrlr->max_xfer_size, ctrlr->page_size);
567 prpmemsz = qpair->num_trackers * prpsz;
568 allocsz = cmdsz + cplsz + prpmemsz;
569
570 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
571 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
572 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
573 if (err != 0) {
574 nvme_printf(ctrlr, "tag create failed %d\n", err);
575 goto out;
576 }
577 bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
578
579 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
580 BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
581 nvme_printf(ctrlr, "failed to alloc qpair memory\n");
582 goto out;
583 }
584
585 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
586 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
587 nvme_printf(ctrlr, "failed to load qpair memory\n");
588 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
589 qpair->queuemem_map);
590 goto out;
591 }
592
593 qpair->num_cmds = 0;
594 qpair->num_intr_handler_calls = 0;
595 qpair->num_retries = 0;
596 qpair->num_failures = 0;
597 qpair->num_ignored = 0;
598 qpair->cmd = (struct nvme_command *)queuemem;
599 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
600 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
601 qpair->cmd_bus_addr = queuemem_phys;
602 qpair->cpl_bus_addr = queuemem_phys + cmdsz;
603 prpmem_phys = queuemem_phys + cmdsz + cplsz;
604
605 /*
606 * Calculate the doorbell register offsets using the doorbell stride.
607 * Many emulators set the stride to correspond to a cache line. However,
608 * some hardware has set it to various small values.
609 */
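/*
 * For example, assuming the common 4-byte doorbell stride (ctrlr->dstrd
 * == 2 with this encoding), the admin queue (id 0) uses offsets
 * 0x1000/0x1004 and I/O queue 1 uses 0x1008/0x100c for its SQ tail and
 * CQ head doorbells.
 */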
610 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
611 (qpair->id << (ctrlr->dstrd + 1));
612 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
613 (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
614
615 TAILQ_INIT(&qpair->free_tr);
616 TAILQ_INIT(&qpair->outstanding_tr);
617 STAILQ_INIT(&qpair->queued_req);
618
619 list_phys = prpmem_phys;
620 prp_list = prpmem;
621 for (i = 0; i < qpair->num_trackers; i++) {
622 if (list_phys + prpsz > prpmem_phys + prpmemsz) {
623 qpair->num_trackers = i;
624 break;
625 }
626
627 /*
628 * Make sure that the PRP list for this tracker doesn't
629 * overflow to another nvme page.
630 */
631 if (trunc_page(list_phys) !=
632 trunc_page(list_phys + prpsz - 1)) {
633 list_phys = roundup2(list_phys, ctrlr->page_size);
634 prp_list =
635 (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
636 }
637
638 tr = malloc_domainset(sizeof(*tr), M_NVME,
639 DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
640 bus_dmamap_create(qpair->dma_tag_payload, 0,
641 &tr->payload_dma_map);
642 tr->cid = i;
643 tr->qpair = qpair;
644 tr->prp = (uint64_t *)prp_list;
645 tr->prp_bus_addr = list_phys;
646 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
647 list_phys += prpsz;
648 prp_list += prpsz;
649 }
650
651 if (qpair->num_trackers == 0) {
652 nvme_printf(ctrlr, "failed to allocate enough trackers\n");
653 goto out;
654 }
655
656 qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
657 qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
658 M_ZERO | M_WAITOK);
659
660 if (ctrlr->msi_count > 1) {
661 /*
662 * MSI-X vector resource IDs start at 1, so we add one to
663 * the queue's vector to get the corresponding rid to use.
664 */
665 qpair->rid = qpair->vector + 1;
666
667 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
668 &qpair->rid, RF_ACTIVE);
669 if (qpair->res == NULL) {
670 nvme_printf(ctrlr, "unable to allocate MSI\n");
671 goto out;
672 }
673 if (bus_setup_intr(ctrlr->dev, qpair->res,
674 INTR_TYPE_MISC | INTR_MPSAFE, NULL,
675 nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
676 nvme_printf(ctrlr, "unable to setup MSI\n");
677 goto out;
678 }
679 if (qpair->id == 0) {
680 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
681 "admin");
682 } else {
683 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
684 "io%d", qpair->id - 1);
685 }
686 }
687
688 return (0);
689
690 out:
691 nvme_qpair_destroy(qpair);
692 return (ENOMEM);
693 }
694
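/*
 * Tear down everything nvme_qpair_construct() set up. Safe to call on a
 * partially constructed qpair; each resource is checked before release.
 */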
695 static void
696 nvme_qpair_destroy(struct nvme_qpair *qpair)
697 {
698 struct nvme_tracker *tr;
699
700 mtx_lock(&qpair->recovery);
701 qpair->timer_armed = false;
702 mtx_unlock(&qpair->recovery);
703 callout_drain(&qpair->timer);
704
705 if (qpair->tag) {
706 bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
707 qpair->tag = NULL;
708 }
709
710 if (qpair->act_tr) {
711 free(qpair->act_tr, M_NVME);
712 qpair->act_tr = NULL;
713 }
714
715 while (!TAILQ_EMPTY(&qpair->free_tr)) {
716 tr = TAILQ_FIRST(&qpair->free_tr);
717 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
718 bus_dmamap_destroy(qpair->dma_tag_payload,
719 tr->payload_dma_map);
720 free(tr, M_NVME);
721 }
722
723 if (qpair->cmd != NULL) {
724 bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
725 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
726 qpair->queuemem_map);
727 qpair->cmd = NULL;
728 }
729
730 if (qpair->dma_tag) {
731 bus_dma_tag_destroy(qpair->dma_tag);
732 qpair->dma_tag = NULL;
733 }
734
735 if (qpair->dma_tag_payload) {
736 bus_dma_tag_destroy(qpair->dma_tag_payload);
737 qpair->dma_tag_payload = NULL;
738 }
739
740 if (mtx_initialized(&qpair->lock))
741 mtx_destroy(&qpair->lock);
742 if (mtx_initialized(&qpair->recovery))
743 mtx_destroy(&qpair->recovery);
744
745 if (qpair->res) {
746 bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
747 rman_get_rid(qpair->res), qpair->res);
748 qpair->res = NULL;
749 }
750 }
751
752 static void
753 nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
754 {
755 struct nvme_tracker *tr;
756
757 /*
758 * nvme_qpair_complete_tracker must be called without the qpair lock
759 * held. It takes the lock to adjust the outstanding_tr list, so make
760 * sure we don't hold it yet. We need the lock to make the list traversal
761 * safe, but have to drop it to complete any AER, restarting the list
762 * scan each time we do so. The interlock with the ISR guarantees this
763 * tracker won't be completed twice.
764 */
765 mtx_assert(&qpair->lock, MA_NOTOWNED);
766
767 mtx_lock(&qpair->lock);
768 tr = TAILQ_FIRST(&qpair->outstanding_tr);
769 while (tr != NULL) {
770 if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
771 tr = TAILQ_NEXT(tr, tailq);
772 continue;
773 }
774 mtx_unlock(&qpair->lock);
775 nvme_qpair_manual_complete_tracker(tr,
776 NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
777 ERROR_PRINT_NONE);
778 mtx_lock(&qpair->lock);
779 tr = TAILQ_FIRST(&qpair->outstanding_tr);
780 }
781 mtx_unlock(&qpair->lock);
782 }
783
784 void
785 nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
786 {
787 mtx_assert(&qpair->lock, MA_NOTOWNED);
788
789 nvme_admin_qpair_abort_aers(qpair);
790 nvme_qpair_destroy(qpair);
791 }
792
793 void
794 nvme_io_qpair_destroy(struct nvme_qpair *qpair)
795 {
796 nvme_qpair_destroy(qpair);
797 }
798
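/* Completion handler for ABORT commands issued from the timeout path. */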
799 static void
800 nvme_abort_complete(void *arg, const struct nvme_completion *status)
801 {
802 struct nvme_tracker *tr = arg;
803
804 /*
805 * If cdw0 bit 0 == 1, the controller was not able to abort the command
806 * we requested. We still need to check the active tracker array, to
807 * cover the race where the I/O timed out just as the controller was
808 * completing it. An abort command is always on the admin queue but
809 * affects either an admin or an I/O queue, so take the appropriate
810 * qpair lock for the original command's queue; we need it to avoid
811 * races with the completion code and to complete the command manually.
812 */
813 mtx_lock(&tr->qpair->lock);
814 if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
815 /*
816 * An I/O has timed out, and the controller was unable to abort
817 * it for some reason. And we've not processed a completion for
818 * it yet. Construct a fake completion status, and then complete
819 * the I/O's tracker manually.
820 */
821 nvme_printf(tr->qpair->ctrlr,
822 "abort command failed, aborting command manually\n");
823 nvme_qpair_manual_complete_tracker(tr,
824 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
825 }
826 /*
827 * XXX We don't check status for the possible 'Could not abort because
828 * excess aborts were submitted to the controller'. We don't prevent
829 * that, either. Document for the future here, since the standard is
830 * squishy and only says 'may generate' but implies anything is possible
831 * including hangs if you exceed the ACL.
832 */
833 mtx_unlock(&tr->qpair->lock);
834 }
835
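/*
 * Watchdog callout, run every half second while commands are outstanding.
 * Detects dead or wedged controllers and timed-out commands, and either
 * aborts the offending command or resets the controller.
 */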
836 static void
837 nvme_qpair_timeout(void *arg)
838 {
839 struct nvme_qpair *qpair = arg;
840 struct nvme_controller *ctrlr = qpair->ctrlr;
841 struct nvme_tracker *tr;
842 sbintime_t now;
843 bool idle = true;
844 bool is_admin = qpair == &ctrlr->adminq;
845 bool fast;
846 uint32_t csts;
847 uint8_t cfs;
848
849 mtx_assert(&qpair->recovery, MA_OWNED);
850
851 /*
852 * If the controller is failed, then stop polling. This ensures that any
853 * failure processing that races with the qpair timeout will fail
854 * safely.
855 */
856 if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
857 nvme_printf(qpair->ctrlr,
858 "%sFailed controller, stopping watchdog timeout.\n",
859 is_admin ? "Complete " : "");
860 qpair->timer_armed = false;
861 return;
862 }
863
864 /*
865 * Shutdown condition: We set qpair->timer_armed to false in
866 * nvme_qpair_destroy before calling callout_drain. When we call that,
867 * this routine might get called one last time. Exit w/o setting a
868 * timeout. None of the watchdog stuff needs to be done since we're
869 * destroying the qpair.
870 */
871 if (!qpair->timer_armed) {
872 nvme_printf(qpair->ctrlr,
873 "Timeout fired during nvme_qpair_destroy\n");
874 return;
875 }
876
877 switch (qpair->recovery_state) {
878 case RECOVERY_NONE:
879 /*
880 * Read csts to get the controller fatal status (cfs) bit. If
881 * the controller has been hot-unplugged or has failed, proceed
882 * directly to reset: we bail early if the register reads all
883 * 1's or the controller fatal status bit is set. The latter is
884 * always true when the former is true, but not vice versa. The
885 * intent is that if the card is gone (all 1's) or has failed,
886 * we try a reset (which sometimes unwedges a card that reads
887 * all 1's but hasn't actually gone away, though usually it
888 * doesn't).
889 */
890 csts = nvme_mmio_read_4(ctrlr, csts);
891 cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
892 if (csts == NVME_GONE || cfs == 1) {
893 /*
894 * We've had a command timeout that we weren't able to
895 * abort or we have aborts disabled and any command
896 * timed out.
897 *
898 * If we get here due to a possible surprise hot-unplug
899 * event, then we let nvme_ctrlr_reset confirm and fail
900 * the controller.
901 */
902 do_reset:
903 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
904 (csts == 0xffffffff) ? " and possible hot unplug" :
905 (cfs ? " and fatal error status" : ""));
906 qpair->recovery_state = RECOVERY_WAITING;
907 nvme_ctrlr_reset(ctrlr);
908 idle = false;
909 break;
910 }
911
912
913 /*
914 * See if there's any recovery needed. First, do a fast check to
915 * see if anything could have timed out. If not, then skip
916 * everything else.
917 */
918 fast = false;
919 mtx_lock(&qpair->lock);
920 now = getsbinuptime();
921 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
922 /*
923 * Skip async commands; they are posted to the card for
924 * an indefinite amount of time and have no deadline.
925 */
926 if (tr->deadline == SBT_MAX)
927 continue;
928
929 /*
930 * If the first real transaction is not in timeout, then
931 * we're done. Otherwise, we try recovery.
932 */
933 idle = false;
934 if (now <= tr->deadline)
935 fast = true;
936 break;
937 }
938 mtx_unlock(&qpair->lock);
939 if (idle || fast)
940 break;
941
942 /*
943 * There's a stale transaction at the start of the queue whose
944 * deadline has passed. Poll the completions as a last-ditch
945 * effort in case an interrupt has been missed. If completions
946 * were found, warn the user of possible interrupt issues, but
947 * only once per controller.
948 */
949 if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
950 nvme_printf(ctrlr, "System interrupt issues?\n");
951 ctrlr->isr_warned = true;
952 }
953
954 /*
955 * Now that we've run the ISR, re-check to see if there are any
956 * timed out commands; abort them or reset the card if so.
957 */
958 mtx_lock(&qpair->lock);
959 idle = true;
960 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
961 /*
962 * Skip async commands, they are posted to the card for
963 * an indefinite amount of time and have no deadline.
964 */
965 if (tr->deadline == SBT_MAX)
966 continue;
967
968 /*
969 * If we know this tracker hasn't timed out, we also
970 * know all subsequent ones haven't timed out. The tr
971 * queue is in submission order and all normal commands
972 * in a queue have the same timeout (or the timeout was changed
973 * by the user, in which case it will still expire eventually).
974 */
975 idle = false;
976 if (now <= tr->deadline)
977 break;
978
979 /*
980 * Timeout expired; abort the command or reset the controller.
981 */
982 if (ctrlr->enable_aborts &&
983 tr->req->cb_fn != nvme_abort_complete) {
984 /*
985 * This isn't an abort command, so ask for a
986 * hardware abort. This goes to the admin
987 * queue which will reset the card if it
988 * times out.
989 */
990 nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
991 nvme_abort_complete, tr);
992 } else {
993 /*
994 * We have a live command in the card (either
995 * one we couldn't abort, or aborts weren't
996 * enabled). We can only reset.
997 */
998 mtx_unlock(&qpair->lock);
999 goto do_reset;
1000 }
1001 }
1002 mtx_unlock(&qpair->lock);
1003 break;
1004
1005 case RECOVERY_WAITING:
1006 /*
1007 * These messages aren't interesting while we're suspended. We
1008 * put the queues into waiting state while
1009 * suspending. Suspending takes a while, so we'll see these
1010 * during that time and they aren't diagnostic. At other times,
1011 * they indicate a problem that's worth complaining about.
1012 */
1013 if (!device_is_suspended(ctrlr->dev))
1014 nvme_printf(ctrlr, "Waiting for reset to complete\n");
1015 idle = false; /* We want to keep polling */
1016 break;
1017 }
1018
1019 /*
1020 * Rearm the timeout.
1021 */
1022 if (!idle) {
1023 callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1024 } else {
1025 qpair->timer_armed = false;
1026 }
1027 }
1028
1029 /*
1030 * Submit the tracker to the hardware. Must already be in the
1031 * outstanding queue when called.
1032 */
1033 void
1034 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
1035 {
1036 struct nvme_request *req;
1037 struct nvme_controller *ctrlr;
1038 int timeout;
1039
1040 mtx_assert(&qpair->lock, MA_OWNED);
1041
1042 req = tr->req;
1043 req->cmd.cid = tr->cid;
1044 qpair->act_tr[tr->cid] = tr;
1045 ctrlr = qpair->ctrlr;
1046
1047 if (req->timeout) {
1048 if (req->cb_fn == nvme_completion_poll_cb)
1049 timeout = 1;
1050 else if (qpair->id == 0)
1051 timeout = ctrlr->admin_timeout_period;
1052 else
1053 timeout = ctrlr->timeout_period;
1054 tr->deadline = getsbinuptime() + timeout * SBT_1S;
1055 if (!qpair->timer_armed) {
1056 qpair->timer_armed = true;
1057 callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
1058 nvme_qpair_timeout, qpair, qpair->cpu, 0);
1059 }
1060 } else
1061 tr->deadline = SBT_MAX;
1062
1063 /* Copy the command from the tracker to the submission queue. */
1064 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
1065
1066 if (++qpair->sq_tail == qpair->num_entries)
1067 qpair->sq_tail = 0;
1068
1069 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
1070 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1071 bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
1072 qpair->sq_tdbl_off, qpair->sq_tail);
1073 qpair->num_cmds++;
1074 }
1075
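/*
 * bus_dma callback: translate the mapped payload segments into the
 * command's PRP entries and submit the tracker to the hardware.
 */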
1076 static void
1077 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1078 {
1079 struct nvme_tracker *tr = arg;
1080 uint32_t cur_nseg;
1081
1082 /*
1083 * If the mapping operation failed, return immediately. The caller
1084 * is responsible for detecting the error status and failing the
1085 * tracker manually.
1086 */
1087 if (error != 0) {
1088 nvme_printf(tr->qpair->ctrlr,
1089 "nvme_payload_map err %d\n", error);
1090 return;
1091 }
1092
1093 /*
1094 * Note that we specified ctrlr->page_size for alignment and max
1095 * segment size when creating the bus dma tags. So here we can safely
1096 * just transfer each segment to its associated PRP entry.
1097 */
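/*
 * For example, a four-segment transfer ends up as: prp1 = the first
 * segment's address, prp2 = the physical address of the per-tracker
 * PRP list, and the list holds the remaining three segment addresses.
 */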
1098 tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1099
1100 if (nseg == 2) {
1101 tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1102 } else if (nseg > 2) {
1103 cur_nseg = 1;
1104 tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1105 while (cur_nseg < nseg) {
1106 tr->prp[cur_nseg-1] =
1107 htole64((uint64_t)seg[cur_nseg].ds_addr);
1108 cur_nseg++;
1109 }
1110 } else {
1111 /*
1112 * prp2 should not be used by the controller
1113 * since there is only one segment, but set
1114 * to 0 just to be safe.
1115 */
1116 tr->req->cmd.prp2 = 0;
1117 }
1118
1119 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1120 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1121 nvme_qpair_submit_tracker(tr->qpair, tr);
1122 }
1123
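/*
 * Submit a request on a qpair whose lock is already held; queues the
 * request if no tracker is free or the qpair is recovering.
 */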
1124 static void
1125 _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1126 {
1127 struct nvme_tracker *tr;
1128 int err = 0;
1129 bool is_admin = qpair == &qpair->ctrlr->adminq;
1130
1131 mtx_assert(&qpair->lock, MA_OWNED);
1132
1133 tr = TAILQ_FIRST(&qpair->free_tr);
1134 req->qpair = qpair;
1135
1136 /*
1137 * The controller has failed, so fail the request. Note that this races
1138 * the recovery / timeout code. Since we hold the qpair lock, we know
1139 * it's safe to fail directly. is_failed is set when we fail the
1140 * controller. It is only ever reset in the ioctl reset controller
1141 * path, which is safe to race (for failed controllers, we make no
1142 * guarantees about bringing it out of failed state relative to other
1143 * commands). We try hard to allow admin commands when the entire
1144 * controller hasn't failed, only something related to I/O queues.
1145 */
1146 if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
1147 nvme_qpair_manual_complete_request(qpair, req,
1148 NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
1149 ERROR_PRINT_NONE);
1150 return;
1151 }
1152
1153 /*
1154 * No tracker is available, or the qpair is disabled due to an
1155 * in-progress controller-level reset. If we lose the race with
1156 * recovery_state, then we may add an extra request to the queue which
1157 * will be resubmitted later. We only set recovery_state to NONE with
1158 * qpair->lock also held, so if we observe that the state is not NONE,
1159 * we know it won't transition back to NONE without retrying queued
1160 * requests.
1161 */
1162 if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
1163 STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1164 return;
1165 }
1166
1167 TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
1168 TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
1169 tr->deadline = SBT_MAX;
1170 tr->req = req;
1171
1172 if (!req->payload_valid) {
1173 nvme_qpair_submit_tracker(tr->qpair, tr);
1174 return;
1175 }
1176
1177 /*
1178 * tr->deadline is updated when nvme_payload_map calls
1179 * nvme_qpair_submit_tracker (we call it directly above
1180 * when there's no map to load).
1181 */
1182 err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
1183 tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
1184 if (err != 0) {
1185 /*
1186 * The dmamap operation failed, so we manually fail the
1187 * tracker here with DATA_TRANSFER_ERROR status.
1188 *
1189 * nvme_qpair_manual_complete_tracker must not be called
1190 * with the qpair lock held.
1191 */
1192 nvme_printf(qpair->ctrlr,
1193 "bus_dmamap_load_mem returned 0x%x!\n", err);
1194 mtx_unlock(&qpair->lock);
1195 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1196 NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
1197 mtx_lock(&qpair->lock);
1198 }
1199 }
1200
1201 void
1202 nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1203 {
1204 mtx_lock(&qpair->lock);
1205 _nvme_qpair_submit_request(qpair, req);
1206 mtx_unlock(&qpair->lock);
1207 }
1208
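/* Mark the qpair usable again; both qpair locks are held by the caller. */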
1209 static void
1210 nvme_qpair_enable(struct nvme_qpair *qpair)
1211 {
1212 bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
1213
1214 if (mtx_initialized(&qpair->recovery))
1215 mtx_assert(&qpair->recovery, MA_OWNED);
1216 if (mtx_initialized(&qpair->lock))
1217 mtx_assert(&qpair->lock, MA_OWNED);
1218 KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
1219 ("Enabling a failed qpair\n"));
1220
1221 qpair->recovery_state = RECOVERY_NONE;
1222 }
1223
1224 void
1225 nvme_qpair_reset(struct nvme_qpair *qpair)
1226 {
1227 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1228
1229 /*
1230 * The first time through the completion queue, the hardware will
1231 * set the phase bit on completions to 1. So set this to 1 here,
1232 * indicating we're looking for a 1 to know which entries have
1233 * completed. We'll toggle the bit each time the completion queue
1234 * rolls over.
1235 */
1236 qpair->phase = 1;
1237
1238 memset(qpair->cmd, 0,
1239 qpair->num_entries * sizeof(struct nvme_command));
1240 memset(qpair->cpl, 0,
1241 qpair->num_entries * sizeof(struct nvme_completion));
1242 }
1243
1244 void
1245 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1246 {
1247 struct nvme_tracker *tr;
1248 struct nvme_tracker *tr_temp;
1249 bool rpt;
1250
1251 /*
1252 * Manually abort each outstanding admin command. Do not retry
1253 * admin commands found here, since they will be left over from
1254 * a controller reset and it's likely the context in which the
1255 * command was issued no longer applies.
1256 */
1257 rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
1258 if (rpt)
1259 nvme_printf(qpair->ctrlr,
1260 "aborting outstanding admin command\n");
1261 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1262 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1263 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1264 }
1265 if (rpt)
1266 nvme_printf(qpair->ctrlr,
1267 "done aborting outstanding admin\n");
1268
1269 mtx_lock(&qpair->recovery);
1270 mtx_lock(&qpair->lock);
1271 nvme_qpair_enable(qpair);
1272 mtx_unlock(&qpair->lock);
1273 mtx_unlock(&qpair->recovery);
1274 }
1275
1276 void
1277 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1278 {
1279 STAILQ_HEAD(, nvme_request) temp;
1280 struct nvme_tracker *tr;
1281 struct nvme_tracker *tr_temp;
1282 struct nvme_request *req;
1283 bool report;
1284
1285 /*
1286 * Manually abort each outstanding I/O. This normally results in a
1287 * retry, unless the retry count on the associated request has
1288 * reached its limit.
1289 */
1290 report = !TAILQ_EMPTY(&qpair->outstanding_tr);
1291 if (report)
1292 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1293 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1294 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1295 NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1296 }
1297 if (report)
1298 nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
1299
1300 mtx_lock(&qpair->recovery);
1301 mtx_lock(&qpair->lock);
1302 nvme_qpair_enable(qpair);
1303
1304 STAILQ_INIT(&temp);
1305 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1306
1307 report = !STAILQ_EMPTY(&temp);
1308 if (report)
1309 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1310 while (!STAILQ_EMPTY(&temp)) {
1311 req = STAILQ_FIRST(&temp);
1312 STAILQ_REMOVE_HEAD(&temp, stailq);
1313 nvme_qpair_print_command(qpair, &req->cmd);
1314 _nvme_qpair_submit_request(qpair, req);
1315 }
1316 if (report)
1317 nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
1318
1319 mtx_unlock(&qpair->lock);
1320 mtx_unlock(&qpair->recovery);
1321 }
1322
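/*
 * Put the qpair into the recovery-waiting state and clear per-tracker
 * deadlines so the watchdog doesn't time out commands while the
 * controller is being reset.
 */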
1323 static void
1324 nvme_qpair_disable(struct nvme_qpair *qpair)
1325 {
1326 struct nvme_tracker *tr, *tr_temp;
1327
1328 if (mtx_initialized(&qpair->recovery))
1329 mtx_assert(&qpair->recovery, MA_OWNED);
1330 if (mtx_initialized(&qpair->lock))
1331 mtx_assert(&qpair->lock, MA_OWNED);
1332
1333 qpair->recovery_state = RECOVERY_WAITING;
1334 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1335 tr->deadline = SBT_MAX;
1336 }
1337 }
1338
1339 void
1340 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
1341 {
1342 mtx_lock(&qpair->recovery);
1343
1344 mtx_lock(&qpair->lock);
1345 nvme_qpair_disable(qpair);
1346 mtx_unlock(&qpair->lock);
1347
1348 nvme_admin_qpair_abort_aers(qpair);
1349
1350 mtx_unlock(&qpair->recovery);
1351 }
1352
1353 void
1354 nvme_io_qpair_disable(struct nvme_qpair *qpair)
1355 {
1356 mtx_lock(&qpair->recovery);
1357 mtx_lock(&qpair->lock);
1358
1359 nvme_qpair_disable(qpair);
1360
1361 mtx_unlock(&qpair->lock);
1362 mtx_unlock(&qpair->recovery);
1363 }
1364
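/*
 * Fail every queued request and outstanding tracker with "aborted by
 * request" status and the DNR bit set so nothing is retried.
 */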
1365 void
1366 nvme_qpair_fail(struct nvme_qpair *qpair)
1367 {
1368 struct nvme_tracker *tr;
1369 struct nvme_request *req;
1370
1371 if (!mtx_initialized(&qpair->lock))
1372 return;
1373
1374 mtx_lock(&qpair->lock);
1375
1376 if (!STAILQ_EMPTY(&qpair->queued_req)) {
1377 nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1378 }
1379 while (!STAILQ_EMPTY(&qpair->queued_req)) {
1380 req = STAILQ_FIRST(&qpair->queued_req);
1381 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1382 mtx_unlock(&qpair->lock);
1383 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1384 NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
1385 mtx_lock(&qpair->lock);
1386 }
1387
1388 if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1389 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1390 }
1391 /* Manually abort each outstanding I/O. */
1392 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1393 tr = TAILQ_FIRST(&qpair->outstanding_tr);
1394 /*
1395 * Do not remove the tracker. The abort_tracker path will
1396 * do that for us.
1397 */
1398 mtx_unlock(&qpair->lock);
1399 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1400 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1401 mtx_lock(&qpair->lock);
1402 }
1403
1404 mtx_unlock(&qpair->lock);
1405 }
1406