1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2012-2014 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/bus.h>
31 #include <sys/conf.h>
32 #include <sys/domainset.h>
33 #include <sys/proc.h>
34 #include <sys/sbuf.h>
35
36 #include <dev/pci/pcivar.h>
37
38 #include "nvme_private.h"
39
40 typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
41 #define DO_NOT_RETRY 1
42
43 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
44 struct nvme_request *req);
45 static void nvme_qpair_destroy(struct nvme_qpair *qpair);
46
47 static const char *
get_opcode_string(bool admin,uint8_t opc,char * buf,size_t len)48 get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
49 {
50 struct sbuf sb;
51
52 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
53 nvme_opcode_sbuf(admin, opc, &sb);
54 if (sbuf_finish(&sb) != 0)
55 return ("");
56 return (buf);
57 }
58
/*
 * Log a one-line summary (opcode name, sqid, cid, nsid, cdw10/cdw11) of a
 * command submitted on the admin queue.
 */
static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];	/* scratch space for the opcode name */

	nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
	    "cdw10:%08x cdw11:%08x\n",
	    get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
	    cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
	    le32toh(cmd->cdw11));
}
71
/*
 * Log a one-line summary of a command submitted on an I/O queue.  For
 * LBA-based opcodes, also decode the starting LBA (cdw11:cdw10) and the
 * 0's-based length field (low 16 bits of cdw12, plus one).
 */
static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{
	char buf[64];	/* scratch space for the opcode name */

	switch (cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_VERIFY:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
		    "lba:%llu len:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid),
		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
		break;
	default:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
		    qpair->id, cmd->cid, le32toh(cmd->nsid));
		break;
	}
}
99
100 void
nvme_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)101 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
102 {
103 if (qpair->id == 0)
104 nvme_admin_qpair_print_command(qpair, cmd);
105 else
106 nvme_io_qpair_print_command(qpair, cmd);
107 if (nvme_verbose_cmd_dump) {
108 nvme_printf(qpair->ctrlr,
109 "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
110 cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
111 (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
112 nvme_printf(qpair->ctrlr,
113 "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
114 cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
115 cmd->cdw15);
116 }
117 }
118
119 static const char *
get_status_string(const struct nvme_completion * cpl,char * buf,size_t len)120 get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
121 {
122 struct sbuf sb;
123
124 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
125 nvme_sc_sbuf(cpl, &sb);
126 if (sbuf_finish(&sb) != 0)
127 return ("");
128 return (buf);
129 }
130
131 void
nvme_qpair_print_completion(struct nvme_qpair * qpair,struct nvme_completion * cpl)132 nvme_qpair_print_completion(struct nvme_qpair *qpair,
133 struct nvme_completion *cpl)
134 {
135 char buf[64];
136 uint8_t crd, m, dnr, p;
137
138 crd = NVME_STATUS_GET_CRD(cpl->status);
139 m = NVME_STATUS_GET_M(cpl->status);
140 dnr = NVME_STATUS_GET_DNR(cpl->status);
141 p = NVME_STATUS_GET_P(cpl->status);
142
143 nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
144 "sqid:%d cid:%d cdw0:%x\n",
145 get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
146 cpl->sqid, cpl->cid, cpl->cdw0);
147 }
148
149 static bool
nvme_completion_is_retry(const struct nvme_completion * cpl)150 nvme_completion_is_retry(const struct nvme_completion *cpl)
151 {
152 uint8_t sct, sc, dnr;
153
154 sct = NVME_STATUS_GET_SCT(cpl->status);
155 sc = NVME_STATUS_GET_SC(cpl->status);
156 dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */
157
158 /*
159 * TODO: spec is not clear how commands that are aborted due
160 * to TLER will be marked. So for now, it seems
161 * NAMESPACE_NOT_READY is the only case where we should
162 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
163 * set the DNR bit correctly since the driver controls that.
164 */
165 switch (sct) {
166 case NVME_SCT_GENERIC:
167 switch (sc) {
168 case NVME_SC_ABORTED_BY_REQUEST:
169 case NVME_SC_NAMESPACE_NOT_READY:
170 if (dnr)
171 return (0);
172 else
173 return (1);
174 case NVME_SC_INVALID_OPCODE:
175 case NVME_SC_INVALID_FIELD:
176 case NVME_SC_COMMAND_ID_CONFLICT:
177 case NVME_SC_DATA_TRANSFER_ERROR:
178 case NVME_SC_ABORTED_POWER_LOSS:
179 case NVME_SC_INTERNAL_DEVICE_ERROR:
180 case NVME_SC_ABORTED_SQ_DELETION:
181 case NVME_SC_ABORTED_FAILED_FUSED:
182 case NVME_SC_ABORTED_MISSING_FUSED:
183 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
184 case NVME_SC_COMMAND_SEQUENCE_ERROR:
185 case NVME_SC_LBA_OUT_OF_RANGE:
186 case NVME_SC_CAPACITY_EXCEEDED:
187 default:
188 return (0);
189 }
190 case NVME_SCT_COMMAND_SPECIFIC:
191 case NVME_SCT_MEDIA_ERROR:
192 return (0);
193 case NVME_SCT_PATH_RELATED:
194 switch (sc) {
195 case NVME_SC_INTERNAL_PATH_ERROR:
196 if (dnr)
197 return (0);
198 else
199 return (1);
200 default:
201 return (0);
202 }
203 case NVME_SCT_VENDOR_SPECIFIC:
204 default:
205 return (0);
206 }
207 }
208
/*
 * Finish processing of a tracker whose command has completed (for real or
 * manually).  Decides whether to retry, prints errors per print_on_error,
 * invokes the request's callback, and either resubmits the tracker or
 * returns it to the free list and kicks any queued request.
 *
 * Must be called WITHOUT qpair->lock held; the callback runs unlocked and
 * the lock is taken internally for the list manipulation.
 */
static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair *qpair = tr->qpair;
	struct nvme_request *req;
	bool retry, error, retriable;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	req = tr->req;
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	/* Retry only while the per-request retry budget isn't exhausted. */
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	/* Count as a failure once a retriable error runs out of retries. */
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	/* Tracker is no longer active; the cid slot may be reused. */
	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			/* Make the DMA'd payload visible to the CPU. */
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		/* Callback runs unlocked, before we take qpair->lock. */
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}
280
281 static uint32_t
nvme_qpair_make_status(uint32_t sct,uint32_t sc,uint32_t dnr)282 nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
283 {
284 uint32_t status = 0;
285
286 status |= NVMEF(NVME_STATUS_SCT, sct);
287 status |= NVMEF(NVME_STATUS_SC, sc);
288 status |= NVMEF(NVME_STATUS_DNR, dnr);
289 /* M=0 : this is artificial so no data in error log page */
290 /* CRD=0 : this is artificial and no delayed retry support anyway */
291 /* P=0 : phase not checked */
292 return (status);
293 }
294
/*
 * Complete a tracker from software (timeout, abort, queue teardown) by
 * fabricating a completion entry with the given status and handing it to
 * the normal completion path.  Must be called without qpair->lock held,
 * as nvme_qpair_complete_tracker takes it.
 */
static void
nvme_qpair_manual_complete_tracker(
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion	cpl;
	struct nvme_qpair * qpair = tr->qpair;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	memset(&cpl, 0, sizeof(cpl));

	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}
312
/*
 * Fail a request that was never submitted to the hardware (so it has no
 * tracker): fabricate a completion with the given status, optionally log
 * it, run the callback, and free the request.
 */
static void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion	cpl;
	bool			error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
	error = nvme_completion_is_error(&cpl);

	if (error && print_on_error == ERROR_PRINT_ALL) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_free_request(req);
}
336
/*
 * Locked version of the completion processor: drain all new entries from
 * the completion queue, completing the associated trackers, then ring the
 * CQ head doorbell once.  Returns true if any completion was processed.
 * Caller must hold qpair->recovery.
 */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;
	struct nvme_completion	cpl;
	bool done = false;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * qpair is not enabled, likely because a controller reset is in
	 * progress.  Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete.  Any
	 * pending completions for when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization. After we reset the hardware, the phase
	 * is defined to be 1. So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running. With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions leading
	 * to a KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point.  If
	 * we're called during a panic, complete the sq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2. Also cope with
	 * the case where we do the zero at 2, but may or may not have done the
	 * phase adjustment at step 3. The panic machinery flushes all pending
	 * memory writes, so we can make these strong ordering assumptions
	 * that would otherwise be unwise if we were racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Interrupted between steps 1 and 2: cq_head still
			 * needs to be zeroed and the phase flipped (the
			 * atomic_store_rel at 2 never ran).
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * Here we know the assignment at 2 happened, but not
			 * whether step 3 (the phase flip) did.  Recover by
			 * reading the last completion entry and setting our
			 * phase to the opposite of its phase bit, which gets
			 * us back in sync either way.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale. If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time. It also means that
		 * the phase must be the same the second time. We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		/* Re-sync before copying the now-known-complete entry. */
		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		/* Guard against a bogus cid indexing past act_tr. */
		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done = true;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error.  However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below.  Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump. We find that the tr has been set to null before
			 * calling the completion routine.  If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head. Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There's a number of races with the following (see above) when
		 * the system panics. We compensate for each one of them by
		 * using the atomic store to force strong ordering (at least when
		 * viewed in the aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;			/* 3 */
		}
	}

	if (done) {
		/* Tell the device how far we've consumed the CQ. */
		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
		    qpair->cq_hdbl_off, qpair->cq_head);
	}

	return (done);
}
485
486 bool
nvme_qpair_process_completions(struct nvme_qpair * qpair)487 nvme_qpair_process_completions(struct nvme_qpair *qpair)
488 {
489 bool done = false;
490
491 /*
492 * Interlock with reset / recovery code. This is an usually uncontended
493 * to make sure that we drain out of the ISRs before we reset the card
494 * and to prevent races with the recovery process called from a timeout
495 * context.
496 */
497 mtx_lock(&qpair->recovery);
498
499 if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
500 done = _nvme_qpair_process_completions(qpair);
501 else
502 qpair->num_recovery_nolock++; // XXX likely need to rename
503
504 mtx_unlock(&qpair->recovery);
505
506 return (done);
507 }
508
/*
 * MSI/MSI-X interrupt handler: arg is the qpair whose vector fired.
 */
static void
nvme_qpair_msi_handler(void *arg)
{
	struct nvme_qpair *qpair = arg;

	nvme_qpair_process_completions(qpair);
}
516
517 int
nvme_qpair_construct(struct nvme_qpair * qpair,uint32_t num_entries,uint32_t num_trackers,struct nvme_controller * ctrlr)518 nvme_qpair_construct(struct nvme_qpair *qpair,
519 uint32_t num_entries, uint32_t num_trackers,
520 struct nvme_controller *ctrlr)
521 {
522 struct nvme_tracker *tr;
523 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
524 uint64_t queuemem_phys, prpmem_phys, list_phys;
525 uint8_t *queuemem, *prpmem, *prp_list;
526 int i, err;
527
528 qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
529 qpair->num_entries = num_entries;
530 qpair->num_trackers = num_trackers;
531 qpair->ctrlr = ctrlr;
532
533 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
534 mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
535
536 callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
537 qpair->timer_armed = false;
538 qpair->recovery_state = RECOVERY_WAITING;
539
540 /* Note: NVMe PRP format is restricted to 4-byte alignment. */
541 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
542 4, ctrlr->page_size, BUS_SPACE_MAXADDR,
543 BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
544 howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
545 ctrlr->page_size, 0,
546 NULL, NULL, &qpair->dma_tag_payload);
547 if (err != 0) {
548 nvme_printf(ctrlr, "payload tag create failed %d\n", err);
549 goto out;
550 }
551
552 /*
553 * Each component must be page aligned, and individual PRP lists
554 * cannot cross a page boundary.
555 */
556 cmdsz = qpair->num_entries * sizeof(struct nvme_command);
557 cmdsz = roundup2(cmdsz, ctrlr->page_size);
558 cplsz = qpair->num_entries * sizeof(struct nvme_completion);
559 cplsz = roundup2(cplsz, ctrlr->page_size);
560 /*
561 * For commands requiring more than 2 PRP entries, one PRP will be
562 * embedded in the command (prp1), and the rest of the PRP entries
563 * will be in a list pointed to by the command (prp2).
564 */
565 prpsz = sizeof(uint64_t) *
566 howmany(ctrlr->max_xfer_size, ctrlr->page_size);
567 prpmemsz = qpair->num_trackers * prpsz;
568 allocsz = cmdsz + cplsz + prpmemsz;
569
570 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
571 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
572 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
573 if (err != 0) {
574 nvme_printf(ctrlr, "tag create failed %d\n", err);
575 goto out;
576 }
577 bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
578
579 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
580 BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
581 nvme_printf(ctrlr, "failed to alloc qpair memory\n");
582 goto out;
583 }
584
585 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
586 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
587 nvme_printf(ctrlr, "failed to load qpair memory\n");
588 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
589 qpair->queuemem_map);
590 goto out;
591 }
592
593 qpair->num_cmds = 0;
594 qpair->num_intr_handler_calls = 0;
595 qpair->num_retries = 0;
596 qpair->num_failures = 0;
597 qpair->num_ignored = 0;
598 qpair->cmd = (struct nvme_command *)queuemem;
599 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
600 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
601 qpair->cmd_bus_addr = queuemem_phys;
602 qpair->cpl_bus_addr = queuemem_phys + cmdsz;
603 prpmem_phys = queuemem_phys + cmdsz + cplsz;
604
605 /*
606 * Calcuate the stride of the doorbell register. Many emulators set this
607 * value to correspond to a cache line. However, some hardware has set
608 * it to various small values.
609 */
610 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
611 (qpair->id << (ctrlr->dstrd + 1));
612 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
613 (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
614
615 TAILQ_INIT(&qpair->free_tr);
616 TAILQ_INIT(&qpair->outstanding_tr);
617 STAILQ_INIT(&qpair->queued_req);
618
619 list_phys = prpmem_phys;
620 prp_list = prpmem;
621 for (i = 0; i < qpair->num_trackers; i++) {
622 if (list_phys + prpsz > prpmem_phys + prpmemsz) {
623 qpair->num_trackers = i;
624 break;
625 }
626
627 /*
628 * Make sure that the PRP list for this tracker doesn't
629 * overflow to another nvme page.
630 */
631 if (trunc_page(list_phys) !=
632 trunc_page(list_phys + prpsz - 1)) {
633 list_phys = roundup2(list_phys, ctrlr->page_size);
634 prp_list =
635 (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
636 }
637
638 tr = malloc_domainset(sizeof(*tr), M_NVME,
639 DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
640 bus_dmamap_create(qpair->dma_tag_payload, 0,
641 &tr->payload_dma_map);
642 tr->cid = i;
643 tr->qpair = qpair;
644 tr->prp = (uint64_t *)prp_list;
645 tr->prp_bus_addr = list_phys;
646 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
647 list_phys += prpsz;
648 prp_list += prpsz;
649 }
650
651 if (qpair->num_trackers == 0) {
652 nvme_printf(ctrlr, "failed to allocate enough trackers\n");
653 goto out;
654 }
655
656 qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
657 qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
658 M_ZERO | M_WAITOK);
659
660 if (ctrlr->msi_count > 1) {
661 /*
662 * MSI-X vector resource IDs start at 1, so we add one to
663 * the queue's vector to get the corresponding rid to use.
664 */
665 qpair->rid = qpair->vector + 1;
666
667 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
668 &qpair->rid, RF_ACTIVE);
669 if (qpair->res == NULL) {
670 nvme_printf(ctrlr, "unable to allocate MSI\n");
671 goto out;
672 }
673 if (bus_setup_intr(ctrlr->dev, qpair->res,
674 INTR_TYPE_MISC | INTR_MPSAFE, NULL,
675 nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
676 nvme_printf(ctrlr, "unable to setup MSI\n");
677 goto out;
678 }
679 if (qpair->id == 0) {
680 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
681 "admin");
682 } else {
683 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
684 "io%d", qpair->id - 1);
685 }
686 }
687
688 return (0);
689
690 out:
691 nvme_qpair_destroy(qpair);
692 return (ENOMEM);
693 }
694
/*
 * Tear down a qpair constructed (possibly partially) by
 * nvme_qpair_construct: stop the timeout callout, tear down the
 * interrupt, free the tracker pool and the ring/PRP memory, and destroy
 * the locks.  Safe to call on a partially-constructed qpair; each step
 * checks whether its resource exists.
 */
static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;

	/*
	 * Clearing timer_armed under the recovery lock tells a concurrently
	 * running nvme_qpair_timeout to exit without rescheduling, so the
	 * callout_drain below terminates.
	 */
	mtx_lock(&qpair->recovery);
	qpair->timer_armed = false;
	mtx_unlock(&qpair->recovery);
	callout_drain(&qpair->timer);

	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	/* Free every tracker and its payload DMA map. */
	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	/* qpair->cmd points at the single ring/PRP allocation. */
	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);
	if (mtx_initialized(&qpair->recovery))
		mtx_destroy(&qpair->recovery);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}
751
/*
 * Manually complete every outstanding Asynchronous Event Request on the
 * admin queue with ABORTED_SQ_DELETION status (AERs have no deadline and
 * would otherwise linger forever across a queue teardown).
 */
static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker	*tr;

	/*
	 * nvme_complete_tracker must be called without the qpair lock held. It
	 * takes the lock to adjust outstanding_tr list, so make sure we don't
	 * have it yet. We need the lock to make the list traverse safe, but
	 * have to drop the lock to complete any AER. We restart the list scan
	 * when we do this to make this safe. There's interlock with the ISR so
	 * we know this tracker won't be completed twice.
	 */
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	mtx_lock(&qpair->lock);
	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			tr = TAILQ_NEXT(tr, tailq);
			continue;
		}
		/* Drop the lock to complete, then restart the scan. */
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
		    ERROR_PRINT_NONE);
		mtx_lock(&qpair->lock);
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
	}
	mtx_unlock(&qpair->lock);
}
783
/*
 * Destroy the admin qpair: abort any outstanding AERs first so their
 * trackers are returned, then perform the common teardown.
 */
void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	nvme_admin_qpair_abort_aers(qpair);
	nvme_qpair_destroy(qpair);
}
792
/*
 * Destroy an I/O qpair; no AER handling needed, just common teardown.
 */
void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{

	nvme_qpair_destroy(qpair);
}
799
/*
 * Completion callback for an ABORT admin command issued on behalf of a
 * timed-out tracker (arg).  If the controller could not abort the target
 * command and it is still outstanding, complete it manually with
 * ABORTED_BY_REQUEST status.
 */
static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
	struct nvme_tracker     *tr = arg;

	/*
	 * If cdw0 bit 0 == 1, the controller was not able to abort the command
	 * we requested.  We still need to check the active tracker array, to
	 * cover race where I/O timed out at same time controller was completing
	 * the I/O. An abort command always is on the admin queue, but affects
	 * either an admin or an I/O queue, so take the appropriate qpair lock
	 * for the original command's queue, since we'll need it to avoid races
	 * with the completion code and to complete the command manually.
	 */
	mtx_lock(&tr->qpair->lock);
	if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
		/*
		 * An I/O has timed out, and the controller was unable to abort
		 * it for some reason.  And we've not processed a completion for
		 * it yet. Construct a fake completion status, and then complete
		 * the I/O's tracker manually.
		 */
		nvme_printf(tr->qpair->ctrlr,
		    "abort command failed, aborting command manually\n");
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
	}
	/*
	 * XXX We don't check status for the possible 'Could not abort because
	 * excess aborts were submitted to the controller'. We don't prevent
	 * that, either. Document for the future here, since the standard is
	 * squishy and only says 'may generate' but implies anything is possible
	 * including hangs if you exceed the ACL.
	 */
	mtx_unlock(&tr->qpair->lock);
}
836
837 static void
nvme_qpair_timeout(void * arg)838 nvme_qpair_timeout(void *arg)
839 {
840 struct nvme_qpair *qpair = arg;
841 struct nvme_controller *ctrlr = qpair->ctrlr;
842 struct nvme_tracker *tr;
843 sbintime_t now;
844 bool idle = true;
845 bool is_admin = qpair == &ctrlr->adminq;
846 bool fast;
847 uint32_t csts;
848 uint8_t cfs;
849
850 mtx_assert(&qpair->recovery, MA_OWNED);
851
852 /*
853 * If the controller is failed, then stop polling. This ensures that any
854 * failure processing that races with the qpair timeout will fail
855 * safely.
856 */
857 if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
858 nvme_printf(qpair->ctrlr,
859 "%sFailed controller, stopping watchdog timeout.\n",
860 is_admin ? "Complete " : "");
861 qpair->timer_armed = false;
862 return;
863 }
864
865 /*
866 * Shutdown condition: We set qpair->timer_armed to false in
867 * nvme_qpair_destroy before calling callout_drain. When we call that,
868 * this routine might get called one last time. Exit w/o setting a
869 * timeout. None of the watchdog stuff needs to be done since we're
870 * destroying the qpair.
871 */
872 if (!qpair->timer_armed) {
873 nvme_printf(qpair->ctrlr,
874 "Timeout fired during nvme_qpair_destroy\n");
875 return;
876 }
877
878 switch (qpair->recovery_state) {
879 case RECOVERY_NONE:
880 /*
881 * Read csts to get value of cfs - controller fatal status. If
882 * we are in the hot-plug or controller failed status proceed
883 * directly to reset. We also bail early if the status reads all
884 * 1's or the control fatal status bit is now 1. The latter is
885 * always true when the former is true, but not vice versa. The
886 * intent of the code is that if the card is gone (all 1's) or
887 * we've failed, then try to do a reset (which someitmes
888 * unwedges a card reading all 1's that's not gone away, but
889 * usually doesn't).
890 */
891 csts = nvme_mmio_read_4(ctrlr, csts);
892 cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
893 if (csts == NVME_GONE || cfs == 1) {
894 /*
895 * We've had a command timeout that we weren't able to
896 * abort or we have aborts disabled and any command
897 * timed out.
898 *
899 * If we get here due to a possible surprise hot-unplug
900 * event, then we let nvme_ctrlr_reset confirm and fail
901 * the controller.
902 */
903 do_reset:
904 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
905 (csts == 0xffffffff) ? " and possible hot unplug" :
906 (cfs ? " and fatal error status" : ""));
907 qpair->recovery_state = RECOVERY_WAITING;
908 nvme_ctrlr_reset(ctrlr);
909 idle = false;
910 break;
911 }
912
913
914 /*
915 * See if there's any recovery needed. First, do a fast check to
916 * see if anything could have timed out. If not, then skip
917 * everything else.
918 */
919 fast = false;
920 mtx_lock(&qpair->lock);
921 now = getsbinuptime();
922 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
923 /*
924 * Skip async commands, they are posted to the card for
925 * an indefinite amount of time and have no deadline.
926 */
927 if (tr->deadline == SBT_MAX)
928 continue;
929
930 /*
931 * If the first real transaction is not in timeout, then
932 * we're done. Otherwise, we try recovery.
933 */
934 idle = false;
935 if (now <= tr->deadline)
936 fast = true;
937 break;
938 }
939 mtx_unlock(&qpair->lock);
940 if (idle || fast)
941 break;
942
943 /*
944 * There's a stale transaction at the start of the queue whose
		 * deadline has passed. Poll the completions as a last-ditch
		 * effort in case an interrupt has been missed. If any
		 * completed transactions were found, warn the user of
		 * possible interrupt issues, but only once per controller.
949 */
950 if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
951 nvme_printf(ctrlr, "System interrupt issues?\n");
952 ctrlr->isr_warned = true;
953 }
954
955 /*
		 * Now that we've run the ISR, re-check to see if there's any
957 * timed out commands and abort them or reset the card if so.
958 */
959 mtx_lock(&qpair->lock);
960 idle = true;
961 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
962 /*
963 * Skip async commands, they are posted to the card for
964 * an indefinite amount of time and have no deadline.
965 */
966 if (tr->deadline == SBT_MAX)
967 continue;
968
969 /*
970 * If we know this tracker hasn't timed out, we also
971 * know all subsequent ones haven't timed out. The tr
972 * queue is in submission order and all normal commands
973 * in a queue have the same timeout (or the timeout was
974 * changed by the user, but we eventually timeout then).
975 */
976 idle = false;
977 if (now <= tr->deadline)
978 break;
979
980 /*
981 * Timeout expired, abort it or reset controller.
982 */
983 if (ctrlr->enable_aborts &&
984 tr->req->cb_fn != nvme_abort_complete) {
985 /*
986 * This isn't an abort command, ask for a
987 * hardware abort. This goes to the admin
988 * queue which will reset the card if it
989 * times out.
990 */
991 nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
992 nvme_abort_complete, tr);
993 } else {
994 /*
995 * We have a live command in the card (either
996 * one we couldn't abort, or aborts weren't
997 * enabled). We can only reset.
998 */
999 mtx_unlock(&qpair->lock);
1000 goto do_reset;
1001 }
1002 }
1003 mtx_unlock(&qpair->lock);
1004 break;
1005
1006 case RECOVERY_WAITING:
1007 /*
1008 * These messages aren't interesting while we're suspended. We
1009 * put the queues into waiting state while
1010 * suspending. Suspending takes a while, so we'll see these
1011 * during that time and they aren't diagnostic. At other times,
1012 * they indicate a problem that's worth complaining about.
1013 */
1014 if (!device_is_suspended(ctrlr->dev))
1015 nvme_printf(ctrlr, "Waiting for reset to complete\n");
1016 idle = false; /* We want to keep polling */
1017 break;
1018 }
1019
1020 /*
1021 * Rearm the timeout.
1022 */
1023 if (!idle) {
1024 callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1025 } else {
1026 qpair->timer_armed = false;
1027 }
1028 }
1029
1030 /*
1031 * Submit the tracker to the hardware. Must already be in the
1032 * outstanding queue when called.
1033 */
1034 void
nvme_qpair_submit_tracker(struct nvme_qpair * qpair,struct nvme_tracker * tr)1035 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
1036 {
1037 struct nvme_request *req;
1038 struct nvme_controller *ctrlr;
1039 int timeout;
1040
1041 mtx_assert(&qpair->lock, MA_OWNED);
1042
1043 req = tr->req;
1044 req->cmd.cid = tr->cid;
1045 qpair->act_tr[tr->cid] = tr;
1046 ctrlr = qpair->ctrlr;
1047
1048 if (req->timeout) {
1049 if (req->cb_fn == nvme_completion_poll_cb)
1050 timeout = 1;
1051 else if (qpair->id == 0)
1052 timeout = ctrlr->admin_timeout_period;
1053 else
1054 timeout = ctrlr->timeout_period;
1055 tr->deadline = getsbinuptime() + timeout * SBT_1S;
1056 if (!qpair->timer_armed) {
1057 qpair->timer_armed = true;
1058 callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
1059 nvme_qpair_timeout, qpair, qpair->cpu, 0);
1060 }
1061 } else
1062 tr->deadline = SBT_MAX;
1063
1064 /* Copy the command from the tracker to the submission queue. */
1065 memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
1066
1067 if (++qpair->sq_tail == qpair->num_entries)
1068 qpair->sq_tail = 0;
1069
1070 bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
1071 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1072 bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
1073 qpair->sq_tdbl_off, qpair->sq_tail);
1074 qpair->num_cmds++;
1075 }
1076
1077 static void
nvme_payload_map(void * arg,bus_dma_segment_t * seg,int nseg,int error)1078 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1079 {
1080 struct nvme_tracker *tr = arg;
1081 uint32_t cur_nseg;
1082
1083 /*
1084 * If the mapping operation failed, return immediately. The caller
1085 * is responsible for detecting the error status and failing the
1086 * tracker manually.
1087 */
1088 if (error != 0) {
1089 nvme_printf(tr->qpair->ctrlr,
1090 "nvme_payload_map err %d\n", error);
1091 return;
1092 }
1093
1094 /*
1095 * Note that we specified ctrlr->page_size for alignment and max
1096 * segment size when creating the bus dma tags. So here we can safely
1097 * just transfer each segment to its associated PRP entry.
1098 */
1099 tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1100
1101 if (nseg == 2) {
1102 tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1103 } else if (nseg > 2) {
1104 cur_nseg = 1;
1105 tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1106 while (cur_nseg < nseg) {
1107 tr->prp[cur_nseg-1] =
1108 htole64((uint64_t)seg[cur_nseg].ds_addr);
1109 cur_nseg++;
1110 }
1111 } else {
1112 /*
1113 * prp2 should not be used by the controller
1114 * since there is only one segment, but set
1115 * to 0 just to be safe.
1116 */
1117 tr->req->cmd.prp2 = 0;
1118 }
1119
1120 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1121 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1122 nvme_qpair_submit_tracker(tr->qpair, tr);
1123 }
1124
/*
 * Locked portion of the submission path. Fails the request outright if
 * the controller (or its admin path) has failed, queues it when no
 * tracker is free or recovery is in progress, and otherwise pairs it
 * with a tracker and submits it, loading the payload DMA map first when
 * the request carries data.
 */
static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int err = 0;
	/* The admin queue has its own, narrower failure flag; see below. */
	bool is_admin = qpair == &qpair->ctrlr->adminq;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	/*
	 * The controller has failed, so fail the request. Note, that this races
	 * the recovery / timeout code. Since we hold the qpair lock, we know
	 * it's safe to fail directly. is_failed is set when we fail the
	 * controller. It is only ever reset in the ioctl reset controller
	 * path, which is safe to race (for failed controllers, we make no
	 * guarantees about bringing it out of failed state relative to other
	 * commands). We try hard to allow admin commands when the entire
	 * controller hasn't failed, only something related to I/O queues.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_qpair_manual_complete_request(qpair, req,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
		    ERROR_PRINT_NONE);
		return;
	}

	/*
	 * No tracker is available, or the qpair is disabled due to an
	 * in-progress controller-level reset. If we lose the race with
	 * recovery_state, then we may add an extra request to the queue which
	 * will be resubmitted later. We only set recovery_state to NONE with
	 * qpair->lock also held, so if we observe that the state is not NONE,
	 * we know it won't transition back to NONE without retrying queued
	 * requests.
	 */
	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return;
	}

	/* Claim the tracker and pair it with this request. */
	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	/* Commands with no payload go straight to the hardware. */
	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	/*
	 * tr->deadline is updated when nvme_payload_map calls
	 * nvme_qpair_submit_tracker (we call it above directly
	 * when there's no map to load).
	 */
	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}
1201
1202 void
nvme_qpair_submit_request(struct nvme_qpair * qpair,struct nvme_request * req)1203 nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1204 {
1205
1206 mtx_lock(&qpair->lock);
1207 _nvme_qpair_submit_request(qpair, req);
1208 mtx_unlock(&qpair->lock);
1209 }
1210
1211 static void
nvme_qpair_enable(struct nvme_qpair * qpair)1212 nvme_qpair_enable(struct nvme_qpair *qpair)
1213 {
1214 bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
1215
1216 if (mtx_initialized(&qpair->recovery))
1217 mtx_assert(&qpair->recovery, MA_OWNED);
1218 if (mtx_initialized(&qpair->lock))
1219 mtx_assert(&qpair->lock, MA_OWNED);
1220 KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
1221 ("Enabling a failed qpair\n"));
1222
1223 qpair->recovery_state = RECOVERY_NONE;
1224 }
1225
1226 void
nvme_qpair_reset(struct nvme_qpair * qpair)1227 nvme_qpair_reset(struct nvme_qpair *qpair)
1228 {
1229
1230 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1231
1232 /*
1233 * First time through the completion queue, HW will set phase
1234 * bit on completions to 1. So set this to 1 here, indicating
1235 * we're looking for a 1 to know which entries have completed.
1236 * we'll toggle the bit each time when the completion queue
1237 * rolls over.
1238 */
1239 qpair->phase = 1;
1240
1241 memset(qpair->cmd, 0,
1242 qpair->num_entries * sizeof(struct nvme_command));
1243 memset(qpair->cpl, 0,
1244 qpair->num_entries * sizeof(struct nvme_completion));
1245 }
1246
1247 void
nvme_admin_qpair_enable(struct nvme_qpair * qpair)1248 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1249 {
1250 struct nvme_tracker *tr;
1251 struct nvme_tracker *tr_temp;
1252 bool rpt;
1253
1254 /*
1255 * Manually abort each outstanding admin command. Do not retry
1256 * admin commands found here, since they will be left over from
1257 * a controller reset and its likely the context in which the
1258 * command was issued no longer applies.
1259 */
1260 rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
1261 if (rpt)
1262 nvme_printf(qpair->ctrlr,
1263 "aborting outstanding admin command\n");
1264 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1265 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1266 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1267 }
1268 if (rpt)
1269 nvme_printf(qpair->ctrlr,
1270 "done aborting outstanding admin\n");
1271
1272 mtx_lock(&qpair->recovery);
1273 mtx_lock(&qpair->lock);
1274 nvme_qpair_enable(qpair);
1275 mtx_unlock(&qpair->lock);
1276 mtx_unlock(&qpair->recovery);
1277 }
1278
1279 void
nvme_io_qpair_enable(struct nvme_qpair * qpair)1280 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1281 {
1282 STAILQ_HEAD(, nvme_request) temp;
1283 struct nvme_tracker *tr;
1284 struct nvme_tracker *tr_temp;
1285 struct nvme_request *req;
1286 bool report;
1287
1288 /*
1289 * Manually abort each outstanding I/O. This normally results in a
1290 * retry, unless the retry count on the associated request has
1291 * reached its limit.
1292 */
1293 report = !TAILQ_EMPTY(&qpair->outstanding_tr);
1294 if (report)
1295 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1296 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1297 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1298 NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1299 }
1300 if (report)
1301 nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
1302
1303 mtx_lock(&qpair->recovery);
1304 mtx_lock(&qpair->lock);
1305 nvme_qpair_enable(qpair);
1306
1307 STAILQ_INIT(&temp);
1308 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1309
1310 report = !STAILQ_EMPTY(&temp);
1311 if (report)
1312 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1313 while (!STAILQ_EMPTY(&temp)) {
1314 req = STAILQ_FIRST(&temp);
1315 STAILQ_REMOVE_HEAD(&temp, stailq);
1316 nvme_qpair_print_command(qpair, &req->cmd);
1317 _nvme_qpair_submit_request(qpair, req);
1318 }
1319 if (report)
1320 nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
1321
1322 mtx_unlock(&qpair->lock);
1323 mtx_unlock(&qpair->recovery);
1324 }
1325
1326 static void
nvme_qpair_disable(struct nvme_qpair * qpair)1327 nvme_qpair_disable(struct nvme_qpair *qpair)
1328 {
1329 struct nvme_tracker *tr, *tr_temp;
1330
1331 if (mtx_initialized(&qpair->recovery))
1332 mtx_assert(&qpair->recovery, MA_OWNED);
1333 if (mtx_initialized(&qpair->lock))
1334 mtx_assert(&qpair->lock, MA_OWNED);
1335
1336 qpair->recovery_state = RECOVERY_WAITING;
1337 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1338 tr->deadline = SBT_MAX;
1339 }
1340 }
1341
1342 void
nvme_admin_qpair_disable(struct nvme_qpair * qpair)1343 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
1344 {
1345 mtx_lock(&qpair->recovery);
1346
1347 mtx_lock(&qpair->lock);
1348 nvme_qpair_disable(qpair);
1349 mtx_unlock(&qpair->lock);
1350
1351 nvme_admin_qpair_abort_aers(qpair);
1352
1353 mtx_unlock(&qpair->recovery);
1354 }
1355
1356 void
nvme_io_qpair_disable(struct nvme_qpair * qpair)1357 nvme_io_qpair_disable(struct nvme_qpair *qpair)
1358 {
1359 mtx_lock(&qpair->recovery);
1360 mtx_lock(&qpair->lock);
1361
1362 nvme_qpair_disable(qpair);
1363
1364 mtx_unlock(&qpair->lock);
1365 mtx_unlock(&qpair->recovery);
1366 }
1367
1368 void
nvme_qpair_fail(struct nvme_qpair * qpair)1369 nvme_qpair_fail(struct nvme_qpair *qpair)
1370 {
1371 struct nvme_tracker *tr;
1372 struct nvme_request *req;
1373
1374 if (!mtx_initialized(&qpair->lock))
1375 return;
1376
1377 mtx_lock(&qpair->lock);
1378
1379 if (!STAILQ_EMPTY(&qpair->queued_req)) {
1380 nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1381 }
1382 while (!STAILQ_EMPTY(&qpair->queued_req)) {
1383 req = STAILQ_FIRST(&qpair->queued_req);
1384 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1385 mtx_unlock(&qpair->lock);
1386 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1387 NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
1388 mtx_lock(&qpair->lock);
1389 }
1390
1391 if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1392 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1393 }
1394 /* Manually abort each outstanding I/O. */
1395 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1396 tr = TAILQ_FIRST(&qpair->outstanding_tr);
1397 /*
1398 * Do not remove the tracker. The abort_tracker path will
1399 * do that for us.
1400 */
1401 mtx_unlock(&qpair->lock);
1402 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1403 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1404 mtx_lock(&qpair->lock);
1405 }
1406
1407 mtx_unlock(&qpair->lock);
1408 }
1409