xref: /freebsd/sys/dev/nvme/nvme_qpair.c (revision 85a6ba310f05139cdbdb2bc852e5a5fbe7975bfa)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2012-2014 Intel Corporation
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/bus.h>
31 #include <sys/conf.h>
32 #include <sys/domainset.h>
33 #include <sys/proc.h>
34 #include <sys/sbuf.h>
35 
36 #include <dev/pci/pcivar.h>
37 
38 #include "nvme_private.h"
39 
40 typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
41 #define DO_NOT_RETRY	1
42 
43 static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
44 					   struct nvme_request *req);
45 static void	nvme_qpair_destroy(struct nvme_qpair *qpair);
46 
47 static const char *
get_opcode_string(bool admin,uint8_t opc,char * buf,size_t len)48 get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
49 {
50 	struct sbuf sb;
51 
52 	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
53 	nvme_opcode_sbuf(admin, opc, &sb);
54 	if (sbuf_finish(&sb) != 0)
55 		return ("");
56 	return (buf);
57 }
58 
59 static void
nvme_admin_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)60 nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
61     struct nvme_command *cmd)
62 {
63 	char buf[64];
64 
65 	nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
66 	    "cdw10:%08x cdw11:%08x\n",
67 	    get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
68 	    cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
69 	    le32toh(cmd->cdw11));
70 }
71 
72 static void
nvme_io_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)73 nvme_io_qpair_print_command(struct nvme_qpair *qpair,
74     struct nvme_command *cmd)
75 {
76 	char buf[64];
77 
78 	switch (cmd->opc) {
79 	case NVME_OPC_WRITE:
80 	case NVME_OPC_READ:
81 	case NVME_OPC_WRITE_UNCORRECTABLE:
82 	case NVME_OPC_COMPARE:
83 	case NVME_OPC_WRITE_ZEROES:
84 	case NVME_OPC_VERIFY:
85 		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
86 		    "lba:%llu len:%d\n",
87 		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
88 		    qpair->id, cmd->cid, le32toh(cmd->nsid),
89 		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
90 		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
91 		break;
92 	default:
93 		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
94 		    get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
95 		    qpair->id, cmd->cid, le32toh(cmd->nsid));
96 		break;
97 	}
98 }
99 
100 void
nvme_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)101 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
102 {
103 	if (qpair->id == 0)
104 		nvme_admin_qpair_print_command(qpair, cmd);
105 	else
106 		nvme_io_qpair_print_command(qpair, cmd);
107 	if (nvme_verbose_cmd_dump) {
108 		nvme_printf(qpair->ctrlr,
109 		    "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
110 		    cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
111 		    (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
112 		nvme_printf(qpair->ctrlr,
113 		    "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
114 		    cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
115 		    cmd->cdw15);
116 	}
117 }
118 
119 static const char *
get_status_string(const struct nvme_completion * cpl,char * buf,size_t len)120 get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
121 {
122 	struct sbuf sb;
123 
124 	sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
125 	nvme_sc_sbuf(cpl, &sb);
126 	if (sbuf_finish(&sb) != 0)
127 		return ("");
128 	return (buf);
129 }
130 
131 void
nvme_qpair_print_completion(struct nvme_qpair * qpair,struct nvme_completion * cpl)132 nvme_qpair_print_completion(struct nvme_qpair *qpair,
133     struct nvme_completion *cpl)
134 {
135 	char buf[64];
136 	uint8_t crd, m, dnr, p;
137 
138 	crd = NVME_STATUS_GET_CRD(cpl->status);
139 	m = NVME_STATUS_GET_M(cpl->status);
140 	dnr = NVME_STATUS_GET_DNR(cpl->status);
141 	p = NVME_STATUS_GET_P(cpl->status);
142 
143 	nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
144 	    "sqid:%d cid:%d cdw0:%x\n",
145 	    get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
146 	    cpl->sqid, cpl->cid, cpl->cdw0);
147 }
148 
149 static bool
nvme_completion_is_retry(const struct nvme_completion * cpl)150 nvme_completion_is_retry(const struct nvme_completion *cpl)
151 {
152 	uint8_t sct, sc, dnr;
153 
154 	sct = NVME_STATUS_GET_SCT(cpl->status);
155 	sc = NVME_STATUS_GET_SC(cpl->status);
156 	dnr = NVME_STATUS_GET_DNR(cpl->status);	/* Do Not Retry Bit */
157 
158 	/*
159 	 * TODO: spec is not clear how commands that are aborted due
160 	 *  to TLER will be marked.  So for now, it seems
161 	 *  NAMESPACE_NOT_READY is the only case where we should
162 	 *  look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
163 	 *  set the DNR bit correctly since the driver controls that.
164 	 */
165 	switch (sct) {
166 	case NVME_SCT_GENERIC:
167 		switch (sc) {
168 		case NVME_SC_ABORTED_BY_REQUEST:
169 		case NVME_SC_NAMESPACE_NOT_READY:
170 			if (dnr)
171 				return (0);
172 			else
173 				return (1);
174 		case NVME_SC_INVALID_OPCODE:
175 		case NVME_SC_INVALID_FIELD:
176 		case NVME_SC_COMMAND_ID_CONFLICT:
177 		case NVME_SC_DATA_TRANSFER_ERROR:
178 		case NVME_SC_ABORTED_POWER_LOSS:
179 		case NVME_SC_INTERNAL_DEVICE_ERROR:
180 		case NVME_SC_ABORTED_SQ_DELETION:
181 		case NVME_SC_ABORTED_FAILED_FUSED:
182 		case NVME_SC_ABORTED_MISSING_FUSED:
183 		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
184 		case NVME_SC_COMMAND_SEQUENCE_ERROR:
185 		case NVME_SC_LBA_OUT_OF_RANGE:
186 		case NVME_SC_CAPACITY_EXCEEDED:
187 		default:
188 			return (0);
189 		}
190 	case NVME_SCT_COMMAND_SPECIFIC:
191 	case NVME_SCT_MEDIA_ERROR:
192 		return (0);
193 	case NVME_SCT_PATH_RELATED:
194 		switch (sc) {
195 		case NVME_SC_INTERNAL_PATH_ERROR:
196 			if (dnr)
197 				return (0);
198 			else
199 				return (1);
200 		default:
201 			return (0);
202 		}
203 	case NVME_SCT_VENDOR_SPECIFIC:
204 	default:
205 		return (0);
206 	}
207 }
208 
209 static void
nvme_qpair_complete_tracker(struct nvme_tracker * tr,struct nvme_completion * cpl,error_print_t print_on_error)210 nvme_qpair_complete_tracker(struct nvme_tracker *tr,
211     struct nvme_completion *cpl, error_print_t print_on_error)
212 {
213 	struct nvme_qpair	*qpair = tr->qpair;
214 	struct nvme_request	*req;
215 	bool			retry, error, retriable;
216 
217 	mtx_assert(&qpair->lock, MA_NOTOWNED);
218 
219 	req = tr->req;
220 	error = nvme_completion_is_error(cpl);
221 	retriable = nvme_completion_is_retry(cpl);
222 	retry = error && retriable && req->retries < nvme_retry_count;
223 	if (retry)
224 		qpair->num_retries++;
225 	if (error && req->retries >= nvme_retry_count && retriable)
226 		qpair->num_failures++;
227 
228 	if (error && (print_on_error == ERROR_PRINT_ALL ||
229 		(!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
230 		nvme_qpair_print_command(qpair, &req->cmd);
231 		nvme_qpair_print_completion(qpair, cpl);
232 	}
233 
234 	qpair->act_tr[cpl->cid] = NULL;
235 
236 	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));
237 
238 	if (!retry) {
239 		if (req->payload_valid) {
240 			bus_dmamap_sync(qpair->dma_tag_payload,
241 			    tr->payload_dma_map,
242 			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
243 		}
244 		if (req->cb_fn)
245 			req->cb_fn(req->cb_arg, cpl);
246 	}
247 
248 	mtx_lock(&qpair->lock);
249 
250 	if (retry) {
251 		req->retries++;
252 		nvme_qpair_submit_tracker(qpair, tr);
253 	} else {
254 		if (req->payload_valid) {
255 			bus_dmamap_unload(qpair->dma_tag_payload,
256 			    tr->payload_dma_map);
257 		}
258 
259 		nvme_free_request(req);
260 		tr->req = NULL;
261 
262 		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
263 		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
264 
265 		/*
266 		 * If the controller is in the middle of resetting, don't
267 		 *  try to submit queued requests here - let the reset logic
268 		 *  handle that instead.
269 		 */
270 		if (!STAILQ_EMPTY(&qpair->queued_req) &&
271 		    !qpair->ctrlr->is_resetting) {
272 			req = STAILQ_FIRST(&qpair->queued_req);
273 			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
274 			_nvme_qpair_submit_request(qpair, req);
275 		}
276 	}
277 
278 	mtx_unlock(&qpair->lock);
279 }
280 
281 static uint32_t
nvme_qpair_make_status(uint32_t sct,uint32_t sc,uint32_t dnr)282 nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
283 {
284 	uint32_t status = 0;
285 
286 	status |= NVMEF(NVME_STATUS_SCT, sct);
287 	status |= NVMEF(NVME_STATUS_SC, sc);
288 	status |= NVMEF(NVME_STATUS_DNR, dnr);
289 	/* M=0 : this is artificial so no data in error log page */
290 	/* CRD=0 : this is artificial and no delayed retry support anyway */
291 	/* P=0 : phase not checked */
292 	return (status);
293 }
294 
295 static void
nvme_qpair_manual_complete_tracker(struct nvme_tracker * tr,uint32_t sct,uint32_t sc,uint32_t dnr,error_print_t print_on_error)296 nvme_qpair_manual_complete_tracker(
297     struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
298     error_print_t print_on_error)
299 {
300 	struct nvme_completion	cpl;
301 	struct nvme_qpair * qpair = tr->qpair;
302 
303 	mtx_assert(&qpair->lock, MA_NOTOWNED);
304 
305 	memset(&cpl, 0, sizeof(cpl));
306 
307 	cpl.sqid = qpair->id;
308 	cpl.cid = tr->cid;
309 	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
310 	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
311 }
312 
313 static void
nvme_qpair_manual_complete_request(struct nvme_qpair * qpair,struct nvme_request * req,uint32_t sct,uint32_t sc,uint32_t dnr,error_print_t print_on_error)314 nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
315     struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
316     error_print_t print_on_error)
317 {
318 	struct nvme_completion	cpl;
319 	bool			error;
320 
321 	memset(&cpl, 0, sizeof(cpl));
322 	cpl.sqid = qpair->id;
323 	cpl.status = nvme_qpair_make_status(sct, sc, dnr);
324 	error = nvme_completion_is_error(&cpl);
325 
326 	if (error && print_on_error == ERROR_PRINT_ALL) {
327 		nvme_qpair_print_command(qpair, &req->cmd);
328 		nvme_qpair_print_completion(qpair, &cpl);
329 	}
330 
331 	if (req->cb_fn)
332 		req->cb_fn(req->cb_arg, &cpl);
333 
334 	nvme_free_request(req);
335 }
336 
337 /* Locked version of completion processor */
338 static bool
_nvme_qpair_process_completions(struct nvme_qpair * qpair)339 _nvme_qpair_process_completions(struct nvme_qpair *qpair)
340 {
341 	struct nvme_tracker	*tr;
342 	struct nvme_completion	cpl;
343 	bool done = false;
344 	bool in_panic = dumping || SCHEDULER_STOPPED();
345 
346 	mtx_assert(&qpair->recovery, MA_OWNED);
347 
348 	/*
349 	 * qpair is not enabled, likely because a controller reset is in
350 	 * progress.  Ignore the interrupt - any I/O that was associated with
351 	 * this interrupt will get retried when the reset is complete. Any
352 	 * pending completions for when we're in startup will be completed
353 	 * as soon as initialization is complete and we start sending commands
354 	 * to the device.
355 	 */
356 	if (qpair->recovery_state != RECOVERY_NONE) {
357 		qpair->num_ignored++;
358 		return (false);
359 	}
360 
361 	/*
362 	 * Sanity check initialization. After we reset the hardware, the phase
363 	 * is defined to be 1. So if we get here with zero prior calls and the
364 	 * phase is 0, it means that we've lost a race between the
365 	 * initialization and the ISR running. With the phase wrong, we'll
366 	 * process a bunch of completions that aren't really completions leading
367 	 * to a KASSERT below.
368 	 */
369 	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
370 	    ("%s: Phase wrong for first interrupt call.",
371 		device_get_nameunit(qpair->ctrlr->dev)));
372 
373 	qpair->num_intr_handler_calls++;
374 
375 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
376 	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
377 	/*
378 	 * A panic can stop the CPU this routine is running on at any point.  If
379 	 * we're called during a panic, complete the sq_head wrap protocol for
380 	 * the case where we are interrupted just after the increment at 1
381 	 * below, but before we can reset cq_head to zero at 2. Also cope with
382 	 * the case where we do the zero at 2, but may or may not have done the
383 	 * phase adjustment at step 3. The panic machinery flushes all pending
384 	 * memory writes, so we can make these strong ordering assumptions
385 	 * that would otherwise be unwise if we were racing in real time.
386 	 */
387 	if (__predict_false(in_panic)) {
388 		if (qpair->cq_head == qpair->num_entries) {
389 			/*
390 			 * Here we know that we need to zero cq_head and then negate
391 			 * the phase, which hasn't been assigned if cq_head isn't
392 			 * zero due to the atomic_store_rel.
393 			 */
394 			qpair->cq_head = 0;
395 			qpair->phase = !qpair->phase;
396 		} else if (qpair->cq_head == 0) {
397 			/*
398 			 * In this case, we know that the assignment at 2
399 			 * happened below, but we don't know if it 3 happened or
400 			 * not. To do this, we look at the last completion
401 			 * entry and set the phase to the opposite phase
402 			 * that it has. This gets us back in sync
403 			 */
404 			cpl = qpair->cpl[qpair->num_entries - 1];
405 			nvme_completion_swapbytes(&cpl);
406 			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
407 		}
408 	}
409 
410 	while (1) {
411 		uint16_t status;
412 
413 		/*
414 		 * We need to do this dance to avoid a race between the host and
415 		 * the device where the device overtakes the host while the host
416 		 * is reading this record, leaving the status field 'new' and
417 		 * the sqhd and cid fields potentially stale. If the phase
418 		 * doesn't match, that means status hasn't yet been updated and
419 		 * we'll get any pending changes next time. It also means that
420 		 * the phase must be the same the second time. We have to sync
421 		 * before reading to ensure any bouncing completes.
422 		 */
423 		status = le16toh(qpair->cpl[qpair->cq_head].status);
424 		if (NVME_STATUS_GET_P(status) != qpair->phase)
425 			break;
426 
427 		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
428 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
429 		cpl = qpair->cpl[qpair->cq_head];
430 		nvme_completion_swapbytes(&cpl);
431 
432 		KASSERT(
433 		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
434 		    ("Phase unexpectedly inconsistent"));
435 
436 		if (cpl.cid < qpair->num_trackers)
437 			tr = qpair->act_tr[cpl.cid];
438 		else
439 			tr = NULL;
440 
441 		done = true;
442 		if (tr != NULL) {
443 			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
444 			qpair->sq_head = cpl.sqhd;
445 		} else if (!in_panic) {
446 			/*
447 			 * A missing tracker is normally an error.  However, a
448 			 * panic can stop the CPU this routine is running on
449 			 * after completing an I/O but before updating
450 			 * qpair->cq_head at 1 below.  Later, we re-enter this
451 			 * routine to poll I/O associated with the kernel
452 			 * dump. We find that the tr has been set to null before
453 			 * calling the completion routine.  If it hasn't
454 			 * completed (or it triggers a panic), then '1' below
455 			 * won't have updated cq_head. Rather than panic again,
456 			 * ignore this condition because it's not unexpected.
457 			 */
458 			nvme_printf(qpair->ctrlr,
459 			    "cpl (cid = %u) does not map to outstanding cmd\n",
460 				cpl.cid);
461 			nvme_qpair_print_completion(qpair,
462 			    &qpair->cpl[qpair->cq_head]);
463 			KASSERT(0, ("received completion for unknown cmd"));
464 		}
465 
466 		/*
467 		 * There's a number of races with the following (see above) when
468 		 * the system panics. We compensate for each one of them by
469 		 * using the atomic store to force strong ordering (at least when
470 		 * viewed in the aftermath of a panic).
471 		 */
472 		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
473 			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
474 			qpair->phase = !qpair->phase;			/* 3 */
475 		}
476 	}
477 
478 	if (done) {
479 		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
480 		    qpair->cq_hdbl_off, qpair->cq_head);
481 	}
482 
483 	return (done);
484 }
485 
486 bool
nvme_qpair_process_completions(struct nvme_qpair * qpair)487 nvme_qpair_process_completions(struct nvme_qpair *qpair)
488 {
489 	bool done = false;
490 
491 	/*
492 	 * Interlock with reset / recovery code. This is an usually uncontended
493 	 * to make sure that we drain out of the ISRs before we reset the card
494 	 * and to prevent races with the recovery process called from a timeout
495 	 * context.
496 	 */
497 	mtx_lock(&qpair->recovery);
498 
499 	if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
500 		done = _nvme_qpair_process_completions(qpair);
501 	else
502 		qpair->num_recovery_nolock++;	// XXX likely need to rename
503 
504 	mtx_unlock(&qpair->recovery);
505 
506 	return (done);
507 }
508 
509 static void
nvme_qpair_msi_handler(void * arg)510 nvme_qpair_msi_handler(void *arg)
511 {
512 	struct nvme_qpair *qpair = arg;
513 
514 	nvme_qpair_process_completions(qpair);
515 }
516 
517 int
nvme_qpair_construct(struct nvme_qpair * qpair,uint32_t num_entries,uint32_t num_trackers,struct nvme_controller * ctrlr)518 nvme_qpair_construct(struct nvme_qpair *qpair,
519     uint32_t num_entries, uint32_t num_trackers,
520     struct nvme_controller *ctrlr)
521 {
522 	struct nvme_tracker	*tr;
523 	size_t			cmdsz, cplsz, prpsz, allocsz, prpmemsz;
524 	uint64_t		queuemem_phys, prpmem_phys, list_phys;
525 	uint8_t			*queuemem, *prpmem, *prp_list;
526 	int			i, err;
527 
528 	qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
529 	qpair->num_entries = num_entries;
530 	qpair->num_trackers = num_trackers;
531 	qpair->ctrlr = ctrlr;
532 
533 	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
534 	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
535 
536 	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
537 	qpair->timer_armed = false;
538 	qpair->recovery_state = RECOVERY_WAITING;
539 
540 	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
541 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
542 	    4, ctrlr->page_size, BUS_SPACE_MAXADDR,
543 	    BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
544 	    howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
545 	    ctrlr->page_size, 0,
546 	    NULL, NULL, &qpair->dma_tag_payload);
547 	if (err != 0) {
548 		nvme_printf(ctrlr, "payload tag create failed %d\n", err);
549 		goto out;
550 	}
551 
552 	/*
553 	 * Each component must be page aligned, and individual PRP lists
554 	 * cannot cross a page boundary.
555 	 */
556 	cmdsz = qpair->num_entries * sizeof(struct nvme_command);
557 	cmdsz = roundup2(cmdsz, ctrlr->page_size);
558 	cplsz = qpair->num_entries * sizeof(struct nvme_completion);
559 	cplsz = roundup2(cplsz, ctrlr->page_size);
560 	/*
561 	 * For commands requiring more than 2 PRP entries, one PRP will be
562 	 * embedded in the command (prp1), and the rest of the PRP entries
563 	 * will be in a list pointed to by the command (prp2).
564 	 */
565 	prpsz = sizeof(uint64_t) *
566 	    howmany(ctrlr->max_xfer_size, ctrlr->page_size);
567 	prpmemsz = qpair->num_trackers * prpsz;
568 	allocsz = cmdsz + cplsz + prpmemsz;
569 
570 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
571 	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
572 	    allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
573 	if (err != 0) {
574 		nvme_printf(ctrlr, "tag create failed %d\n", err);
575 		goto out;
576 	}
577 	bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
578 
579 	if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
580 	     BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
581 		nvme_printf(ctrlr, "failed to alloc qpair memory\n");
582 		goto out;
583 	}
584 
585 	if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
586 	    queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
587 		nvme_printf(ctrlr, "failed to load qpair memory\n");
588 		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
589 		    qpair->queuemem_map);
590 		goto out;
591 	}
592 
593 	qpair->num_cmds = 0;
594 	qpair->num_intr_handler_calls = 0;
595 	qpair->num_retries = 0;
596 	qpair->num_failures = 0;
597 	qpair->num_ignored = 0;
598 	qpair->cmd = (struct nvme_command *)queuemem;
599 	qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
600 	prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
601 	qpair->cmd_bus_addr = queuemem_phys;
602 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
603 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
604 
605 	/*
606 	 * Calcuate the stride of the doorbell register. Many emulators set this
607 	 * value to correspond to a cache line. However, some hardware has set
608 	 * it to various small values.
609 	 */
610 	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
611 	    (qpair->id << (ctrlr->dstrd + 1));
612 	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
613 	    (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
614 
615 	TAILQ_INIT(&qpair->free_tr);
616 	TAILQ_INIT(&qpair->outstanding_tr);
617 	STAILQ_INIT(&qpair->queued_req);
618 
619 	list_phys = prpmem_phys;
620 	prp_list = prpmem;
621 	for (i = 0; i < qpair->num_trackers; i++) {
622 		if (list_phys + prpsz > prpmem_phys + prpmemsz) {
623 			qpair->num_trackers = i;
624 			break;
625 		}
626 
627 		/*
628 		 * Make sure that the PRP list for this tracker doesn't
629 		 * overflow to another nvme page.
630 		 */
631 		if (trunc_page(list_phys) !=
632 		    trunc_page(list_phys + prpsz - 1)) {
633 			list_phys = roundup2(list_phys, ctrlr->page_size);
634 			prp_list =
635 			    (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
636 		}
637 
638 		tr = malloc_domainset(sizeof(*tr), M_NVME,
639 		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
640 		bus_dmamap_create(qpair->dma_tag_payload, 0,
641 		    &tr->payload_dma_map);
642 		tr->cid = i;
643 		tr->qpair = qpair;
644 		tr->prp = (uint64_t *)prp_list;
645 		tr->prp_bus_addr = list_phys;
646 		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
647 		list_phys += prpsz;
648 		prp_list += prpsz;
649 	}
650 
651 	if (qpair->num_trackers == 0) {
652 		nvme_printf(ctrlr, "failed to allocate enough trackers\n");
653 		goto out;
654 	}
655 
656 	qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
657 	    qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
658 	    M_ZERO | M_WAITOK);
659 
660 	if (ctrlr->msi_count > 1) {
661 		/*
662 		 * MSI-X vector resource IDs start at 1, so we add one to
663 		 *  the queue's vector to get the corresponding rid to use.
664 		 */
665 		qpair->rid = qpair->vector + 1;
666 
667 		qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
668 		    &qpair->rid, RF_ACTIVE);
669 		if (qpair->res == NULL) {
670 			nvme_printf(ctrlr, "unable to allocate MSI\n");
671 			goto out;
672 		}
673 		if (bus_setup_intr(ctrlr->dev, qpair->res,
674 		    INTR_TYPE_MISC | INTR_MPSAFE, NULL,
675 		    nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
676 			nvme_printf(ctrlr, "unable to setup MSI\n");
677 			goto out;
678 		}
679 		if (qpair->id == 0) {
680 			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
681 			    "admin");
682 		} else {
683 			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
684 			    "io%d", qpair->id - 1);
685 		}
686 	}
687 
688 	return (0);
689 
690 out:
691 	nvme_qpair_destroy(qpair);
692 	return (ENOMEM);
693 }
694 
695 static void
nvme_qpair_destroy(struct nvme_qpair * qpair)696 nvme_qpair_destroy(struct nvme_qpair *qpair)
697 {
698 	struct nvme_tracker	*tr;
699 
700 	mtx_lock(&qpair->recovery);
701 	qpair->timer_armed = false;
702 	mtx_unlock(&qpair->recovery);
703 	callout_drain(&qpair->timer);
704 
705 	if (qpair->tag) {
706 		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
707 		qpair->tag = NULL;
708 	}
709 
710 	if (qpair->act_tr) {
711 		free(qpair->act_tr, M_NVME);
712 		qpair->act_tr = NULL;
713 	}
714 
715 	while (!TAILQ_EMPTY(&qpair->free_tr)) {
716 		tr = TAILQ_FIRST(&qpair->free_tr);
717 		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
718 		bus_dmamap_destroy(qpair->dma_tag_payload,
719 		    tr->payload_dma_map);
720 		free(tr, M_NVME);
721 	}
722 
723 	if (qpair->cmd != NULL) {
724 		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
725 		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
726 		    qpair->queuemem_map);
727 		qpair->cmd = NULL;
728 	}
729 
730 	if (qpair->dma_tag) {
731 		bus_dma_tag_destroy(qpair->dma_tag);
732 		qpair->dma_tag = NULL;
733 	}
734 
735 	if (qpair->dma_tag_payload) {
736 		bus_dma_tag_destroy(qpair->dma_tag_payload);
737 		qpair->dma_tag_payload = NULL;
738 	}
739 
740 	if (mtx_initialized(&qpair->lock))
741 		mtx_destroy(&qpair->lock);
742 	if (mtx_initialized(&qpair->recovery))
743 		mtx_destroy(&qpair->recovery);
744 
745 	if (qpair->res) {
746 		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
747 		    rman_get_rid(qpair->res), qpair->res);
748 		qpair->res = NULL;
749 	}
750 }
751 
752 static void
nvme_admin_qpair_abort_aers(struct nvme_qpair * qpair)753 nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
754 {
755 	struct nvme_tracker	*tr;
756 
757 	/*
758 	 * nvme_complete_tracker must be called without the qpair lock held. It
759 	 * takes the lock to adjust outstanding_tr list, so make sure we don't
760 	 * have it yet. We need the lock to make the list traverse safe, but
761 	 * have to drop the lock to complete any AER. We restart the list scan
762 	 * when we do this to make this safe. There's interlock with the ISR so
763 	 * we know this tracker won't be completed twice.
764 	 */
765 	mtx_assert(&qpair->lock, MA_NOTOWNED);
766 
767 	mtx_lock(&qpair->lock);
768 	tr = TAILQ_FIRST(&qpair->outstanding_tr);
769 	while (tr != NULL) {
770 		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
771 			tr = TAILQ_NEXT(tr, tailq);
772 			continue;
773 		}
774 		mtx_unlock(&qpair->lock);
775 		nvme_qpair_manual_complete_tracker(tr,
776 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
777 		    ERROR_PRINT_NONE);
778 		mtx_lock(&qpair->lock);
779 		tr = TAILQ_FIRST(&qpair->outstanding_tr);
780 	}
781 	mtx_unlock(&qpair->lock);
782 }
783 
784 void
nvme_admin_qpair_destroy(struct nvme_qpair * qpair)785 nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
786 {
787 	mtx_assert(&qpair->lock, MA_NOTOWNED);
788 
789 	nvme_admin_qpair_abort_aers(qpair);
790 	nvme_qpair_destroy(qpair);
791 }
792 
793 void
nvme_io_qpair_destroy(struct nvme_qpair * qpair)794 nvme_io_qpair_destroy(struct nvme_qpair *qpair)
795 {
796 
797 	nvme_qpair_destroy(qpair);
798 }
799 
800 static void
nvme_abort_complete(void * arg,const struct nvme_completion * status)801 nvme_abort_complete(void *arg, const struct nvme_completion *status)
802 {
803 	struct nvme_tracker     *tr = arg;
804 
805 	/*
806 	 * If cdw0 bit 0 == 1, the controller was not able to abort the command
807 	 * we requested.  We still need to check the active tracker array, to
808 	 * cover race where I/O timed out at same time controller was completing
809 	 * the I/O. An abort command always is on the admin queue, but affects
810 	 * either an admin or an I/O queue, so take the appropriate qpair lock
811 	 * for the original command's queue, since we'll need it to avoid races
812 	 * with the completion code and to complete the command manually.
813 	 */
814 	mtx_lock(&tr->qpair->lock);
815 	if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
816 		/*
817 		 * An I/O has timed out, and the controller was unable to abort
818 		 * it for some reason.  And we've not processed a completion for
819 		 * it yet. Construct a fake completion status, and then complete
820 		 * the I/O's tracker manually.
821 		 */
822 		nvme_printf(tr->qpair->ctrlr,
823 		    "abort command failed, aborting command manually\n");
824 		nvme_qpair_manual_complete_tracker(tr,
825 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
826 	}
827 	/*
828 	 * XXX We don't check status for the possible 'Could not abort because
829 	 * excess aborts were submitted to the controller'. We don't prevent
830 	 * that, either. Document for the future here, since the standard is
831 	 * squishy and only says 'may generate' but implies anything is possible
832 	 * including hangs if you exceed the ACL.
833 	 */
834 	mtx_unlock(&tr->qpair->lock);
835 }
836 
837 static void
nvme_qpair_timeout(void * arg)838 nvme_qpair_timeout(void *arg)
839 {
840 	struct nvme_qpair	*qpair = arg;
841 	struct nvme_controller	*ctrlr = qpair->ctrlr;
842 	struct nvme_tracker	*tr;
843 	sbintime_t		now;
844 	bool			idle = true;
845 	bool			is_admin = qpair == &ctrlr->adminq;
846 	bool			fast;
847 	uint32_t		csts;
848 	uint8_t			cfs;
849 
850 	mtx_assert(&qpair->recovery, MA_OWNED);
851 
852 	/*
853 	 * If the controller is failed, then stop polling. This ensures that any
854 	 * failure processing that races with the qpair timeout will fail
855 	 * safely.
856 	 */
857 	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
858 		nvme_printf(qpair->ctrlr,
859 		    "%sFailed controller, stopping watchdog timeout.\n",
860 		    is_admin ? "Complete " : "");
861 		qpair->timer_armed = false;
862 		return;
863 	}
864 
865 	/*
866 	 * Shutdown condition: We set qpair->timer_armed to false in
867 	 * nvme_qpair_destroy before calling callout_drain. When we call that,
868 	 * this routine might get called one last time. Exit w/o setting a
869 	 * timeout. None of the watchdog stuff needs to be done since we're
870 	 * destroying the qpair.
871 	 */
872 	if (!qpair->timer_armed) {
873 		nvme_printf(qpair->ctrlr,
874 		    "Timeout fired during nvme_qpair_destroy\n");
875 		return;
876 	}
877 
878 	switch (qpair->recovery_state) {
879 	case RECOVERY_NONE:
880 		/*
881 		 * Read csts to get value of cfs - controller fatal status.  If
882 		 * we are in the hot-plug or controller failed status proceed
883 		 * directly to reset. We also bail early if the status reads all
884 		 * 1's or the control fatal status bit is now 1. The latter is
885 		 * always true when the former is true, but not vice versa.  The
886 		 * intent of the code is that if the card is gone (all 1's) or
887 		 * we've failed, then try to do a reset (which someitmes
888 		 * unwedges a card reading all 1's that's not gone away, but
889 		 * usually doesn't).
890 		 */
891 		csts = nvme_mmio_read_4(ctrlr, csts);
892 		cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
893 		if (csts == NVME_GONE || cfs == 1) {
894 			/*
895 			 * We've had a command timeout that we weren't able to
896 			 * abort or we have aborts disabled and any command
897 			 * timed out.
898 			 *
899 			 * If we get here due to a possible surprise hot-unplug
900 			 * event, then we let nvme_ctrlr_reset confirm and fail
901 			 * the controller.
902 			 */
903 do_reset:
904 			nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
905 			    (csts == 0xffffffff) ? " and possible hot unplug" :
906 			    (cfs ? " and fatal error status" : ""));
907 			qpair->recovery_state = RECOVERY_WAITING;
908 			nvme_ctrlr_reset(ctrlr);
909 			idle = false;
910 			break;
911 		}
912 
913 
914 		/*
915 		 * See if there's any recovery needed. First, do a fast check to
916 		 * see if anything could have timed out. If not, then skip
917 		 * everything else.
918 		 */
919 		fast = false;
920 		mtx_lock(&qpair->lock);
921 		now = getsbinuptime();
922 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
923 			/*
924 			 * Skip async commands, they are posted to the card for
925 			 * an indefinite amount of time and have no deadline.
926 			 */
927 			if (tr->deadline == SBT_MAX)
928 				continue;
929 
930 			/*
931 			 * If the first real transaction is not in timeout, then
932 			 * we're done. Otherwise, we try recovery.
933 			 */
934 			idle = false;
935 			if (now <= tr->deadline)
936 				fast = true;
937 			break;
938 		}
939 		mtx_unlock(&qpair->lock);
940 		if (idle || fast)
941 			break;
942 
943 		/*
944 		 * There's a stale transaction at the start of the queue whose
945 		 * deadline has passed. Poll the competions as a last-ditch
946 		 * effort in case an interrupt has been missed. Warn the user if
947 		 * transactions were found of possible interrupt issues, but
948 		 * just once per controller.
949 		 */
950 		if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
951 			nvme_printf(ctrlr, "System interrupt issues?\n");
952 			ctrlr->isr_warned = true;
953 		}
954 
955 		/*
956 		 * Now that we've run the ISR, re-rheck to see if there's any
957 		 * timed out commands and abort them or reset the card if so.
958 		 */
959 		mtx_lock(&qpair->lock);
960 		idle = true;
961 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
962 			/*
963 			 * Skip async commands, they are posted to the card for
964 			 * an indefinite amount of time and have no deadline.
965 			 */
966 			if (tr->deadline == SBT_MAX)
967 				continue;
968 
969 			/*
970 			 * If we know this tracker hasn't timed out, we also
971 			 * know all subsequent ones haven't timed out. The tr
972 			 * queue is in submission order and all normal commands
973 			 * in a queue have the same timeout (or the timeout was
974 			 * changed by the user, but we eventually timeout then).
975 			 */
976 			idle = false;
977 			if (now <= tr->deadline)
978 				break;
979 
980 			/*
981 			 * Timeout expired, abort it or reset controller.
982 			 */
983 			if (ctrlr->enable_aborts &&
984 			    tr->req->cb_fn != nvme_abort_complete) {
985 				/*
986 				 * This isn't an abort command, ask for a
987 				 * hardware abort. This goes to the admin
988 				 * queue which will reset the card if it
989 				 * times out.
990 				 */
991 				nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
992 				    nvme_abort_complete, tr);
993 			} else {
994 				/*
995 				 * We have a live command in the card (either
996 				 * one we couldn't abort, or aborts weren't
997 				 * enabled).  We can only reset.
998 				 */
999 				mtx_unlock(&qpair->lock);
1000 				goto do_reset;
1001 			}
1002 		}
1003 		mtx_unlock(&qpair->lock);
1004 		break;
1005 
1006 	case RECOVERY_WAITING:
1007 		/*
1008 		 * These messages aren't interesting while we're suspended. We
1009 		 * put the queues into waiting state while
1010 		 * suspending. Suspending takes a while, so we'll see these
1011 		 * during that time and they aren't diagnostic. At other times,
1012 		 * they indicate a problem that's worth complaining about.
1013 		 */
1014 		if (!device_is_suspended(ctrlr->dev))
1015 			nvme_printf(ctrlr, "Waiting for reset to complete\n");
1016 		idle = false;		/* We want to keep polling */
1017 		break;
1018 	}
1019 
1020 	/*
1021 	 * Rearm the timeout.
1022 	 */
1023 	if (!idle) {
1024 		callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1025 	} else {
1026 		qpair->timer_armed = false;
1027 	}
1028 }
1029 
1030 /*
1031  * Submit the tracker to the hardware. Must already be in the
1032  * outstanding queue when called.
1033  */
1034 void
nvme_qpair_submit_tracker(struct nvme_qpair * qpair,struct nvme_tracker * tr)1035 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
1036 {
1037 	struct nvme_request	*req;
1038 	struct nvme_controller	*ctrlr;
1039 	int timeout;
1040 
1041 	mtx_assert(&qpair->lock, MA_OWNED);
1042 
1043 	req = tr->req;
1044 	req->cmd.cid = tr->cid;
1045 	qpair->act_tr[tr->cid] = tr;
1046 	ctrlr = qpair->ctrlr;
1047 
1048 	if (req->timeout) {
1049 		if (req->cb_fn == nvme_completion_poll_cb)
1050 			timeout = 1;
1051 		else if (qpair->id == 0)
1052 			timeout = ctrlr->admin_timeout_period;
1053 		else
1054 			timeout = ctrlr->timeout_period;
1055 		tr->deadline = getsbinuptime() + timeout * SBT_1S;
1056 		if (!qpair->timer_armed) {
1057 			qpair->timer_armed = true;
1058 			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
1059 			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
1060 		}
1061 	} else
1062 		tr->deadline = SBT_MAX;
1063 
1064 	/* Copy the command from the tracker to the submission queue. */
1065 	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
1066 
1067 	if (++qpair->sq_tail == qpair->num_entries)
1068 		qpair->sq_tail = 0;
1069 
1070 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
1071 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1072 	bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
1073 	    qpair->sq_tdbl_off, qpair->sq_tail);
1074 	qpair->num_cmds++;
1075 }
1076 
1077 static void
nvme_payload_map(void * arg,bus_dma_segment_t * seg,int nseg,int error)1078 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1079 {
1080 	struct nvme_tracker 	*tr = arg;
1081 	uint32_t		cur_nseg;
1082 
1083 	/*
1084 	 * If the mapping operation failed, return immediately.  The caller
1085 	 *  is responsible for detecting the error status and failing the
1086 	 *  tracker manually.
1087 	 */
1088 	if (error != 0) {
1089 		nvme_printf(tr->qpair->ctrlr,
1090 		    "nvme_payload_map err %d\n", error);
1091 		return;
1092 	}
1093 
1094 	/*
1095 	 * Note that we specified ctrlr->page_size for alignment and max
1096 	 * segment size when creating the bus dma tags.  So here we can safely
1097 	 * just transfer each segment to its associated PRP entry.
1098 	 */
1099 	tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1100 
1101 	if (nseg == 2) {
1102 		tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1103 	} else if (nseg > 2) {
1104 		cur_nseg = 1;
1105 		tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1106 		while (cur_nseg < nseg) {
1107 			tr->prp[cur_nseg-1] =
1108 			    htole64((uint64_t)seg[cur_nseg].ds_addr);
1109 			cur_nseg++;
1110 		}
1111 	} else {
1112 		/*
1113 		 * prp2 should not be used by the controller
1114 		 *  since there is only one segment, but set
1115 		 *  to 0 just to be safe.
1116 		 */
1117 		tr->req->cmd.prp2 = 0;
1118 	}
1119 
1120 	bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1121 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1122 	nvme_qpair_submit_tracker(tr->qpair, tr);
1123 }
1124 
1125 static void
_nvme_qpair_submit_request(struct nvme_qpair * qpair,struct nvme_request * req)1126 _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1127 {
1128 	struct nvme_tracker	*tr;
1129 	int			err = 0;
1130 	bool			is_admin = qpair == &qpair->ctrlr->adminq;
1131 
1132 	mtx_assert(&qpair->lock, MA_OWNED);
1133 
1134 	tr = TAILQ_FIRST(&qpair->free_tr);
1135 	req->qpair = qpair;
1136 
1137 	/*
1138 	 * The controller has failed, so fail the request. Note, that this races
1139 	 * the recovery / timeout code. Since we hold the qpair lock, we know
1140 	 * it's safe to fail directly. is_failed is set when we fail the
1141 	 * controller.  It is only ever reset in the ioctl reset controller
1142 	 * path, which is safe to race (for failed controllers, we make no
1143 	 * guarantees about bringing it out of failed state relative to other
1144 	 * commands). We try hard to allow admin commands when the entire
1145 	 * controller hasn't failed, only something related to I/O queues.
1146 	 */
1147 	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
1148 		nvme_qpair_manual_complete_request(qpair, req,
1149 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
1150 		    ERROR_PRINT_NONE);
1151 		return;
1152 	}
1153 
1154 	/*
1155 	 * No tracker is available, or the qpair is disabled due to an
1156 	 * in-progress controller-level reset. If we lose the race with
1157 	 * recovery_state, then we may add an extra request to the queue which
1158 	 * will be resubmitted later.  We only set recovery_state to NONE with
1159 	 * qpair->lock also held, so if we observe that the state is not NONE,
1160 	 * we know it won't transition back to NONE without retrying queued
1161 	 * request.
1162 	 */
1163 	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
1164 		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1165 		return;
1166 	}
1167 
1168 	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
1169 	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
1170 	tr->deadline = SBT_MAX;
1171 	tr->req = req;
1172 
1173 	if (!req->payload_valid) {
1174 		nvme_qpair_submit_tracker(tr->qpair, tr);
1175 		return;
1176 	}
1177 
1178 	/*
1179 	 * tr->deadline updating when nvme_payload_map calls
1180 	 * nvme_qpair_submit_tracker (we call it above directly
1181 	 * when there's no map to load).
1182 	 */
1183 	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
1184 	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
1185 	if (err != 0) {
1186 		/*
1187 		 * The dmamap operation failed, so we manually fail the
1188 		 *  tracker here with DATA_TRANSFER_ERROR status.
1189 		 *
1190 		 * nvme_qpair_manual_complete_tracker must not be called
1191 		 *  with the qpair lock held.
1192 		 */
1193 		nvme_printf(qpair->ctrlr,
1194 		    "bus_dmamap_load_mem returned 0x%x!\n", err);
1195 		mtx_unlock(&qpair->lock);
1196 		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1197 		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
1198 		mtx_lock(&qpair->lock);
1199 	}
1200 }
1201 
1202 void
nvme_qpair_submit_request(struct nvme_qpair * qpair,struct nvme_request * req)1203 nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
1204 {
1205 
1206 	mtx_lock(&qpair->lock);
1207 	_nvme_qpair_submit_request(qpair, req);
1208 	mtx_unlock(&qpair->lock);
1209 }
1210 
1211 static void
nvme_qpair_enable(struct nvme_qpair * qpair)1212 nvme_qpair_enable(struct nvme_qpair *qpair)
1213 {
1214 	bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
1215 
1216 	if (mtx_initialized(&qpair->recovery))
1217 		mtx_assert(&qpair->recovery, MA_OWNED);
1218 	if (mtx_initialized(&qpair->lock))
1219 		mtx_assert(&qpair->lock, MA_OWNED);
1220 	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
1221 	    ("Enabling a failed qpair\n"));
1222 
1223 	qpair->recovery_state = RECOVERY_NONE;
1224 }
1225 
1226 void
nvme_qpair_reset(struct nvme_qpair * qpair)1227 nvme_qpair_reset(struct nvme_qpair *qpair)
1228 {
1229 
1230 	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1231 
1232 	/*
1233 	 * First time through the completion queue, HW will set phase
1234 	 *  bit on completions to 1.  So set this to 1 here, indicating
1235 	 *  we're looking for a 1 to know which entries have completed.
1236 	 *  we'll toggle the bit each time when the completion queue
1237 	 *  rolls over.
1238 	 */
1239 	qpair->phase = 1;
1240 
1241 	memset(qpair->cmd, 0,
1242 	    qpair->num_entries * sizeof(struct nvme_command));
1243 	memset(qpair->cpl, 0,
1244 	    qpair->num_entries * sizeof(struct nvme_completion));
1245 }
1246 
1247 void
nvme_admin_qpair_enable(struct nvme_qpair * qpair)1248 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1249 {
1250 	struct nvme_tracker		*tr;
1251 	struct nvme_tracker		*tr_temp;
1252 	bool				rpt;
1253 
1254 	/*
1255 	 * Manually abort each outstanding admin command.  Do not retry
1256 	 * admin commands found here, since they will be left over from
1257 	 * a controller reset and its likely the context in which the
1258 	 * command was issued no longer applies.
1259 	 */
1260 	rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
1261 	if (rpt)
1262 		nvme_printf(qpair->ctrlr,
1263 		    "aborting outstanding admin command\n");
1264 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1265 		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1266 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1267 	}
1268 	if (rpt)
1269 		nvme_printf(qpair->ctrlr,
1270 		    "done aborting outstanding admin\n");
1271 
1272 	mtx_lock(&qpair->recovery);
1273 	mtx_lock(&qpair->lock);
1274 	nvme_qpair_enable(qpair);
1275 	mtx_unlock(&qpair->lock);
1276 	mtx_unlock(&qpair->recovery);
1277 }
1278 
1279 void
nvme_io_qpair_enable(struct nvme_qpair * qpair)1280 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1281 {
1282 	STAILQ_HEAD(, nvme_request)	temp;
1283 	struct nvme_tracker		*tr;
1284 	struct nvme_tracker		*tr_temp;
1285 	struct nvme_request		*req;
1286 	bool				report;
1287 
1288 	/*
1289 	 * Manually abort each outstanding I/O.  This normally results in a
1290 	 * retry, unless the retry count on the associated request has
1291 	 * reached its limit.
1292 	 */
1293 	report = !TAILQ_EMPTY(&qpair->outstanding_tr);
1294 	if (report)
1295 		nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1296 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1297 		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1298 		    NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1299 	}
1300 	if (report)
1301 		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
1302 
1303 	mtx_lock(&qpair->recovery);
1304 	mtx_lock(&qpair->lock);
1305 	nvme_qpair_enable(qpair);
1306 
1307 	STAILQ_INIT(&temp);
1308 	STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1309 
1310 	report = !STAILQ_EMPTY(&temp);
1311 	if (report)
1312 		nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1313 	while (!STAILQ_EMPTY(&temp)) {
1314 		req = STAILQ_FIRST(&temp);
1315 		STAILQ_REMOVE_HEAD(&temp, stailq);
1316 		nvme_qpair_print_command(qpair, &req->cmd);
1317 		_nvme_qpair_submit_request(qpair, req);
1318 	}
1319 	if (report)
1320 		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
1321 
1322 	mtx_unlock(&qpair->lock);
1323 	mtx_unlock(&qpair->recovery);
1324 }
1325 
1326 static void
nvme_qpair_disable(struct nvme_qpair * qpair)1327 nvme_qpair_disable(struct nvme_qpair *qpair)
1328 {
1329 	struct nvme_tracker	*tr, *tr_temp;
1330 
1331 	if (mtx_initialized(&qpair->recovery))
1332 		mtx_assert(&qpair->recovery, MA_OWNED);
1333 	if (mtx_initialized(&qpair->lock))
1334 		mtx_assert(&qpair->lock, MA_OWNED);
1335 
1336 	qpair->recovery_state = RECOVERY_WAITING;
1337 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1338 		tr->deadline = SBT_MAX;
1339 	}
1340 }
1341 
1342 void
nvme_admin_qpair_disable(struct nvme_qpair * qpair)1343 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
1344 {
1345 	mtx_lock(&qpair->recovery);
1346 
1347 	mtx_lock(&qpair->lock);
1348 	nvme_qpair_disable(qpair);
1349 	mtx_unlock(&qpair->lock);
1350 
1351 	nvme_admin_qpair_abort_aers(qpair);
1352 
1353 	mtx_unlock(&qpair->recovery);
1354 }
1355 
1356 void
nvme_io_qpair_disable(struct nvme_qpair * qpair)1357 nvme_io_qpair_disable(struct nvme_qpair *qpair)
1358 {
1359 	mtx_lock(&qpair->recovery);
1360 	mtx_lock(&qpair->lock);
1361 
1362 	nvme_qpair_disable(qpair);
1363 
1364 	mtx_unlock(&qpair->lock);
1365 	mtx_unlock(&qpair->recovery);
1366 }
1367 
1368 void
nvme_qpair_fail(struct nvme_qpair * qpair)1369 nvme_qpair_fail(struct nvme_qpair *qpair)
1370 {
1371 	struct nvme_tracker		*tr;
1372 	struct nvme_request		*req;
1373 
1374 	if (!mtx_initialized(&qpair->lock))
1375 		return;
1376 
1377 	mtx_lock(&qpair->lock);
1378 
1379 	if (!STAILQ_EMPTY(&qpair->queued_req)) {
1380 		nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1381 	}
1382 	while (!STAILQ_EMPTY(&qpair->queued_req)) {
1383 		req = STAILQ_FIRST(&qpair->queued_req);
1384 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1385 		mtx_unlock(&qpair->lock);
1386 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1387 		    NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
1388 		mtx_lock(&qpair->lock);
1389 	}
1390 
1391 	if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1392 		nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1393 	}
1394 	/* Manually abort each outstanding I/O. */
1395 	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1396 		tr = TAILQ_FIRST(&qpair->outstanding_tr);
1397 		/*
1398 		 * Do not remove the tracker.  The abort_tracker path will
1399 		 *  do that for us.
1400 		 */
1401 		mtx_unlock(&qpair->lock);
1402 		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1403 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1404 		mtx_lock(&qpair->lock);
1405 	}
1406 
1407 	mtx_unlock(&qpair->lock);
1408 }
1409