xref: /freebsd/sys/dev/nvme/nvme_ctrlr.c (revision 8052b01e7e4113fa8296ce43c354116b0a1774b7)
1bb0ec6b3SJim Harris /*-
24d846d26SWarner Losh  * SPDX-License-Identifier: BSD-2-Clause
3718cf2ccSPedro F. Giffuni  *
450dea2daSJim Harris  * Copyright (C) 2012-2016 Intel Corporation
5bb0ec6b3SJim Harris  * All rights reserved.
6bb0ec6b3SJim Harris  *
7bb0ec6b3SJim Harris  * Redistribution and use in source and binary forms, with or without
8bb0ec6b3SJim Harris  * modification, are permitted provided that the following conditions
9bb0ec6b3SJim Harris  * are met:
10bb0ec6b3SJim Harris  * 1. Redistributions of source code must retain the above copyright
11bb0ec6b3SJim Harris  *    notice, this list of conditions and the following disclaimer.
12bb0ec6b3SJim Harris  * 2. Redistributions in binary form must reproduce the above copyright
13bb0ec6b3SJim Harris  *    notice, this list of conditions and the following disclaimer in the
14bb0ec6b3SJim Harris  *    documentation and/or other materials provided with the distribution.
15bb0ec6b3SJim Harris  *
16bb0ec6b3SJim Harris  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17bb0ec6b3SJim Harris  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18bb0ec6b3SJim Harris  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19bb0ec6b3SJim Harris  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20bb0ec6b3SJim Harris  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21bb0ec6b3SJim Harris  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22bb0ec6b3SJim Harris  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23bb0ec6b3SJim Harris  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24bb0ec6b3SJim Harris  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25bb0ec6b3SJim Harris  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26bb0ec6b3SJim Harris  * SUCH DAMAGE.
27bb0ec6b3SJim Harris  */
28bb0ec6b3SJim Harris 
29bb0ec6b3SJim Harris #include <sys/cdefs.h>
30f24c011bSWarner Losh #include "opt_cam.h"
314b3da659SWarner Losh #include "opt_nvme.h"
32f24c011bSWarner Losh 
33bb0ec6b3SJim Harris #include <sys/param.h>
347c3f19d7SJim Harris #include <sys/systm.h>
357c3f19d7SJim Harris #include <sys/buf.h>
36bb0ec6b3SJim Harris #include <sys/bus.h>
37bb0ec6b3SJim Harris #include <sys/conf.h>
38bb0ec6b3SJim Harris #include <sys/ioccom.h>
397c3f19d7SJim Harris #include <sys/proc.h>
40bb0ec6b3SJim Harris #include <sys/smp.h>
417c3f19d7SJim Harris #include <sys/uio.h>
42244b8053SWarner Losh #include <sys/sbuf.h>
430d787e9bSWojciech Macek #include <sys/endian.h>
44244b8053SWarner Losh #include <machine/stdarg.h>
451eab19cbSAlexander Motin #include <vm/vm.h>
46bb0ec6b3SJim Harris 
47bb0ec6b3SJim Harris #include "nvme_private.h"
48bb0ec6b3SJim Harris 
490d787e9bSWojciech Macek #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */
50ce1ec9c1SWarner Losh 
510a0b08ccSJim Harris static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
520a0b08ccSJim Harris 						struct nvme_async_event_request *aer);
53bb0ec6b3SJim Harris 
54244b8053SWarner Losh static void
55d5fca1dcSWarner Losh nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
56d5fca1dcSWarner Losh {
57d5fca1dcSWarner Losh 	bus_barrier(ctrlr->resource, 0, rman_get_size(ctrlr->resource), flags);
58d5fca1dcSWarner Losh }
59d5fca1dcSWarner Losh 
60d5fca1dcSWarner Losh static void
61244b8053SWarner Losh nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
62244b8053SWarner Losh {
63244b8053SWarner Losh 	struct sbuf sb;
64244b8053SWarner Losh 	va_list ap;
65244b8053SWarner Losh 	int error;
66244b8053SWarner Losh 
674e6a434bSWarner Losh 	if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
684e6a434bSWarner Losh 		return;
69244b8053SWarner Losh 	sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
70244b8053SWarner Losh 	va_start(ap, msg);
71244b8053SWarner Losh 	sbuf_vprintf(&sb, msg, ap);
72244b8053SWarner Losh 	va_end(ap);
73244b8053SWarner Losh 	error = sbuf_finish(&sb);
74244b8053SWarner Losh 	if (error == 0)
75244b8053SWarner Losh 		printf("%s\n", sbuf_data(&sb));
76244b8053SWarner Losh 
77244b8053SWarner Losh 	sbuf_clear(&sb);
78244b8053SWarner Losh 	sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev));
79244b8053SWarner Losh 	va_start(ap, msg);
80244b8053SWarner Losh 	sbuf_vprintf(&sb, msg, ap);
81244b8053SWarner Losh 	va_end(ap);
82244b8053SWarner Losh 	sbuf_printf(&sb, "\"");
83244b8053SWarner Losh 	error = sbuf_finish(&sb);
84244b8053SWarner Losh 	if (error == 0)
85244b8053SWarner Losh 		devctl_notify("nvme", "controller", type, sbuf_data(&sb));
86244b8053SWarner Losh 	sbuf_delete(&sb);
87244b8053SWarner Losh }
88244b8053SWarner Losh 
89a965389bSScott Long static int
90bb0ec6b3SJim Harris nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
91bb0ec6b3SJim Harris {
92bb0ec6b3SJim Harris 	struct nvme_qpair	*qpair;
93bb0ec6b3SJim Harris 	uint32_t		num_entries;
94a965389bSScott Long 	int			error;
95bb0ec6b3SJim Harris 
96bb0ec6b3SJim Harris 	qpair = &ctrlr->adminq;
971eab19cbSAlexander Motin 	qpair->id = 0;
981eab19cbSAlexander Motin 	qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
991eab19cbSAlexander Motin 	qpair->domain = ctrlr->domain;
100bb0ec6b3SJim Harris 
101bb0ec6b3SJim Harris 	num_entries = NVME_ADMIN_ENTRIES;
102bb0ec6b3SJim Harris 	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
103bb0ec6b3SJim Harris 	/*
104bb0ec6b3SJim Harris 	 * If admin_entries was overridden to an invalid value, revert it
105bb0ec6b3SJim Harris 	 *  back to our default value.
106bb0ec6b3SJim Harris 	 */
107bb0ec6b3SJim Harris 	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
108bb0ec6b3SJim Harris 	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
109547d523eSJim Harris 		nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
110547d523eSJim Harris 		    "specified\n", num_entries);
111bb0ec6b3SJim Harris 		num_entries = NVME_ADMIN_ENTRIES;
112bb0ec6b3SJim Harris 	}
113bb0ec6b3SJim Harris 
114bb0ec6b3SJim Harris 	/*
115bb0ec6b3SJim Harris 	 * The admin queue's max xfer size is treated differently than the
116bb0ec6b3SJim Harris 	 *  max I/O xfer size.  16KB is sufficient here - maybe even less?
117bb0ec6b3SJim Harris 	 */
1181eab19cbSAlexander Motin 	error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
11921b6da58SJim Harris 	     ctrlr);
120a965389bSScott Long 	return (error);
121bb0ec6b3SJim Harris }
122bb0ec6b3SJim Harris 
1231eab19cbSAlexander Motin #define QP(ctrlr, c)	((c) * (ctrlr)->num_io_queues / mp_ncpus)
1241eab19cbSAlexander Motin 
125bb0ec6b3SJim Harris static int
126bb0ec6b3SJim Harris nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
127bb0ec6b3SJim Harris {
128bb0ec6b3SJim Harris 	struct nvme_qpair	*qpair;
1290d787e9bSWojciech Macek 	uint32_t		cap_lo;
1300d787e9bSWojciech Macek 	uint16_t		mqes;
1311eab19cbSAlexander Motin 	int			c, error, i, n;
1321eab19cbSAlexander Motin 	int			num_entries, num_trackers, max_entries;
133bb0ec6b3SJim Harris 
134bb0ec6b3SJim Harris 	/*
135f93b7f95SWarner Losh 	 * NVMe spec sets a hard limit of 64K max entries, but devices may
136f93b7f95SWarner Losh 	 * specify a smaller limit, so we need to check the MQES field in the
137f93b7f95SWarner Losh 	 * capabilities register. We have to cap the number of entries to the
138f93b7f95SWarner Losh 	 * current stride allows for in BAR 0/1, otherwise the remainder entries
1396e8ab671SGordon Bergling 	 * are inaccessible. MQES should reflect this, and this is just a
140f93b7f95SWarner Losh 	 * fail-safe.
141bb0ec6b3SJim Harris 	 */
142f93b7f95SWarner Losh 	max_entries =
143f93b7f95SWarner Losh 	    (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
144f93b7f95SWarner Losh 	    (1 << (ctrlr->dstrd + 1));
145f93b7f95SWarner Losh 	num_entries = NVME_IO_ENTRIES;
146f93b7f95SWarner Losh 	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
1470d787e9bSWojciech Macek 	cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
14862d2cf18SWarner Losh 	mqes = NVME_CAP_LO_MQES(cap_lo);
1490d787e9bSWojciech Macek 	num_entries = min(num_entries, mqes + 1);
150f93b7f95SWarner Losh 	num_entries = min(num_entries, max_entries);
151bb0ec6b3SJim Harris 
15221b6da58SJim Harris 	num_trackers = NVME_IO_TRACKERS;
15321b6da58SJim Harris 	TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
15421b6da58SJim Harris 
15521b6da58SJim Harris 	num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
15621b6da58SJim Harris 	num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
15721b6da58SJim Harris 	/*
158f93b7f95SWarner Losh 	 * No need to have more trackers than entries in the submit queue.  Note
159f93b7f95SWarner Losh 	 * also that for a queue size of N, we can only have (N-1) commands
160f93b7f95SWarner Losh 	 * outstanding, hence the "-1" here.
16121b6da58SJim Harris 	 */
16221b6da58SJim Harris 	num_trackers = min(num_trackers, (num_entries-1));
16321b6da58SJim Harris 
1642b647da7SJim Harris 	/*
165c02565f9SWarner Losh 	 * Our best estimate for the maximum number of I/Os that we should
1664d547561SWarner Losh 	 * normally have in flight at one time. This should be viewed as a hint,
1674d547561SWarner Losh 	 * not a hard limit and will need to be revisited when the upper layers
168c02565f9SWarner Losh 	 * of the storage system grows multi-queue support.
169c02565f9SWarner Losh 	 */
1705fff95ccSWarner Losh 	ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
171c02565f9SWarner Losh 
172bb0ec6b3SJim Harris 	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
173237d2019SJim Harris 	    M_NVME, M_ZERO | M_WAITOK);
174bb0ec6b3SJim Harris 
1751eab19cbSAlexander Motin 	for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
176bb0ec6b3SJim Harris 		qpair = &ctrlr->ioq[i];
177bb0ec6b3SJim Harris 
178bb0ec6b3SJim Harris 		/*
179bb0ec6b3SJim Harris 		 * Admin queue has ID=0. IO queues start at ID=1 -
180bb0ec6b3SJim Harris 		 *  hence the 'i+1' here.
1811eab19cbSAlexander Motin 		 */
1821eab19cbSAlexander Motin 		qpair->id = i + 1;
1831eab19cbSAlexander Motin 		if (ctrlr->num_io_queues > 1) {
1841eab19cbSAlexander Motin 			/* Find number of CPUs served by this queue. */
1851eab19cbSAlexander Motin 			for (n = 1; QP(ctrlr, c + n) == i; n++)
1861eab19cbSAlexander Motin 				;
1871eab19cbSAlexander Motin 			/* Shuffle multiple NVMe devices between CPUs. */
1881eab19cbSAlexander Motin 			qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
1891eab19cbSAlexander Motin 			qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
1901eab19cbSAlexander Motin 		} else {
1911eab19cbSAlexander Motin 			qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
1921eab19cbSAlexander Motin 			qpair->domain = ctrlr->domain;
1931eab19cbSAlexander Motin 		}
1941eab19cbSAlexander Motin 
1951eab19cbSAlexander Motin 		/*
196bb0ec6b3SJim Harris 		 * For I/O queues, use the controller-wide max_xfer_size
197bb0ec6b3SJim Harris 		 *  calculated in nvme_attach().
198bb0ec6b3SJim Harris 		 */
1991eab19cbSAlexander Motin 		error = nvme_qpair_construct(qpair, num_entries, num_trackers,
200bb0ec6b3SJim Harris 		    ctrlr);
201a965389bSScott Long 		if (error)
202a965389bSScott Long 			return (error);
203bb0ec6b3SJim Harris 
2042b647da7SJim Harris 		/*
2052b647da7SJim Harris 		 * Do not bother binding interrupts if we only have one I/O
2062b647da7SJim Harris 		 *  interrupt thread for this controller.
2072b647da7SJim Harris 		 */
208c75ad8ceSJim Harris 		if (ctrlr->num_io_queues > 1)
2091eab19cbSAlexander Motin 			bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
210bb0ec6b3SJim Harris 	}
211bb0ec6b3SJim Harris 
212bb0ec6b3SJim Harris 	return (0);
213bb0ec6b3SJim Harris }
214bb0ec6b3SJim Harris 
215232e2edbSJim Harris static void
216232e2edbSJim Harris nvme_ctrlr_fail(struct nvme_controller *ctrlr)
217232e2edbSJim Harris {
218232e2edbSJim Harris 	int i;
219232e2edbSJim Harris 
2207588c6ccSWarner Losh 	ctrlr->is_failed = true;
22171a28181SAlexander Motin 	nvme_admin_qpair_disable(&ctrlr->adminq);
222232e2edbSJim Harris 	nvme_qpair_fail(&ctrlr->adminq);
223824073fbSWarner Losh 	if (ctrlr->ioq != NULL) {
22471a28181SAlexander Motin 		for (i = 0; i < ctrlr->num_io_queues; i++) {
22571a28181SAlexander Motin 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
226232e2edbSJim Harris 			nvme_qpair_fail(&ctrlr->ioq[i]);
227824073fbSWarner Losh 		}
22871a28181SAlexander Motin 	}
229232e2edbSJim Harris 	nvme_notify_fail_consumers(ctrlr);
230232e2edbSJim Harris }
231232e2edbSJim Harris 
232232e2edbSJim Harris void
233232e2edbSJim Harris nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
234232e2edbSJim Harris     struct nvme_request *req)
235232e2edbSJim Harris {
236232e2edbSJim Harris 
237a90b8104SJim Harris 	mtx_lock(&ctrlr->lock);
238232e2edbSJim Harris 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
239a90b8104SJim Harris 	mtx_unlock(&ctrlr->lock);
240502dc84aSWarner Losh 	if (!ctrlr->is_dying)
241232e2edbSJim Harris 		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
242232e2edbSJim Harris }
243232e2edbSJim Harris 
244232e2edbSJim Harris static void
245232e2edbSJim Harris nvme_ctrlr_fail_req_task(void *arg, int pending)
246232e2edbSJim Harris {
247232e2edbSJim Harris 	struct nvme_controller	*ctrlr = arg;
248232e2edbSJim Harris 	struct nvme_request	*req;
249232e2edbSJim Harris 
250a90b8104SJim Harris 	mtx_lock(&ctrlr->lock);
251c252f637SAlexander Motin 	while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
252232e2edbSJim Harris 		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
253c252f637SAlexander Motin 		mtx_unlock(&ctrlr->lock);
254232e2edbSJim Harris 		nvme_qpair_manual_complete_request(req->qpair, req,
2552ffd6fceSWarner Losh 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
256c252f637SAlexander Motin 		mtx_lock(&ctrlr->lock);
257232e2edbSJim Harris 	}
258a90b8104SJim Harris 	mtx_unlock(&ctrlr->lock);
259232e2edbSJim Harris }
260232e2edbSJim Harris 
26183581511SWarner Losh /*
26283581511SWarner Losh  * Wait for RDY to change.
26383581511SWarner Losh  *
26483581511SWarner Losh  * Starts sleeping for 1us and geometrically increases it the longer we wait,
26583581511SWarner Losh  * capped at 1ms.
26683581511SWarner Losh  */
267bb0ec6b3SJim Harris static int
268cbdec09cSJim Harris nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
269bb0ec6b3SJim Harris {
27026259f6aSWarner Losh 	int timeout = ticks + MSEC_2_TICKS(ctrlr->ready_timeout_in_ms);
27183581511SWarner Losh 	sbintime_t delta_t = SBT_1US;
2720d787e9bSWojciech Macek 	uint32_t csts;
273bb0ec6b3SJim Harris 
27471a28181SAlexander Motin 	while (1) {
27571a28181SAlexander Motin 		csts = nvme_mmio_read_4(ctrlr, csts);
2769600aa31SWarner Losh 		if (csts == NVME_GONE)		/* Hot unplug. */
27771a28181SAlexander Motin 			return (ENXIO);
27871a28181SAlexander Motin 		if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)
27971a28181SAlexander Motin 		    == desired_val)
28071a28181SAlexander Motin 			break;
2814fbbe523SAlexander Motin 		if (timeout - ticks < 0) {
282cbdec09cSJim Harris 			nvme_printf(ctrlr, "controller ready did not become %d "
283cbdec09cSJim Harris 			    "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
284bb0ec6b3SJim Harris 			return (ENXIO);
285bb0ec6b3SJim Harris 		}
28683581511SWarner Losh 
28783581511SWarner Losh 		pause_sbt("nvmerdy", delta_t, 0, C_PREL(1));
28883581511SWarner Losh 		delta_t = min(SBT_1MS, delta_t * 3 / 2);
289bb0ec6b3SJim Harris 	}
290bb0ec6b3SJim Harris 
291bb0ec6b3SJim Harris 	return (0);
292bb0ec6b3SJim Harris }
293bb0ec6b3SJim Harris 
294ce1ec9c1SWarner Losh static int
295bb0ec6b3SJim Harris nvme_ctrlr_disable(struct nvme_controller *ctrlr)
296bb0ec6b3SJim Harris {
2970d787e9bSWojciech Macek 	uint32_t cc;
2980d787e9bSWojciech Macek 	uint32_t csts;
2990d787e9bSWojciech Macek 	uint8_t  en, rdy;
300ce1ec9c1SWarner Losh 	int err;
301bb0ec6b3SJim Harris 
3020d787e9bSWojciech Macek 	cc = nvme_mmio_read_4(ctrlr, cc);
3030d787e9bSWojciech Macek 	csts = nvme_mmio_read_4(ctrlr, csts);
3040d787e9bSWojciech Macek 
3050d787e9bSWojciech Macek 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
3060d787e9bSWojciech Macek 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
307bb0ec6b3SJim Harris 
308ce1ec9c1SWarner Losh 	/*
309ce1ec9c1SWarner Losh 	 * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1
310ce1ec9c1SWarner Losh 	 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when
311ce1ec9c1SWarner Losh 	 * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY
312ce1ec9c1SWarner Losh 	 * isn't the desired value. Short circuit if we're already disabled.
313ce1ec9c1SWarner Losh 	 */
314a245627aSWarner Losh 	if (en == 0) {
315a245627aSWarner Losh 		/* Wait for RDY == 0 or timeout & fail */
316a245627aSWarner Losh 		if (rdy == 0)
317a245627aSWarner Losh 			return (0);
318a245627aSWarner Losh 		return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
319a245627aSWarner Losh 	}
3200d787e9bSWojciech Macek 	if (rdy == 0) {
321a245627aSWarner Losh 		/* EN == 1, wait for  RDY == 1 or timeout & fail */
322ce1ec9c1SWarner Losh 		err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
323ce1ec9c1SWarner Losh 		if (err != 0)
324ce1ec9c1SWarner Losh 			return (err);
325ce1ec9c1SWarner Losh 	}
326bb0ec6b3SJim Harris 
3270d787e9bSWojciech Macek 	cc &= ~NVME_CC_REG_EN_MASK;
3280d787e9bSWojciech Macek 	nvme_mmio_write_4(ctrlr, cc, cc);
32977054a89SWarner Losh 
330ce1ec9c1SWarner Losh 	/*
33177054a89SWarner Losh 	 * A few drives have firmware bugs that freeze the drive if we access
33277054a89SWarner Losh 	 * the mmio too soon after we disable.
333ce1ec9c1SWarner Losh 	 */
334989c7f0bSWarner Losh 	if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
33526259f6aSWarner Losh 		pause("nvmeR", MSEC_2_TICKS(B4_CHK_RDY_DELAY_MS));
336ce1ec9c1SWarner Losh 	return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
337bb0ec6b3SJim Harris }
338bb0ec6b3SJim Harris 
339bb0ec6b3SJim Harris static int
340bb0ec6b3SJim Harris nvme_ctrlr_enable(struct nvme_controller *ctrlr)
341bb0ec6b3SJim Harris {
3420d787e9bSWojciech Macek 	uint32_t	cc;
3430d787e9bSWojciech Macek 	uint32_t	csts;
3440d787e9bSWojciech Macek 	uint32_t	aqa;
3450d787e9bSWojciech Macek 	uint32_t	qsize;
3460d787e9bSWojciech Macek 	uint8_t		en, rdy;
347ce1ec9c1SWarner Losh 	int		err;
348bb0ec6b3SJim Harris 
3490d787e9bSWojciech Macek 	cc = nvme_mmio_read_4(ctrlr, cc);
3500d787e9bSWojciech Macek 	csts = nvme_mmio_read_4(ctrlr, csts);
3510d787e9bSWojciech Macek 
3520d787e9bSWojciech Macek 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
3530d787e9bSWojciech Macek 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
354bb0ec6b3SJim Harris 
355ce1ec9c1SWarner Losh 	/*
356ce1ec9c1SWarner Losh 	 * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
357ce1ec9c1SWarner Losh 	 */
3580d787e9bSWojciech Macek 	if (en == 1) {
3590d787e9bSWojciech Macek 		if (rdy == 1)
360bb0ec6b3SJim Harris 			return (0);
361cbdec09cSJim Harris 		return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
362a245627aSWarner Losh 	}
363a245627aSWarner Losh 
364a245627aSWarner Losh 	/* EN == 0 already wait for RDY == 0 or timeout & fail */
365ce1ec9c1SWarner Losh 	err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
366ce1ec9c1SWarner Losh 	if (err != 0)
367ce1ec9c1SWarner Losh 		return (err);
368bb0ec6b3SJim Harris 
369bb0ec6b3SJim Harris 	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
370bb0ec6b3SJim Harris 	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
371bb0ec6b3SJim Harris 
372bb0ec6b3SJim Harris 	/* acqs and asqs are 0-based. */
3730d787e9bSWojciech Macek 	qsize = ctrlr->adminq.num_entries - 1;
3740d787e9bSWojciech Macek 
3750d787e9bSWojciech Macek 	aqa = 0;
3760d787e9bSWojciech Macek 	aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
3770d787e9bSWojciech Macek 	aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
3780d787e9bSWojciech Macek 	nvme_mmio_write_4(ctrlr, aqa, aqa);
379bb0ec6b3SJim Harris 
3800d787e9bSWojciech Macek 	/* Initialization values for CC */
3810d787e9bSWojciech Macek 	cc = 0;
3820d787e9bSWojciech Macek 	cc |= 1 << NVME_CC_REG_EN_SHIFT;
3830d787e9bSWojciech Macek 	cc |= 0 << NVME_CC_REG_CSS_SHIFT;
3840d787e9bSWojciech Macek 	cc |= 0 << NVME_CC_REG_AMS_SHIFT;
3850d787e9bSWojciech Macek 	cc |= 0 << NVME_CC_REG_SHN_SHIFT;
3860d787e9bSWojciech Macek 	cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
3870d787e9bSWojciech Macek 	cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
388bb0ec6b3SJim Harris 
3893a468f20SWarner Losh 	/*
3903a468f20SWarner Losh 	 * Use the Memory Page Size selected during device initialization.  Note
3913a468f20SWarner Losh 	 * that value stored in mps is suitable to use here without adjusting by
3923a468f20SWarner Losh 	 * NVME_MPS_SHIFT.
3933a468f20SWarner Losh 	 */
3943a468f20SWarner Losh 	cc |= ctrlr->mps << NVME_CC_REG_MPS_SHIFT;
395bb0ec6b3SJim Harris 
396d5fca1dcSWarner Losh 	nvme_ctrlr_barrier(ctrlr, BUS_SPACE_BARRIER_WRITE);
3970d787e9bSWojciech Macek 	nvme_mmio_write_4(ctrlr, cc, cc);
398bb0ec6b3SJim Harris 
399cbdec09cSJim Harris 	return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
400bb0ec6b3SJim Harris }
401bb0ec6b3SJim Harris 
4024d547561SWarner Losh static void
4034d547561SWarner Losh nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
404bb0ec6b3SJim Harris {
4054d547561SWarner Losh 	int i;
406b846efd7SJim Harris 
407b846efd7SJim Harris 	nvme_admin_qpair_disable(&ctrlr->adminq);
4082b647da7SJim Harris 	/*
4092b647da7SJim Harris 	 * I/O queues are not allocated before the initial HW
4102b647da7SJim Harris 	 *  reset, so do not try to disable them.  Use is_initialized
4112b647da7SJim Harris 	 *  to determine if this is the initial HW reset.
4122b647da7SJim Harris 	 */
4132b647da7SJim Harris 	if (ctrlr->is_initialized) {
414b846efd7SJim Harris 		for (i = 0; i < ctrlr->num_io_queues; i++)
415b846efd7SJim Harris 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
4162b647da7SJim Harris 	}
4174d547561SWarner Losh }
4184d547561SWarner Losh 
419*8052b01eSWarner Losh static void
420*8052b01eSWarner Losh nvme_pre_reset(struct nvme_controller *ctrlr)
421*8052b01eSWarner Losh {
422*8052b01eSWarner Losh 	/*
423*8052b01eSWarner Losh 	 * Make sure that all the ISRs are done before proceeding with the reset
424*8052b01eSWarner Losh 	 * (and also keep any stray interrupts that happen during this process
425*8052b01eSWarner Losh 	 * from racing this process). For startup, this is a nop, since the
426*8052b01eSWarner Losh 	 * hardware is in a good state. But for recovery, where we randomly
427*8052b01eSWarner Losh 	 * reset the hardware, this ensure that we're not racing the ISRs.
428*8052b01eSWarner Losh 	 */
429*8052b01eSWarner Losh 	mtx_lock(&ctrlr->adminq.recovery);
430*8052b01eSWarner Losh 	for (int i = 0; i < ctrlr->num_io_queues; i++) {
431*8052b01eSWarner Losh 		mtx_lock(&ctrlr->ioq[i].recovery);
432*8052b01eSWarner Losh 	}
433*8052b01eSWarner Losh }
434*8052b01eSWarner Losh 
435*8052b01eSWarner Losh static void
436*8052b01eSWarner Losh nvme_post_reset(struct nvme_controller *ctrlr)
437*8052b01eSWarner Losh {
438*8052b01eSWarner Losh 	/*
439*8052b01eSWarner Losh 	 * Reset complete, unblock ISRs
440*8052b01eSWarner Losh 	 */
441*8052b01eSWarner Losh 	mtx_unlock(&ctrlr->adminq.recovery);
442*8052b01eSWarner Losh 	for (int i = 0; i < ctrlr->num_io_queues; i++) {
443*8052b01eSWarner Losh 		mtx_unlock(&ctrlr->ioq[i].recovery);
444*8052b01eSWarner Losh 	}
445*8052b01eSWarner Losh }
446*8052b01eSWarner Losh 
447dd2516fcSWarner Losh static int
4484d547561SWarner Losh nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
4494d547561SWarner Losh {
4504d547561SWarner Losh 	int err;
4514d547561SWarner Losh 
452bad42df9SColin Percival 	TSENTER();
453b846efd7SJim Harris 
454e5e26e4aSWarner Losh 	nvme_ctrlr_disable_qpairs(ctrlr);
455bb0ec6b3SJim Harris 
456ce1ec9c1SWarner Losh 	err = nvme_ctrlr_disable(ctrlr);
457ce1ec9c1SWarner Losh 	if (err != 0)
458*8052b01eSWarner Losh 		goto out;
459e5e26e4aSWarner Losh 
460bad42df9SColin Percival 	err = nvme_ctrlr_enable(ctrlr);
461*8052b01eSWarner Losh out:
462*8052b01eSWarner Losh 
463bad42df9SColin Percival 	TSEXIT();
464bad42df9SColin Percival 	return (err);
465bb0ec6b3SJim Harris }
466bb0ec6b3SJim Harris 
467b846efd7SJim Harris void
468b846efd7SJim Harris nvme_ctrlr_reset(struct nvme_controller *ctrlr)
469b846efd7SJim Harris {
470f37c22a3SJim Harris 	int cmpset;
471f37c22a3SJim Harris 
472f37c22a3SJim Harris 	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
473f37c22a3SJim Harris 
474232e2edbSJim Harris 	if (cmpset == 0 || ctrlr->is_failed)
475232e2edbSJim Harris 		/*
476232e2edbSJim Harris 		 * Controller is already resetting or has failed.  Return
477232e2edbSJim Harris 		 *  immediately since there is no need to kick off another
478232e2edbSJim Harris 		 *  reset in these cases.
479232e2edbSJim Harris 		 */
480f37c22a3SJim Harris 		return;
481b846efd7SJim Harris 
482502dc84aSWarner Losh 	if (!ctrlr->is_dying)
48348ce3178SJim Harris 		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
484b846efd7SJim Harris }
485b846efd7SJim Harris 
486bb0ec6b3SJim Harris static int
487bb0ec6b3SJim Harris nvme_ctrlr_identify(struct nvme_controller *ctrlr)
488bb0ec6b3SJim Harris {
489955910a9SJim Harris 	struct nvme_completion_poll_status	status;
490bb0ec6b3SJim Harris 
49129077eb4SWarner Losh 	status.done = 0;
492bb0ec6b3SJim Harris 	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
493955910a9SJim Harris 	    nvme_completion_poll_cb, &status);
494ab0681aaSWarner Losh 	nvme_completion_poll(&status);
495955910a9SJim Harris 	if (nvme_completion_is_error(&status.cpl)) {
496547d523eSJim Harris 		nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
497bb0ec6b3SJim Harris 		return (ENXIO);
498bb0ec6b3SJim Harris 	}
499bb0ec6b3SJim Harris 
5000d787e9bSWojciech Macek 	/* Convert data to host endian */
5010d787e9bSWojciech Macek 	nvme_controller_data_swapbytes(&ctrlr->cdata);
5020d787e9bSWojciech Macek 
50302e33484SJim Harris 	/*
50402e33484SJim Harris 	 * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
50502e33484SJim Harris 	 *  controller supports.
50602e33484SJim Harris 	 */
50702e33484SJim Harris 	if (ctrlr->cdata.mdts > 0)
50802e33484SJim Harris 		ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
5096e3deec8SWarner Losh 		    1 << (ctrlr->cdata.mdts + NVME_MPS_SHIFT +
5106e3deec8SWarner Losh 			NVME_CAP_HI_MPSMIN(ctrlr->cap_hi)));
51102e33484SJim Harris 
512bb0ec6b3SJim Harris 	return (0);
513bb0ec6b3SJim Harris }
514bb0ec6b3SJim Harris 
515bb0ec6b3SJim Harris static int
516bb0ec6b3SJim Harris nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
517bb0ec6b3SJim Harris {
518955910a9SJim Harris 	struct nvme_completion_poll_status	status;
5192b647da7SJim Harris 	int					cq_allocated, sq_allocated;
520bb0ec6b3SJim Harris 
52129077eb4SWarner Losh 	status.done = 0;
522bb0ec6b3SJim Harris 	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
523955910a9SJim Harris 	    nvme_completion_poll_cb, &status);
524ab0681aaSWarner Losh 	nvme_completion_poll(&status);
525955910a9SJim Harris 	if (nvme_completion_is_error(&status.cpl)) {
526824073fbSWarner Losh 		nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
527bb0ec6b3SJim Harris 		return (ENXIO);
528bb0ec6b3SJim Harris 	}
529bb0ec6b3SJim Harris 
530bb0ec6b3SJim Harris 	/*
531bb0ec6b3SJim Harris 	 * Data in cdw0 is 0-based.
532bb0ec6b3SJim Harris 	 * Lower 16-bits indicate number of submission queues allocated.
533bb0ec6b3SJim Harris 	 * Upper 16-bits indicate number of completion queues allocated.
534bb0ec6b3SJim Harris 	 */
535955910a9SJim Harris 	sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
536955910a9SJim Harris 	cq_allocated = (status.cpl.cdw0 >> 16) + 1;
537bb0ec6b3SJim Harris 
538bb0ec6b3SJim Harris 	/*
5392b647da7SJim Harris 	 * Controller may allocate more queues than we requested,
5402b647da7SJim Harris 	 *  so use the minimum of the number requested and what was
5412b647da7SJim Harris 	 *  actually allocated.
542bb0ec6b3SJim Harris 	 */
5432b647da7SJim Harris 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
5442b647da7SJim Harris 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
5451eab19cbSAlexander Motin 	if (ctrlr->num_io_queues > vm_ndomains)
5461eab19cbSAlexander Motin 		ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
547bb0ec6b3SJim Harris 
548bb0ec6b3SJim Harris 	return (0);
549bb0ec6b3SJim Harris }
550bb0ec6b3SJim Harris 
551bb0ec6b3SJim Harris static int
552bb0ec6b3SJim Harris nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
553bb0ec6b3SJim Harris {
554955910a9SJim Harris 	struct nvme_completion_poll_status	status;
555bb0ec6b3SJim Harris 	struct nvme_qpair			*qpair;
556955910a9SJim Harris 	int					i;
557bb0ec6b3SJim Harris 
558bb0ec6b3SJim Harris 	for (i = 0; i < ctrlr->num_io_queues; i++) {
559bb0ec6b3SJim Harris 		qpair = &ctrlr->ioq[i];
560bb0ec6b3SJim Harris 
56129077eb4SWarner Losh 		status.done = 0;
5621eab19cbSAlexander Motin 		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
563955910a9SJim Harris 		    nvme_completion_poll_cb, &status);
564ab0681aaSWarner Losh 		nvme_completion_poll(&status);
565955910a9SJim Harris 		if (nvme_completion_is_error(&status.cpl)) {
566547d523eSJim Harris 			nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
567bb0ec6b3SJim Harris 			return (ENXIO);
568bb0ec6b3SJim Harris 		}
569bb0ec6b3SJim Harris 
57029077eb4SWarner Losh 		status.done = 0;
571ead7e103SAlexander Motin 		nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
572955910a9SJim Harris 		    nvme_completion_poll_cb, &status);
573ab0681aaSWarner Losh 		nvme_completion_poll(&status);
574955910a9SJim Harris 		if (nvme_completion_is_error(&status.cpl)) {
575547d523eSJim Harris 			nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
576bb0ec6b3SJim Harris 			return (ENXIO);
577bb0ec6b3SJim Harris 		}
578bb0ec6b3SJim Harris 	}
579bb0ec6b3SJim Harris 
580bb0ec6b3SJim Harris 	return (0);
581bb0ec6b3SJim Harris }
582bb0ec6b3SJim Harris 
583bb0ec6b3SJim Harris static int
5844d547561SWarner Losh nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
5858b1e6ebeSWarner Losh {
5868b1e6ebeSWarner Losh 	struct nvme_completion_poll_status	status;
5879835d216SWarner Losh 	struct nvme_qpair			*qpair;
5889835d216SWarner Losh 
5899835d216SWarner Losh 	for (int i = 0; i < ctrlr->num_io_queues; i++) {
5909835d216SWarner Losh 		qpair = &ctrlr->ioq[i];
5918b1e6ebeSWarner Losh 
5928b1e6ebeSWarner Losh 		status.done = 0;
5935d7fd8f7SWarner Losh 		nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
5948b1e6ebeSWarner Losh 		    nvme_completion_poll_cb, &status);
595ab0681aaSWarner Losh 		nvme_completion_poll(&status);
5968b1e6ebeSWarner Losh 		if (nvme_completion_is_error(&status.cpl)) {
5975d7fd8f7SWarner Losh 			nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
5988b1e6ebeSWarner Losh 			return (ENXIO);
5998b1e6ebeSWarner Losh 		}
6008b1e6ebeSWarner Losh 
6018b1e6ebeSWarner Losh 		status.done = 0;
6028b1e6ebeSWarner Losh 		nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
6038b1e6ebeSWarner Losh 		    nvme_completion_poll_cb, &status);
604ab0681aaSWarner Losh 		nvme_completion_poll(&status);
6058b1e6ebeSWarner Losh 		if (nvme_completion_is_error(&status.cpl)) {
6065d7fd8f7SWarner Losh 			nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
6078b1e6ebeSWarner Losh 			return (ENXIO);
6088b1e6ebeSWarner Losh 		}
6099835d216SWarner Losh 	}
6108b1e6ebeSWarner Losh 
6118b1e6ebeSWarner Losh 	return (0);
6128b1e6ebeSWarner Losh }
6138b1e6ebeSWarner Losh 
6148b1e6ebeSWarner Losh static int
615bb0ec6b3SJim Harris nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
616bb0ec6b3SJim Harris {
617bb0ec6b3SJim Harris 	struct nvme_namespace	*ns;
618696c9502SWarner Losh 	uint32_t 		i;
619bb0ec6b3SJim Harris 
620a8a18dd5SWarner Losh 	for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
621bb0ec6b3SJim Harris 		ns = &ctrlr->ns[i];
622a8a18dd5SWarner Losh 		nvme_ns_construct(ns, i+1, ctrlr);
623bb0ec6b3SJim Harris 	}
624bb0ec6b3SJim Harris 
625bb0ec6b3SJim Harris 	return (0);
626bb0ec6b3SJim Harris }
627bb0ec6b3SJim Harris 
6287588c6ccSWarner Losh static bool
6292868353aSJim Harris is_log_page_id_valid(uint8_t page_id)
6302868353aSJim Harris {
6312868353aSJim Harris 
6322868353aSJim Harris 	switch (page_id) {
6332868353aSJim Harris 	case NVME_LOG_ERROR:
6342868353aSJim Harris 	case NVME_LOG_HEALTH_INFORMATION:
6352868353aSJim Harris 	case NVME_LOG_FIRMWARE_SLOT:
636f439e3a4SAlexander Motin 	case NVME_LOG_CHANGED_NAMESPACE:
6376c99d132SAlexander Motin 	case NVME_LOG_COMMAND_EFFECT:
6386c99d132SAlexander Motin 	case NVME_LOG_RES_NOTIFICATION:
6396c99d132SAlexander Motin 	case NVME_LOG_SANITIZE_STATUS:
6407588c6ccSWarner Losh 		return (true);
6412868353aSJim Harris 	}
6422868353aSJim Harris 
6437588c6ccSWarner Losh 	return (false);
6442868353aSJim Harris }
6452868353aSJim Harris 
6462868353aSJim Harris static uint32_t
6472868353aSJim Harris nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
6482868353aSJim Harris {
6492868353aSJim Harris 	uint32_t	log_page_size;
6502868353aSJim Harris 
6512868353aSJim Harris 	switch (page_id) {
6522868353aSJim Harris 	case NVME_LOG_ERROR:
6532868353aSJim Harris 		log_page_size = min(
6542868353aSJim Harris 		    sizeof(struct nvme_error_information_entry) *
6550d787e9bSWojciech Macek 		    (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
6562868353aSJim Harris 		break;
6572868353aSJim Harris 	case NVME_LOG_HEALTH_INFORMATION:
6582868353aSJim Harris 		log_page_size = sizeof(struct nvme_health_information_page);
6592868353aSJim Harris 		break;
6602868353aSJim Harris 	case NVME_LOG_FIRMWARE_SLOT:
6612868353aSJim Harris 		log_page_size = sizeof(struct nvme_firmware_page);
6622868353aSJim Harris 		break;
663f439e3a4SAlexander Motin 	case NVME_LOG_CHANGED_NAMESPACE:
664f439e3a4SAlexander Motin 		log_page_size = sizeof(struct nvme_ns_list);
665f439e3a4SAlexander Motin 		break;
6666c99d132SAlexander Motin 	case NVME_LOG_COMMAND_EFFECT:
6676c99d132SAlexander Motin 		log_page_size = sizeof(struct nvme_command_effects_page);
6686c99d132SAlexander Motin 		break;
6696c99d132SAlexander Motin 	case NVME_LOG_RES_NOTIFICATION:
6706c99d132SAlexander Motin 		log_page_size = sizeof(struct nvme_res_notification_page);
6716c99d132SAlexander Motin 		break;
6726c99d132SAlexander Motin 	case NVME_LOG_SANITIZE_STATUS:
6736c99d132SAlexander Motin 		log_page_size = sizeof(struct nvme_sanitize_status_page);
6746c99d132SAlexander Motin 		break;
6752868353aSJim Harris 	default:
6762868353aSJim Harris 		log_page_size = 0;
6772868353aSJim Harris 		break;
6782868353aSJim Harris 	}
6792868353aSJim Harris 
6802868353aSJim Harris 	return (log_page_size);
6812868353aSJim Harris }
6822868353aSJim Harris 
6832868353aSJim Harris static void
684bb2f67fdSJim Harris nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
6850d787e9bSWojciech Macek     uint8_t state)
686bb2f67fdSJim Harris {
687bb2f67fdSJim Harris 
6880d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
689244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
690244b8053SWarner Losh 		    "available spare space below threshold");
691bb2f67fdSJim Harris 
6920d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
693244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
694244b8053SWarner Losh 		    "temperature above threshold");
695bb2f67fdSJim Harris 
6960d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
697244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
698244b8053SWarner Losh 		    "device reliability degraded");
699bb2f67fdSJim Harris 
7000d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_READ_ONLY)
701244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
702244b8053SWarner Losh 		    "media placed in read only mode");
703bb2f67fdSJim Harris 
7040d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
705244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
706244b8053SWarner Losh 		    "volatile memory backup device failed");
707bb2f67fdSJim Harris 
7080d787e9bSWojciech Macek 	if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
709244b8053SWarner Losh 		nvme_ctrlr_devctl_log(ctrlr, "critical",
710244b8053SWarner Losh 		    "unknown critical warning(s): state = 0x%02x", state);
711bb2f67fdSJim Harris }
712bb2f67fdSJim Harris 
713bb2f67fdSJim Harris static void
7142868353aSJim Harris nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
7152868353aSJim Harris {
7162868353aSJim Harris 	struct nvme_async_event_request		*aer = arg;
717bb2f67fdSJim Harris 	struct nvme_health_information_page	*health_info;
718f439e3a4SAlexander Motin 	struct nvme_ns_list			*nsl;
7190d787e9bSWojciech Macek 	struct nvme_error_information_entry	*err;
7200d787e9bSWojciech Macek 	int i;
7212868353aSJim Harris 
7220d7e13ecSJim Harris 	/*
7230d7e13ecSJim Harris 	 * If the log page fetch for some reason completed with an error,
7240d7e13ecSJim Harris 	 *  don't pass log page data to the consumers.  In practice, this case
7250d7e13ecSJim Harris 	 *  should never happen.
7260d7e13ecSJim Harris 	 */
7270d7e13ecSJim Harris 	if (nvme_completion_is_error(cpl))
7280d7e13ecSJim Harris 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
7290d7e13ecSJim Harris 		    aer->log_page_id, NULL, 0);
730bb2f67fdSJim Harris 	else {
7310d787e9bSWojciech Macek 		/* Convert data to host endian */
7320d787e9bSWojciech Macek 		switch (aer->log_page_id) {
7330d787e9bSWojciech Macek 		case NVME_LOG_ERROR:
7340d787e9bSWojciech Macek 			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
7350d787e9bSWojciech Macek 			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
7360d787e9bSWojciech Macek 				nvme_error_information_entry_swapbytes(err++);
7370d787e9bSWojciech Macek 			break;
7380d787e9bSWojciech Macek 		case NVME_LOG_HEALTH_INFORMATION:
7390d787e9bSWojciech Macek 			nvme_health_information_page_swapbytes(
7400d787e9bSWojciech Macek 			    (struct nvme_health_information_page *)aer->log_page_buffer);
7410d787e9bSWojciech Macek 			break;
7420d787e9bSWojciech Macek 		case NVME_LOG_FIRMWARE_SLOT:
7430d787e9bSWojciech Macek 			nvme_firmware_page_swapbytes(
7440d787e9bSWojciech Macek 			    (struct nvme_firmware_page *)aer->log_page_buffer);
7450d787e9bSWojciech Macek 			break;
746f439e3a4SAlexander Motin 		case NVME_LOG_CHANGED_NAMESPACE:
747f439e3a4SAlexander Motin 			nvme_ns_list_swapbytes(
748f439e3a4SAlexander Motin 			    (struct nvme_ns_list *)aer->log_page_buffer);
749f439e3a4SAlexander Motin 			break;
7506c99d132SAlexander Motin 		case NVME_LOG_COMMAND_EFFECT:
7516c99d132SAlexander Motin 			nvme_command_effects_page_swapbytes(
7526c99d132SAlexander Motin 			    (struct nvme_command_effects_page *)aer->log_page_buffer);
7536c99d132SAlexander Motin 			break;
7546c99d132SAlexander Motin 		case NVME_LOG_RES_NOTIFICATION:
7556c99d132SAlexander Motin 			nvme_res_notification_page_swapbytes(
7566c99d132SAlexander Motin 			    (struct nvme_res_notification_page *)aer->log_page_buffer);
7576c99d132SAlexander Motin 			break;
7586c99d132SAlexander Motin 		case NVME_LOG_SANITIZE_STATUS:
7596c99d132SAlexander Motin 			nvme_sanitize_status_page_swapbytes(
7606c99d132SAlexander Motin 			    (struct nvme_sanitize_status_page *)aer->log_page_buffer);
7616c99d132SAlexander Motin 			break;
7620d787e9bSWojciech Macek 		case INTEL_LOG_TEMP_STATS:
7630d787e9bSWojciech Macek 			intel_log_temp_stats_swapbytes(
7640d787e9bSWojciech Macek 			    (struct intel_log_temp_stats *)aer->log_page_buffer);
7650d787e9bSWojciech Macek 			break;
7660d787e9bSWojciech Macek 		default:
7670d787e9bSWojciech Macek 			break;
7680d787e9bSWojciech Macek 		}
7690d787e9bSWojciech Macek 
770bb2f67fdSJim Harris 		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
771bb2f67fdSJim Harris 			health_info = (struct nvme_health_information_page *)
772bb2f67fdSJim Harris 			    aer->log_page_buffer;
773bb2f67fdSJim Harris 			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
774bb2f67fdSJim Harris 			    health_info->critical_warning);
775bb2f67fdSJim Harris 			/*
776bb2f67fdSJim Harris 			 * Critical warnings reported through the
777bb2f67fdSJim Harris 			 *  SMART/health log page are persistent, so
778bb2f67fdSJim Harris 			 *  clear the associated bits in the async event
779bb2f67fdSJim Harris 			 *  config so that we do not receive repeated
780bb2f67fdSJim Harris 			 *  notifications for the same event.
781bb2f67fdSJim Harris 			 */
7820d787e9bSWojciech Macek 			aer->ctrlr->async_event_config &=
7830d787e9bSWojciech Macek 			    ~health_info->critical_warning;
784bb2f67fdSJim Harris 			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
785bb2f67fdSJim Harris 			    aer->ctrlr->async_event_config, NULL, NULL);
786f439e3a4SAlexander Motin 		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
787f439e3a4SAlexander Motin 		    !nvme_use_nvd) {
788f439e3a4SAlexander Motin 			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
789f439e3a4SAlexander Motin 			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
790f439e3a4SAlexander Motin 				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
791f439e3a4SAlexander Motin 					break;
792f439e3a4SAlexander Motin 				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
793f439e3a4SAlexander Motin 			}
794bb2f67fdSJim Harris 		}
795bb2f67fdSJim Harris 
7960d7e13ecSJim Harris 		/*
7970d7e13ecSJim Harris 		 * Pass the cpl data from the original async event completion,
7980d7e13ecSJim Harris 		 *  not the log page fetch.
7990d7e13ecSJim Harris 		 */
8000d7e13ecSJim Harris 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
8010d7e13ecSJim Harris 		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
802bb2f67fdSJim Harris 	}
8032868353aSJim Harris 
8042868353aSJim Harris 	/*
8052868353aSJim Harris 	 * Repost another asynchronous event request to replace the one
8062868353aSJim Harris 	 *  that just completed.
8072868353aSJim Harris 	 */
8082868353aSJim Harris 	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
8092868353aSJim Harris }
8102868353aSJim Harris 
811bb0ec6b3SJim Harris static void
8120a0b08ccSJim Harris nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
8130a0b08ccSJim Harris {
8140a0b08ccSJim Harris 	struct nvme_async_event_request	*aer = arg;
8150a0b08ccSJim Harris 
816ec526ea9SJim Harris 	if (nvme_completion_is_error(cpl)) {
8170a0b08ccSJim Harris 		/*
818ec526ea9SJim Harris 		 *  Do not retry failed async event requests.  This avoids
819ec526ea9SJim Harris 		 *  infinite loops where a new async event request is submitted
820ec526ea9SJim Harris 		 *  to replace the one just failed, only to fail again and
821ec526ea9SJim Harris 		 *  perpetuate the loop.
8220a0b08ccSJim Harris 		 */
8230a0b08ccSJim Harris 		return;
8240a0b08ccSJim Harris 	}
8250a0b08ccSJim Harris 
8262868353aSJim Harris 	/* Associated log page is in bits 23:16 of completion entry dw0. */
8270d7e13ecSJim Harris 	aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
8282868353aSJim Harris 
829f439e3a4SAlexander Motin 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
830a6d222ebSAlexander Motin 	    " page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8,
831547d523eSJim Harris 	    aer->log_page_id);
832547d523eSJim Harris 
8330d7e13ecSJim Harris 	if (is_log_page_id_valid(aer->log_page_id)) {
8342868353aSJim Harris 		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
8350d7e13ecSJim Harris 		    aer->log_page_id);
8362868353aSJim Harris 		memcpy(&aer->cpl, cpl, sizeof(*cpl));
8370d7e13ecSJim Harris 		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
8382868353aSJim Harris 		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
8392868353aSJim Harris 		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
8402868353aSJim Harris 		    aer);
8412868353aSJim Harris 		/* Wait to notify consumers until after log page is fetched. */
8422868353aSJim Harris 	} else {
8430d7e13ecSJim Harris 		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
8440d7e13ecSJim Harris 		    NULL, 0);
845038a5ee4SJim Harris 
8460a0b08ccSJim Harris 		/*
8472868353aSJim Harris 		 * Repost another asynchronous event request to replace the one
8482868353aSJim Harris 		 *  that just completed.
8490a0b08ccSJim Harris 		 */
8500a0b08ccSJim Harris 		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
8510a0b08ccSJim Harris 	}
8522868353aSJim Harris }
8530a0b08ccSJim Harris 
8540a0b08ccSJim Harris static void
8550a0b08ccSJim Harris nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
8560a0b08ccSJim Harris     struct nvme_async_event_request *aer)
8570a0b08ccSJim Harris {
8580a0b08ccSJim Harris 	struct nvme_request *req;
8590a0b08ccSJim Harris 
8600a0b08ccSJim Harris 	aer->ctrlr = ctrlr;
8611e526bc4SJim Harris 	req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
8620a0b08ccSJim Harris 	aer->req = req;
8630a0b08ccSJim Harris 
8640a0b08ccSJim Harris 	/*
86594143332SJim Harris 	 * Disable timeout here, since asynchronous event requests should by
86694143332SJim Harris 	 *  nature never be timed out.
8670a0b08ccSJim Harris 	 */
8687588c6ccSWarner Losh 	req->timeout = false;
8699544e6dcSChuck Tuffli 	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
8700a0b08ccSJim Harris 	nvme_ctrlr_submit_admin_request(ctrlr, req);
8710a0b08ccSJim Harris }
8720a0b08ccSJim Harris 
8730a0b08ccSJim Harris static void
874bb0ec6b3SJim Harris nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
875bb0ec6b3SJim Harris {
876d5fc9821SJim Harris 	struct nvme_completion_poll_status	status;
8770a0b08ccSJim Harris 	struct nvme_async_event_request		*aer;
8780a0b08ccSJim Harris 	uint32_t				i;
879bb0ec6b3SJim Harris 
880f439e3a4SAlexander Motin 	ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
881f439e3a4SAlexander Motin 	    NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
882f439e3a4SAlexander Motin 	    NVME_CRIT_WARN_ST_READ_ONLY |
883f439e3a4SAlexander Motin 	    NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
884f439e3a4SAlexander Motin 	if (ctrlr->cdata.ver >= NVME_REV(1, 2))
885881534f0SWarner Losh 		ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE |
886881534f0SWarner Losh 		    NVME_ASYNC_EVENT_FW_ACTIVATE;
887d5fc9821SJim Harris 
88829077eb4SWarner Losh 	status.done = 0;
889d5fc9821SJim Harris 	nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
890d5fc9821SJim Harris 	    0, NULL, 0, nvme_completion_poll_cb, &status);
891ab0681aaSWarner Losh 	nvme_completion_poll(&status);
892d5fc9821SJim Harris 	if (nvme_completion_is_error(&status.cpl) ||
893d5fc9821SJim Harris 	    (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
894d5fc9821SJim Harris 	    (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
895d5fc9821SJim Harris 		nvme_printf(ctrlr, "temperature threshold not supported\n");
896f439e3a4SAlexander Motin 	} else
897f439e3a4SAlexander Motin 		ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
898d5fc9821SJim Harris 
899bb2f67fdSJim Harris 	nvme_ctrlr_cmd_set_async_event_config(ctrlr,
900bb2f67fdSJim Harris 	    ctrlr->async_event_config, NULL, NULL);
901bb0ec6b3SJim Harris 
902bb0ec6b3SJim Harris 	/* aerl is a zero-based value, so we need to add 1 here. */
9030a0b08ccSJim Harris 	ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
904bb0ec6b3SJim Harris 
9050a0b08ccSJim Harris 	for (i = 0; i < ctrlr->num_aers; i++) {
9060a0b08ccSJim Harris 		aer = &ctrlr->aer[i];
9070a0b08ccSJim Harris 		nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
9080a0b08ccSJim Harris 	}
909bb0ec6b3SJim Harris }
910bb0ec6b3SJim Harris 
911bb0ec6b3SJim Harris static void
912bb0ec6b3SJim Harris nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
913bb0ec6b3SJim Harris {
914bb0ec6b3SJim Harris 
915bb0ec6b3SJim Harris 	ctrlr->int_coal_time = 0;
916bb0ec6b3SJim Harris 	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
917bb0ec6b3SJim Harris 	    &ctrlr->int_coal_time);
918bb0ec6b3SJim Harris 
919bb0ec6b3SJim Harris 	ctrlr->int_coal_threshold = 0;
920bb0ec6b3SJim Harris 	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
921bb0ec6b3SJim Harris 	    &ctrlr->int_coal_threshold);
922bb0ec6b3SJim Harris 
923bb0ec6b3SJim Harris 	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
924bb0ec6b3SJim Harris 	    ctrlr->int_coal_threshold, NULL, NULL);
925bb0ec6b3SJim Harris }
926bb0ec6b3SJim Harris 
927be34f216SJim Harris static void
92867abaee9SAlexander Motin nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
92967abaee9SAlexander Motin {
93067abaee9SAlexander Motin 	struct nvme_hmb_chunk *hmbc;
93167abaee9SAlexander Motin 	int i;
93267abaee9SAlexander Motin 
93367abaee9SAlexander Motin 	if (ctrlr->hmb_desc_paddr) {
93467abaee9SAlexander Motin 		bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
93567abaee9SAlexander Motin 		bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
93667abaee9SAlexander Motin 		    ctrlr->hmb_desc_map);
93767abaee9SAlexander Motin 		ctrlr->hmb_desc_paddr = 0;
93867abaee9SAlexander Motin 	}
93967abaee9SAlexander Motin 	if (ctrlr->hmb_desc_tag) {
94067abaee9SAlexander Motin 		bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
941b2cdfb72SAlexander Motin 		ctrlr->hmb_desc_tag = NULL;
94267abaee9SAlexander Motin 	}
94367abaee9SAlexander Motin 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
94467abaee9SAlexander Motin 		hmbc = &ctrlr->hmb_chunks[i];
94567abaee9SAlexander Motin 		bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
94667abaee9SAlexander Motin 		bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
94767abaee9SAlexander Motin 		    hmbc->hmbc_map);
94867abaee9SAlexander Motin 	}
94967abaee9SAlexander Motin 	ctrlr->hmb_nchunks = 0;
95067abaee9SAlexander Motin 	if (ctrlr->hmb_tag) {
95167abaee9SAlexander Motin 		bus_dma_tag_destroy(ctrlr->hmb_tag);
95267abaee9SAlexander Motin 		ctrlr->hmb_tag = NULL;
95367abaee9SAlexander Motin 	}
95467abaee9SAlexander Motin 	if (ctrlr->hmb_chunks) {
95567abaee9SAlexander Motin 		free(ctrlr->hmb_chunks, M_NVME);
95667abaee9SAlexander Motin 		ctrlr->hmb_chunks = NULL;
95767abaee9SAlexander Motin 	}
95867abaee9SAlexander Motin }
95967abaee9SAlexander Motin 
96067abaee9SAlexander Motin static void
96167abaee9SAlexander Motin nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
96267abaee9SAlexander Motin {
96367abaee9SAlexander Motin 	struct nvme_hmb_chunk *hmbc;
96467abaee9SAlexander Motin 	size_t pref, min, minc, size;
96567abaee9SAlexander Motin 	int err, i;
96667abaee9SAlexander Motin 	uint64_t max;
96767abaee9SAlexander Motin 
9681c7dd40eSAlexander Motin 	/* Limit HMB to 5% of RAM size per device by default. */
9691c7dd40eSAlexander Motin 	max = (uint64_t)physmem * PAGE_SIZE / 20;
97067abaee9SAlexander Motin 	TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
97167abaee9SAlexander Motin 
9723740a8dbSWarner Losh 	/*
9733740a8dbSWarner Losh 	 * Units of Host Memory Buffer in the Identify info are always in terms
9743740a8dbSWarner Losh 	 * of 4k units.
9753740a8dbSWarner Losh 	 */
976214df80aSWarner Losh 	min = (long long unsigned)ctrlr->cdata.hmmin * NVME_HMB_UNITS;
9776de4e458SAlexander Motin 	if (max == 0 || max < min)
97867abaee9SAlexander Motin 		return;
979214df80aSWarner Losh 	pref = MIN((long long unsigned)ctrlr->cdata.hmpre * NVME_HMB_UNITS, max);
9803740a8dbSWarner Losh 	minc = MAX(ctrlr->cdata.hmminds * NVME_HMB_UNITS, ctrlr->page_size);
98167abaee9SAlexander Motin 	if (min > 0 && ctrlr->cdata.hmmaxd > 0)
98267abaee9SAlexander Motin 		minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
98367abaee9SAlexander Motin 	ctrlr->hmb_chunk = pref;
98467abaee9SAlexander Motin 
98567abaee9SAlexander Motin again:
9863740a8dbSWarner Losh 	/*
9873740a8dbSWarner Losh 	 * However, the chunk sizes, number of chunks, and alignment of chunks
9883740a8dbSWarner Losh 	 * are all based on the current MPS (ctrlr->page_size).
9893740a8dbSWarner Losh 	 */
9903740a8dbSWarner Losh 	ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, ctrlr->page_size);
99167abaee9SAlexander Motin 	ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
99267abaee9SAlexander Motin 	if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
99367abaee9SAlexander Motin 		ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
99467abaee9SAlexander Motin 	ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
99567abaee9SAlexander Motin 	    ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
99667abaee9SAlexander Motin 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
9973740a8dbSWarner Losh 	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
99867abaee9SAlexander Motin 	    ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
99967abaee9SAlexander Motin 	if (err != 0) {
100067abaee9SAlexander Motin 		nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
100167abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
100267abaee9SAlexander Motin 		return;
100367abaee9SAlexander Motin 	}
100467abaee9SAlexander Motin 
100567abaee9SAlexander Motin 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
100667abaee9SAlexander Motin 		hmbc = &ctrlr->hmb_chunks[i];
100767abaee9SAlexander Motin 		if (bus_dmamem_alloc(ctrlr->hmb_tag,
100867abaee9SAlexander Motin 		    (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
100967abaee9SAlexander Motin 		    &hmbc->hmbc_map)) {
101067abaee9SAlexander Motin 			nvme_printf(ctrlr, "failed to alloc HMB\n");
101167abaee9SAlexander Motin 			break;
101267abaee9SAlexander Motin 		}
101367abaee9SAlexander Motin 		if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
101467abaee9SAlexander Motin 		    hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
101567abaee9SAlexander Motin 		    &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
101667abaee9SAlexander Motin 			bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
101767abaee9SAlexander Motin 			    hmbc->hmbc_map);
101867abaee9SAlexander Motin 			nvme_printf(ctrlr, "failed to load HMB\n");
101967abaee9SAlexander Motin 			break;
102067abaee9SAlexander Motin 		}
102167abaee9SAlexander Motin 		bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
102267abaee9SAlexander Motin 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
102367abaee9SAlexander Motin 	}
102467abaee9SAlexander Motin 
102567abaee9SAlexander Motin 	if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
102667abaee9SAlexander Motin 	    ctrlr->hmb_chunk / 2 >= minc) {
102767abaee9SAlexander Motin 		ctrlr->hmb_nchunks = i;
102867abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
102967abaee9SAlexander Motin 		ctrlr->hmb_chunk /= 2;
103067abaee9SAlexander Motin 		goto again;
103167abaee9SAlexander Motin 	}
103267abaee9SAlexander Motin 	ctrlr->hmb_nchunks = i;
103367abaee9SAlexander Motin 	if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
103467abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
103567abaee9SAlexander Motin 		return;
103667abaee9SAlexander Motin 	}
103767abaee9SAlexander Motin 
103867abaee9SAlexander Motin 	size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
103967abaee9SAlexander Motin 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
104067abaee9SAlexander Motin 	    16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
104167abaee9SAlexander Motin 	    size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
104267abaee9SAlexander Motin 	if (err != 0) {
104367abaee9SAlexander Motin 		nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
104467abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
104567abaee9SAlexander Motin 		return;
104667abaee9SAlexander Motin 	}
104767abaee9SAlexander Motin 	if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
104867abaee9SAlexander Motin 	    (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
104967abaee9SAlexander Motin 	    &ctrlr->hmb_desc_map)) {
105067abaee9SAlexander Motin 		nvme_printf(ctrlr, "failed to alloc HMB desc\n");
105167abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
105267abaee9SAlexander Motin 		return;
105367abaee9SAlexander Motin 	}
105467abaee9SAlexander Motin 	if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
105567abaee9SAlexander Motin 	    ctrlr->hmb_desc_vaddr, size, nvme_single_map,
105667abaee9SAlexander Motin 	    &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
105767abaee9SAlexander Motin 		bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
105867abaee9SAlexander Motin 		    ctrlr->hmb_desc_map);
105967abaee9SAlexander Motin 		nvme_printf(ctrlr, "failed to load HMB desc\n");
106067abaee9SAlexander Motin 		nvme_ctrlr_hmb_free(ctrlr);
106167abaee9SAlexander Motin 		return;
106267abaee9SAlexander Motin 	}
106367abaee9SAlexander Motin 
106467abaee9SAlexander Motin 	for (i = 0; i < ctrlr->hmb_nchunks; i++) {
106567abaee9SAlexander Motin 		ctrlr->hmb_desc_vaddr[i].addr =
106667abaee9SAlexander Motin 		    htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
10673740a8dbSWarner Losh 		ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / ctrlr->page_size);
106867abaee9SAlexander Motin 	}
106967abaee9SAlexander Motin 	bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
107067abaee9SAlexander Motin 	    BUS_DMASYNC_PREWRITE);
107167abaee9SAlexander Motin 
107267abaee9SAlexander Motin 	nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
107367abaee9SAlexander Motin 	    (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
107467abaee9SAlexander Motin 	    / 1024 / 1024);
107567abaee9SAlexander Motin }
107667abaee9SAlexander Motin 
107767abaee9SAlexander Motin static void
107867abaee9SAlexander Motin nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
107967abaee9SAlexander Motin {
108067abaee9SAlexander Motin 	struct nvme_completion_poll_status	status;
108167abaee9SAlexander Motin 	uint32_t cdw11;
108267abaee9SAlexander Motin 
108367abaee9SAlexander Motin 	cdw11 = 0;
108467abaee9SAlexander Motin 	if (enable)
108567abaee9SAlexander Motin 		cdw11 |= 1;
108667abaee9SAlexander Motin 	if (memret)
108767abaee9SAlexander Motin 		cdw11 |= 2;
108867abaee9SAlexander Motin 	status.done = 0;
108967abaee9SAlexander Motin 	nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
10903740a8dbSWarner Losh 	    ctrlr->hmb_nchunks * ctrlr->hmb_chunk / ctrlr->page_size,
10913740a8dbSWarner Losh 	    ctrlr->hmb_desc_paddr, ctrlr->hmb_desc_paddr >> 32,
10923740a8dbSWarner Losh 	    ctrlr->hmb_nchunks, NULL, 0,
109367abaee9SAlexander Motin 	    nvme_completion_poll_cb, &status);
109467abaee9SAlexander Motin 	nvme_completion_poll(&status);
109567abaee9SAlexander Motin 	if (nvme_completion_is_error(&status.cpl))
109667abaee9SAlexander Motin 		nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
109767abaee9SAlexander Motin }
109867abaee9SAlexander Motin 
109967abaee9SAlexander Motin static void
11004d547561SWarner Losh nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
1101bb0ec6b3SJim Harris {
1102bb0ec6b3SJim Harris 	struct nvme_controller *ctrlr = ctrlr_arg;
11032b647da7SJim Harris 	uint32_t old_num_io_queues;
1104b846efd7SJim Harris 	int i;
1105b846efd7SJim Harris 
1106bad42df9SColin Percival 	TSENTER();
1107bad42df9SColin Percival 
11082b647da7SJim Harris 	/*
11092b647da7SJim Harris 	 * Only reset adminq here when we are restarting the
11102b647da7SJim Harris 	 *  controller after a reset.  During initialization,
11112b647da7SJim Harris 	 *  we have already submitted admin commands to get
11122b647da7SJim Harris 	 *  the number of I/O queues supported, so cannot reset
11132b647da7SJim Harris 	 *  the adminq again here.
11142b647da7SJim Harris 	 */
1115ac90f70dSAlexander Motin 	if (resetting) {
1116cb5b7c13SJim Harris 		nvme_qpair_reset(&ctrlr->adminq);
1117ac90f70dSAlexander Motin 		nvme_admin_qpair_enable(&ctrlr->adminq);
1118ac90f70dSAlexander Motin 	}
11192b647da7SJim Harris 
1120701267adSAlexander Motin 	if (ctrlr->ioq != NULL) {
1121cb5b7c13SJim Harris 		for (i = 0; i < ctrlr->num_io_queues; i++)
1122cb5b7c13SJim Harris 			nvme_qpair_reset(&ctrlr->ioq[i]);
1123701267adSAlexander Motin 	}
1124cb5b7c13SJim Harris 
1125701267adSAlexander Motin 	/*
1126701267adSAlexander Motin 	 * If it was a reset on initialization command timeout, just
1127701267adSAlexander Motin 	 * return here, letting initialization code fail gracefully.
1128701267adSAlexander Motin 	 */
1129701267adSAlexander Motin 	if (resetting && !ctrlr->is_initialized)
1130701267adSAlexander Motin 		return;
1131701267adSAlexander Motin 
1132ac90f70dSAlexander Motin 	if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
1133232e2edbSJim Harris 		nvme_ctrlr_fail(ctrlr);
1134be34f216SJim Harris 		return;
1135232e2edbSJim Harris 	}
1136bb0ec6b3SJim Harris 
11372b647da7SJim Harris 	/*
11382b647da7SJim Harris 	 * The number of qpairs are determined during controller initialization,
11392b647da7SJim Harris 	 *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
11402b647da7SJim Harris 	 *  HW limit.  We call SET_FEATURES again here so that it gets called
11412b647da7SJim Harris 	 *  after any reset for controllers that depend on the driver to
11422b647da7SJim Harris 	 *  explicit specify how many queues it will use.  This value should
11432b647da7SJim Harris 	 *  never change between resets, so panic if somehow that does happen.
11442b647da7SJim Harris 	 */
11454d547561SWarner Losh 	if (resetting) {
11462b647da7SJim Harris 		old_num_io_queues = ctrlr->num_io_queues;
1147232e2edbSJim Harris 		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
1148232e2edbSJim Harris 			nvme_ctrlr_fail(ctrlr);
1149be34f216SJim Harris 			return;
1150232e2edbSJim Harris 		}
1151bb0ec6b3SJim Harris 
11522b647da7SJim Harris 		if (old_num_io_queues != ctrlr->num_io_queues) {
11537b036d77SJim Harris 			panic("num_io_queues changed from %u to %u",
11547b036d77SJim Harris 			      old_num_io_queues, ctrlr->num_io_queues);
11557b036d77SJim Harris 		}
11562b647da7SJim Harris 	}
11572b647da7SJim Harris 
115867abaee9SAlexander Motin 	if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
115967abaee9SAlexander Motin 		nvme_ctrlr_hmb_alloc(ctrlr);
116067abaee9SAlexander Motin 		if (ctrlr->hmb_nchunks > 0)
116167abaee9SAlexander Motin 			nvme_ctrlr_hmb_enable(ctrlr, true, false);
116267abaee9SAlexander Motin 	} else if (ctrlr->hmb_nchunks > 0)
116367abaee9SAlexander Motin 		nvme_ctrlr_hmb_enable(ctrlr, true, true);
116467abaee9SAlexander Motin 
1165232e2edbSJim Harris 	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
1166232e2edbSJim Harris 		nvme_ctrlr_fail(ctrlr);
1167be34f216SJim Harris 		return;
1168232e2edbSJim Harris 	}
1169bb0ec6b3SJim Harris 
1170232e2edbSJim Harris 	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
1171232e2edbSJim Harris 		nvme_ctrlr_fail(ctrlr);
1172be34f216SJim Harris 		return;
1173232e2edbSJim Harris 	}
1174bb0ec6b3SJim Harris 
1175bb0ec6b3SJim Harris 	nvme_ctrlr_configure_aer(ctrlr);
1176bb0ec6b3SJim Harris 	nvme_ctrlr_configure_int_coalescing(ctrlr);
1177bb0ec6b3SJim Harris 
1178b846efd7SJim Harris 	for (i = 0; i < ctrlr->num_io_queues; i++)
1179b846efd7SJim Harris 		nvme_io_qpair_enable(&ctrlr->ioq[i]);
1180bad42df9SColin Percival 	TSEXIT();
1181bb0ec6b3SJim Harris }
1182bb0ec6b3SJim Harris 
1183be34f216SJim Harris void
1184be34f216SJim Harris nvme_ctrlr_start_config_hook(void *arg)
1185be34f216SJim Harris {
1186be34f216SJim Harris 	struct nvme_controller *ctrlr = arg;
118766e59850SWarner Losh 
1188bad42df9SColin Percival 	TSENTER();
1189bad42df9SColin Percival 
1190*8052b01eSWarner Losh 	/*
1191*8052b01eSWarner Losh 	 * Don't call pre/post reset here. We've not yet created the qpairs,
1192*8052b01eSWarner Losh 	 * haven't setup the ISRs, so there's no need to 'drain' them or
1193*8052b01eSWarner Losh 	 * 'exclude' them.
1194*8052b01eSWarner Losh 	 */
1195701267adSAlexander Motin 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
1196701267adSAlexander Motin fail:
119766e59850SWarner Losh 		nvme_ctrlr_fail(ctrlr);
119892390644SAlexander Motin 		config_intrhook_disestablish(&ctrlr->config_hook);
119966e59850SWarner Losh 		return;
120066e59850SWarner Losh 	}
120166e59850SWarner Losh 
12024b3da659SWarner Losh #ifdef NVME_2X_RESET
12034b3da659SWarner Losh 	/*
12044b3da659SWarner Losh 	 * Reset controller twice to ensure we do a transition from cc.en==1 to
12054b3da659SWarner Losh 	 * cc.en==0.  This is because we don't really know what status the
12064b3da659SWarner Losh 	 * controller was left in when boot handed off to OS.  Linux doesn't do
12074b3da659SWarner Losh 	 * this, however, and when the controller is in state cc.en == 0, no
12084b3da659SWarner Losh 	 * I/O can happen.
12094b3da659SWarner Losh 	 */
1210701267adSAlexander Motin 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
1211701267adSAlexander Motin 		goto fail;
12124b3da659SWarner Losh #endif
1213be34f216SJim Harris 
12142b647da7SJim Harris 	nvme_qpair_reset(&ctrlr->adminq);
12152b647da7SJim Harris 	nvme_admin_qpair_enable(&ctrlr->adminq);
12162b647da7SJim Harris 
1217ac90f70dSAlexander Motin 	if (nvme_ctrlr_identify(ctrlr) == 0 &&
1218ac90f70dSAlexander Motin 	    nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
12192b647da7SJim Harris 	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
12204d547561SWarner Losh 		nvme_ctrlr_start(ctrlr, false);
12212b647da7SJim Harris 	else
1222701267adSAlexander Motin 		goto fail;
12232b647da7SJim Harris 
12242b647da7SJim Harris 	nvme_sysctl_initialize_ctrlr(ctrlr);
1225be34f216SJim Harris 	config_intrhook_disestablish(&ctrlr->config_hook);
1226496a2752SJim Harris 
1227496a2752SJim Harris 	ctrlr->is_initialized = 1;
1228496a2752SJim Harris 	nvme_notify_new_controller(ctrlr);
1229bad42df9SColin Percival 	TSEXIT();
1230b846efd7SJim Harris }
1231b846efd7SJim Harris 
/*
 * Taskqueue handler that performs a full controller reset.  Brackets the
 * hardware reset with nvme_pre_reset()/nvme_post_reset() (which, per the
 * comment in nvme_ctrlr_start_config_hook, drain/exclude the qpairs and
 * ISRs around the reset), then restarts the controller or fails it.
 */
static void
nvme_ctrlr_reset_task(void *arg, int pending)
{
	struct nvme_controller	*ctrlr = arg;
	int			status;

	nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
	nvme_pre_reset(ctrlr);
	status = nvme_ctrlr_hw_reset(ctrlr);
	nvme_post_reset(ctrlr);
	if (status == 0)
		nvme_ctrlr_start(ctrlr, true);
	else
		nvme_ctrlr_fail(ctrlr);

	/* Reset done: allow the next nvme_ctrlr_reset() to schedule us. */
	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
}
124912d191ecSJim Harris 
1250bb1c7be4SWarner Losh /*
1251bb1c7be4SWarner Losh  * Poll all the queues enabled on the device for completion.
1252bb1c7be4SWarner Losh  */
1253bb1c7be4SWarner Losh void
1254bb1c7be4SWarner Losh nvme_ctrlr_poll(struct nvme_controller *ctrlr)
1255bb1c7be4SWarner Losh {
1256bb1c7be4SWarner Losh 	int i;
1257bb1c7be4SWarner Losh 
1258bb1c7be4SWarner Losh 	nvme_qpair_process_completions(&ctrlr->adminq);
1259bb1c7be4SWarner Losh 
1260bb1c7be4SWarner Losh 	for (i = 0; i < ctrlr->num_io_queues; i++)
1261bb1c7be4SWarner Losh 		if (ctrlr->ioq && ctrlr->ioq[i].cpl)
1262bb1c7be4SWarner Losh 			nvme_qpair_process_completions(&ctrlr->ioq[i]);
1263bb1c7be4SWarner Losh }
1264bb1c7be4SWarner Losh 
/*
 * Poll the single-vector interrupt case: num_io_queues will be 1 and
 * there's only a single vector. While we're polling, we mask further
 * interrupts in the controller.
 */
void
nvme_ctrlr_shared_handler(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	/* Mask (intms), poll all queues, then unmask (intmc). */
	nvme_mmio_write_4(ctrlr, intms, 1);
	nvme_ctrlr_poll(ctrlr);
	nvme_mmio_write_4(ctrlr, intmc, 1);
}
1279bb0ec6b3SJim Harris 
12807c3f19d7SJim Harris static void
12817c3f19d7SJim Harris nvme_pt_done(void *arg, const struct nvme_completion *cpl)
12827c3f19d7SJim Harris {
12837c3f19d7SJim Harris 	struct nvme_pt_command *pt = arg;
1284c252f637SAlexander Motin 	struct mtx *mtx = pt->driver_lock;
12850d787e9bSWojciech Macek 	uint16_t status;
12867c3f19d7SJim Harris 
12877c3f19d7SJim Harris 	bzero(&pt->cpl, sizeof(pt->cpl));
12887c3f19d7SJim Harris 	pt->cpl.cdw0 = cpl->cdw0;
12890d787e9bSWojciech Macek 
12900d787e9bSWojciech Macek 	status = cpl->status;
12910d787e9bSWojciech Macek 	status &= ~NVME_STATUS_P_MASK;
12920d787e9bSWojciech Macek 	pt->cpl.status = status;
12937c3f19d7SJim Harris 
1294c252f637SAlexander Motin 	mtx_lock(mtx);
1295c252f637SAlexander Motin 	pt->driver_lock = NULL;
12967c3f19d7SJim Harris 	wakeup(pt);
1297c252f637SAlexander Motin 	mtx_unlock(mtx);
12987c3f19d7SJim Harris }
12997c3f19d7SJim Harris 
/*
 * Execute an NVMe pass-through command and sleep until it completes.
 *
 * ctrlr          - controller to submit to
 * pt             - pass-through command; pt->cpl is filled by nvme_pt_done()
 * nsid           - namespace id (host order; converted to LE here)
 * is_user_buffer - non-zero if pt->buf is a user-space address that must be
 *                  wired/mapped for the duration of the DMA
 * is_admin_cmd   - non-zero to submit on the admin queue, else an I/O queue
 *
 * Returns 0 once the command completes (NVMe status is in pt->cpl), EIO if
 * the transfer exceeds max_xfer_size, or EFAULT if the user buffer could
 * not be mapped.
 */
int
nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
    struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
    int is_admin_cmd)
{
	struct nvme_request	*req;
	struct mtx		*mtx;
	struct buf		*buf = NULL;
	int			ret = 0;

	if (pt->len > 0) {
		if (pt->len > ctrlr->max_xfer_size) {
			nvme_printf(ctrlr, "pt->len (%d) "
			    "exceeds max_xfer_size (%d)\n", pt->len,
			    ctrlr->max_xfer_size);
			return EIO;
		}
		if (is_user_buffer) {
			/*
			 * Ensure the user buffer is wired for the duration of
			 *  this pass-through command.
			 */
			PHOLD(curproc);
			buf = uma_zalloc(pbuf_zone, M_WAITOK);
			buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
			if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
				ret = EFAULT;
				goto err;
			}
			req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
			    nvme_pt_done, pt);
		} else
			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
			    nvme_pt_done, pt);
	} else
		req = nvme_allocate_request_null(nvme_pt_done, pt);

	/* Assume user space already converted to little-endian */
	req->cmd.opc = pt->cmd.opc;
	req->cmd.fuse = pt->cmd.fuse;
	req->cmd.rsvd2 = pt->cmd.rsvd2;
	req->cmd.rsvd3 = pt->cmd.rsvd3;
	req->cmd.cdw10 = pt->cmd.cdw10;
	req->cmd.cdw11 = pt->cmd.cdw11;
	req->cmd.cdw12 = pt->cmd.cdw12;
	req->cmd.cdw13 = pt->cmd.cdw13;
	req->cmd.cdw14 = pt->cmd.cdw14;
	req->cmd.cdw15 = pt->cmd.cdw15;

	req->cmd.nsid = htole32(nsid);

	/*
	 * pt->driver_lock doubles as the completion flag: nvme_pt_done()
	 * clears it under this pool mutex and wakes us up.
	 */
	mtx = mtx_pool_find(mtxpool_sleep, pt);
	pt->driver_lock = mtx;

	if (is_admin_cmd)
		nvme_ctrlr_submit_admin_request(ctrlr, req);
	else
		nvme_ctrlr_submit_io_request(ctrlr, req);

	mtx_lock(mtx);
	while (pt->driver_lock != NULL)
		mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
	mtx_unlock(mtx);

err:
	if (buf != NULL) {
		uma_zfree(pbuf_zone, buf);
		PRELE(curproc);
	}

	return (ret);
}
13727c3f19d7SJim Harris 
1373bb0ec6b3SJim Harris static int
1374bb0ec6b3SJim Harris nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
1375bb0ec6b3SJim Harris     struct thread *td)
1376bb0ec6b3SJim Harris {
1377bb0ec6b3SJim Harris 	struct nvme_controller			*ctrlr;
13787c3f19d7SJim Harris 	struct nvme_pt_command			*pt;
1379bb0ec6b3SJim Harris 
1380bb0ec6b3SJim Harris 	ctrlr = cdev->si_drv1;
1381bb0ec6b3SJim Harris 
1382bb0ec6b3SJim Harris 	switch (cmd) {
1383b846efd7SJim Harris 	case NVME_RESET_CONTROLLER:
1384b846efd7SJim Harris 		nvme_ctrlr_reset(ctrlr);
1385b846efd7SJim Harris 		break;
13867c3f19d7SJim Harris 	case NVME_PASSTHROUGH_CMD:
13877c3f19d7SJim Harris 		pt = (struct nvme_pt_command *)arg;
13880d787e9bSWojciech Macek 		return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
13897c3f19d7SJim Harris 		    1 /* is_user_buffer */, 1 /* is_admin_cmd */));
1390a7bf63beSAlexander Motin 	case NVME_GET_NSID:
1391a7bf63beSAlexander Motin 	{
1392a7bf63beSAlexander Motin 		struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
1393a7bf63beSAlexander Motin 		strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
1394a7bf63beSAlexander Motin 		    sizeof(gnsid->cdev));
13954053f8acSDavid Bright 		gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
1396a7bf63beSAlexander Motin 		gnsid->nsid = 0;
1397a7bf63beSAlexander Motin 		break;
1398a7bf63beSAlexander Motin 	}
1399e32d47f3SDavid Bright 	case NVME_GET_MAX_XFER_SIZE:
1400e32d47f3SDavid Bright 		*(uint64_t *)arg = ctrlr->max_xfer_size;
1401e32d47f3SDavid Bright 		break;
1402bb0ec6b3SJim Harris 	default:
1403bb0ec6b3SJim Harris 		return (ENOTTY);
1404bb0ec6b3SJim Harris 	}
1405bb0ec6b3SJim Harris 
1406bb0ec6b3SJim Harris 	return (0);
1407bb0ec6b3SJim Harris }
1408bb0ec6b3SJim Harris 
/* Character-device switch for the per-controller /dev/nvmeX node. */
static struct cdevsw nvme_ctrlr_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	0,
	.d_ioctl =	nvme_ctrlr_ioctl
};
1414bb0ec6b3SJim Harris 
/*
 * One-time software construction of a controller instance: decode the
 * capability registers, derive page/transfer sizes and timeouts, create the
 * admin queue pair and the reset/fail taskqueue, and create the /dev/nvmeX
 * character device.  Returns 0 on success or ENXIO on failure.
 */
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
	struct make_dev_args	md_args;
	uint32_t	cap_lo;
	uint32_t	cap_hi;
	uint32_t	to, vs, pmrcap;
	int		status, timeout_period;

	ctrlr->dev = dev;

	mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
	if (bus_get_domain(dev, &ctrlr->domain) != 0)
		ctrlr->domain = 0;

	/* Cache CAP and, when bootverbose, decode its fields for the log. */
	ctrlr->cap_lo = cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
	if (bootverbose) {
		device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n",
		    cap_lo, NVME_CAP_LO_MQES(cap_lo),
		    NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "",
		    NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "",
		    (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "",
		    (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "",
		    NVME_CAP_LO_TO(cap_lo));
	}
	ctrlr->cap_hi = cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
	if (bootverbose) {
		device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, "
		    "MPSMIN %u, MPSMAX %u%s%s\n", cap_hi,
		    NVME_CAP_HI_DSTRD(cap_hi),
		    NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "",
		    NVME_CAP_HI_CSS(cap_hi),
		    NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "",
		    NVME_CAP_HI_MPSMIN(cap_hi),
		    NVME_CAP_HI_MPSMAX(cap_hi),
		    NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "",
		    NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "");
	}
	if (bootverbose) {
		vs = nvme_mmio_read_4(ctrlr, vs);
		device_printf(dev, "Version: 0x%08x: %d.%d\n", vs,
		    NVME_MAJOR(vs), NVME_MINOR(vs));
	}
	if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) {
		pmrcap = nvme_mmio_read_4(ctrlr, pmrcap);
		device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, "
		    "PMRWBM %x, PMRTO %u%s\n", pmrcap,
		    NVME_PMRCAP_BIR(pmrcap),
		    NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "",
		    NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "",
		    NVME_PMRCAP_PMRTU(pmrcap),
		    NVME_PMRCAP_PMRWBM(pmrcap),
		    NVME_PMRCAP_PMRTO(pmrcap),
		    NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : "");
	}

	/* Doorbell stride exponent: 2^(2 + DSTRD) bytes between doorbells. */
	ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;

	ctrlr->mps = NVME_CAP_HI_MPSMIN(cap_hi);
	ctrlr->page_size = 1 << (NVME_MPS_SHIFT + ctrlr->mps);

	/* Get ready timeout value from controller, in units of 500ms. */
	to = NVME_CAP_LO_TO(cap_lo) + 1;
	ctrlr->ready_timeout_in_ms = to * 500;

	/* Clamp the tunable command timeout to the supported range. */
	timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
	TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
	timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
	timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
	ctrlr->timeout_period = timeout_period;

	nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
	TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);

	ctrlr->enable_aborts = 0;
	TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);

	/* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */
	ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size));
	if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
		return (ENXIO);

	/*
	 * Create 2 threads for the taskqueue. The reset thread will block when
	 * it detects that the controller has failed until all I/O has been
	 * failed up the stack. The fail_req task needs to be able to run in
	 * this case to finish the request failure for some cases.
	 *
	 * We could partially solve this race by draining the failed request
	 * queue before proceeding to free the sim, though nothing would stop
	 * new I/O from coming in after we do that drain, but before we reach
	 * cam_sim_free, so this big hammer is used instead.
	 */
	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
	taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");

	ctrlr->is_resetting = 0;
	ctrlr->is_initialized = 0;
	ctrlr->notification_sent = 0;
	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
	STAILQ_INIT(&ctrlr->fail_req);
	ctrlr->is_failed = false;

	make_dev_args_init(&md_args);
	md_args.mda_devsw = &nvme_ctrlr_cdevsw;
	md_args.mda_uid = UID_ROOT;
	md_args.mda_gid = GID_WHEEL;
	md_args.mda_mode = 0600;
	md_args.mda_unit = device_get_unit(dev);
	md_args.mda_si_drv1 = (void *)ctrlr;
	status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
	    device_get_unit(dev));
	if (status != 0)
		return (ENXIO);

	return (0);
}
1534d281e8fbSJim Harris 
/*
 * Tear down a controller; the inverse of nvme_ctrlr_construct().  Handles
 * partially-constructed controllers (resource/adminq may not exist, hence
 * the nores/noadminq labels) and hot-unplugged devices (all register and
 * command traffic is skipped when the device is gone).
 */
void
nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
{
	int	gone, i;

	ctrlr->is_dying = true;

	if (ctrlr->resource == NULL)
		goto nores;
	if (!mtx_initialized(&ctrlr->adminq.lock))
		goto noadminq;

	/*
	 * Check whether it is a hot unplug or a clean driver detach.
	 * If device is not there any more, skip any shutdown commands.
	 */
	gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
	if (gone)
		nvme_ctrlr_fail(ctrlr);
	else
		nvme_notify_fail_consumers(ctrlr);

	for (i = 0; i < NVME_MAX_NAMESPACES; i++)
		nvme_ns_destruct(&ctrlr->ns[i]);

	if (ctrlr->cdev)
		destroy_dev(ctrlr->cdev);

	if (ctrlr->is_initialized) {
		if (!gone) {
			/* Disable the host memory buffer before freeing it. */
			if (ctrlr->hmb_nchunks > 0)
				nvme_ctrlr_hmb_enable(ctrlr, false, false);
			nvme_ctrlr_delete_qpairs(ctrlr);
		}
		nvme_ctrlr_hmb_free(ctrlr);
	}
	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			nvme_io_qpair_destroy(&ctrlr->ioq[i]);
		free(ctrlr->ioq, M_NVME);
	}
	nvme_admin_qpair_destroy(&ctrlr->adminq);

	/*
	 *  Notify the controller of a shutdown, even though this is due to
	 *   a driver unload, not a system shutdown (this path is not invoked
	 *   during shutdown).  This ensures the controller receives a
	 *   shutdown notification in case the system is shutdown before
	 *   reloading the driver.
	 */
	if (!gone)
		nvme_ctrlr_shutdown(ctrlr);

	if (!gone)
		nvme_ctrlr_disable(ctrlr);

noadminq:
	if (ctrlr->taskqueue)
		taskqueue_free(ctrlr->taskqueue);

	if (ctrlr->tag)
		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);

	if (ctrlr->res)
		bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(ctrlr->res), ctrlr->res);

	if (ctrlr->bar4_resource != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY,
		    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
	}

	bus_release_resource(dev, SYS_RES_MEMORY,
	    ctrlr->resource_id, ctrlr->resource);

nores:
	mtx_destroy(&ctrlr->lock);
}
1613990e741cSJim Harris 
/*
 * Request a normal NVMe shutdown (CC.SHN) and wait for the controller to
 * report shutdown complete in CSTS.SHST.  The wait is bounded by the
 * controller's advertised RTD3E (microseconds), or 5 seconds when RTD3E is
 * zero; a CSTS read of NVME_GONE (hot unplug) ends the wait immediately.
 */
void
nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
{
	uint32_t	cc;
	uint32_t	csts;
	int		timeout;

	/* Set SHN to "normal shutdown" without disturbing other CC bits. */
	cc = nvme_mmio_read_4(ctrlr, cc);
	cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
	cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
	nvme_mmio_write_4(ctrlr, cc, cc);

	/* RTD3E is in microseconds; convert to ticks, rounding up. */
	timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz :
	    ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000);
	while (1) {
		csts = nvme_mmio_read_4(ctrlr, csts);
		if (csts == NVME_GONE)		/* Hot unplug. */
			break;
		if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
			break;
		if (timeout - ticks < 0) {
			nvme_printf(ctrlr, "shutdown timeout\n");
			break;
		}
		pause("nvmeshut", 1);
	}
}
164156183abcSJim Harris 
164256183abcSJim Harris void
1643d281e8fbSJim Harris nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
1644d281e8fbSJim Harris     struct nvme_request *req)
1645d281e8fbSJim Harris {
1646d281e8fbSJim Harris 
16475ae9ed68SJim Harris 	nvme_qpair_submit_request(&ctrlr->adminq, req);
1648d281e8fbSJim Harris }
1649d281e8fbSJim Harris 
1650d281e8fbSJim Harris void
1651d281e8fbSJim Harris nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
1652d281e8fbSJim Harris     struct nvme_request *req)
1653d281e8fbSJim Harris {
1654d281e8fbSJim Harris 	struct nvme_qpair       *qpair;
1655d281e8fbSJim Harris 
16561eab19cbSAlexander Motin 	qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
16575ae9ed68SJim Harris 	nvme_qpair_submit_request(qpair, req);
1658d281e8fbSJim Harris }
1659038a5ee4SJim Harris 
1660038a5ee4SJim Harris device_t
1661038a5ee4SJim Harris nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
1662038a5ee4SJim Harris {
1663038a5ee4SJim Harris 
1664038a5ee4SJim Harris 	return (ctrlr->dev);
1665038a5ee4SJim Harris }
1666dbba7442SJim Harris 
1667dbba7442SJim Harris const struct nvme_controller_data *
1668dbba7442SJim Harris nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
1669dbba7442SJim Harris {
1670dbba7442SJim Harris 
1671dbba7442SJim Harris 	return (&ctrlr->cdata);
1672dbba7442SJim Harris }
16734d547561SWarner Losh 
16744d547561SWarner Losh int
16754d547561SWarner Losh nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
16764d547561SWarner Losh {
16774d547561SWarner Losh 	int to = hz;
16784d547561SWarner Losh 
16794d547561SWarner Losh 	/*
16804d547561SWarner Losh 	 * Can't touch failed controllers, so it's already suspended.
16814d547561SWarner Losh 	 */
16824d547561SWarner Losh 	if (ctrlr->is_failed)
16834d547561SWarner Losh 		return (0);
16844d547561SWarner Losh 
16854d547561SWarner Losh 	/*
16864d547561SWarner Losh 	 * We don't want the reset taskqueue running, since it does similar
16874d547561SWarner Losh 	 * things, so prevent it from running after we start. Wait for any reset
16884d547561SWarner Losh 	 * that may have been started to complete. The reset process we follow
16894d547561SWarner Losh 	 * will ensure that any new I/O will queue and be given to the hardware
16904d547561SWarner Losh 	 * after we resume (though there should be none).
16914d547561SWarner Losh 	 */
16924d547561SWarner Losh 	while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
16934d547561SWarner Losh 		pause("nvmesusp", 1);
16944d547561SWarner Losh 	if (to <= 0) {
16954d547561SWarner Losh 		nvme_printf(ctrlr,
16964d547561SWarner Losh 		    "Competing reset task didn't finish. Try again later.\n");
16974d547561SWarner Losh 		return (EWOULDBLOCK);
16984d547561SWarner Losh 	}
16994d547561SWarner Losh 
170067abaee9SAlexander Motin 	if (ctrlr->hmb_nchunks > 0)
170167abaee9SAlexander Motin 		nvme_ctrlr_hmb_enable(ctrlr, false, false);
170267abaee9SAlexander Motin 
17034d547561SWarner Losh 	/*
17044d547561SWarner Losh 	 * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
17054d547561SWarner Losh 	 * delete the hardware I/O queues, and then shutdown. This properly
17064d547561SWarner Losh 	 * flushes any metadata the drive may have stored so it can survive
17074d547561SWarner Losh 	 * having its power removed and prevents the unsafe shutdown count from
17084d547561SWarner Losh 	 * incriminating. Once we delete the qpairs, we have to disable them
1709e5e26e4aSWarner Losh 	 * before shutting down.
17104d547561SWarner Losh 	 */
17114d547561SWarner Losh 	nvme_ctrlr_delete_qpairs(ctrlr);
17124d547561SWarner Losh 	nvme_ctrlr_disable_qpairs(ctrlr);
17134d547561SWarner Losh 	nvme_ctrlr_shutdown(ctrlr);
17144d547561SWarner Losh 
17154d547561SWarner Losh 	return (0);
17164d547561SWarner Losh }
17174d547561SWarner Losh 
/*
 * Resume the controller after a system suspend: reset the hardware inside
 * a nvme_pre_reset()/nvme_post_reset() bracket, then restart the
 * controller.  Always returns 0 (see comment on the fail path); a
 * controller that cannot be reset is marked failed instead.
 */
int
nvme_ctrlr_resume(struct nvme_controller *ctrlr)
{

	/*
	 * Can't touch failed controllers, so nothing to do to resume.
	 */
	if (ctrlr->is_failed)
		return (0);

	nvme_pre_reset(ctrlr);
	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
		goto fail;
#ifdef NVME_2X_RESET
	/*
	 * Prior to FreeBSD 13.1, FreeBSD's nvme driver reset the hardware twice
	 * to get it into a known good state. However, the hardware's state is
	 * good and we don't need to do this for proper functioning.
	 */
	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
		goto fail;
#endif
	nvme_post_reset(ctrlr);

	/*
	 * Now that we've reset the hardware, we can restart the controller. Any
	 * I/O that was pending is requeued. Any admin commands are aborted with
	 * an error. Once we've restarted, take the controller out of reset.
	 */
	nvme_ctrlr_start(ctrlr, true);
	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);

	return (0);
fail:
	/*
	 * Since we can't bring the controller out of reset, announce and fail
	 * the controller. However, we have to return success for the resume
	 * itself, due to questionable APIs.
	 */
	nvme_post_reset(ctrlr);	/* balances the nvme_pre_reset() above */
	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
	nvme_ctrlr_fail(ctrlr);
	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
	return (0);
}
1763