xref: /freebsd/sys/dev/nvme/nvme_ctrlr.c (revision 145992504973bd16cf3518af9ba5ce185fefa82a)
/*-
 * Copyright (C) 2012 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/ioccom.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include "nvme_private.h"

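/*
 * Completion callback shared by the synchronous admin commands below.  The
 * submitting thread sleeps on its stack-allocated completion structure, and
 * this callback copies the completion status into it and wakes the sleeper
 * via the shared mutex pool.
 */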
static void
nvme_ctrlr_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_completion	*cpl = arg;
	struct mtx		*mtx;

	/*
	 * Copy status into the argument passed by the caller, so that
	 *  the caller can check the status to determine if the
	 *  request passed or failed.
	 */
	memcpy(cpl, status, sizeof(*cpl));
	mtx = mtx_pool_find(mtxpool_sleep, cpl);
	mtx_lock(mtx);
	wakeup(cpl);
	mtx_unlock(mtx);
}

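/*
 * Map the memory BAR containing the NVMe register set and save the bus
 * tag/handle used for subsequent register access.
 */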
static int
nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
{

	/* Chatham puts the NVMe MMRs behind BAR 2/3, not BAR 0/1. */
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		ctrlr->resource_id = PCIR_BAR(2);
	else
		ctrlr->resource_id = PCIR_BAR(0);

	ctrlr->resource = bus_alloc_resource(ctrlr->dev, SYS_RES_MEMORY,
	    &ctrlr->resource_id, 0, ~0, 1, RF_ACTIVE);

	if (ctrlr->resource == NULL) {
		device_printf(ctrlr->dev, "unable to allocate pci resource\n");
		return (ENOMEM);
	}

	ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
	ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
	ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;

	return (0);
}

#ifdef CHATHAM2
static int
nvme_ctrlr_allocate_chatham_bar(struct nvme_controller *ctrlr)
{

	ctrlr->chatham_resource_id = PCIR_BAR(CHATHAM_CONTROL_BAR);
	ctrlr->chatham_resource = bus_alloc_resource(ctrlr->dev,
	    SYS_RES_MEMORY, &ctrlr->chatham_resource_id, 0, ~0, 1,
	    RF_ACTIVE);

	if (ctrlr->chatham_resource == NULL) {
		device_printf(ctrlr->dev, "unable to alloc pci resource\n");
		return (ENOMEM);
	}

	ctrlr->chatham_bus_tag = rman_get_bustag(ctrlr->chatham_resource);
	ctrlr->chatham_bus_handle =
	    rman_get_bushandle(ctrlr->chatham_resource);

	return (0);
}

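/*
 * Chatham is an Intel NVMe prototype board.  Before bringing the controller
 * up, report its version, compute its capacity, and program what appear to
 * be capacity and (flash or DDR) timing registers in its control BAR.
 */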
static void
nvme_ctrlr_setup_chatham(struct nvme_controller *ctrlr)
{
	uint64_t reg1, reg2, reg3;
	uint64_t temp1, temp2;
	uint32_t temp3;
	uint32_t use_flash_timings = 0;

	DELAY(10000);

	temp3 = chatham_read_4(ctrlr, 0x8080);

	device_printf(ctrlr->dev, "Chatham version: 0x%x\n", temp3);

	ctrlr->chatham_lbas = chatham_read_4(ctrlr, 0x8068) - 0x110;
	ctrlr->chatham_size = ctrlr->chatham_lbas * 512;

	device_printf(ctrlr->dev, "Chatham size: %lld\n",
	    (long long)ctrlr->chatham_size);

	reg1 = reg2 = reg3 = ctrlr->chatham_size - 1;

	TUNABLE_INT_FETCH("hw.nvme.use_flash_timings", &use_flash_timings);
	if (use_flash_timings) {
		device_printf(ctrlr->dev, "Chatham: using flash timings\n");
		temp1 = 0x00001b58000007d0LL;
		temp2 = 0x000000cb00000131LL;
	} else {
		device_printf(ctrlr->dev, "Chatham: using DDR timings\n");
		temp1 = temp2 = 0x0LL;
	}

	chatham_write_8(ctrlr, 0x8000, reg1);
	chatham_write_8(ctrlr, 0x8008, reg2);
	chatham_write_8(ctrlr, 0x8010, reg3);

	chatham_write_8(ctrlr, 0x8020, temp1);
	temp3 = chatham_read_4(ctrlr, 0x8020);

	chatham_write_8(ctrlr, 0x8028, temp2);
	temp3 = chatham_read_4(ctrlr, 0x8028);

	chatham_write_8(ctrlr, 0x8030, temp1);
	chatham_write_8(ctrlr, 0x8038, temp2);
	chatham_write_8(ctrlr, 0x8040, temp1);
	chatham_write_8(ctrlr, 0x8048, temp2);
	chatham_write_8(ctrlr, 0x8050, temp1);
	chatham_write_8(ctrlr, 0x8058, temp2);

	DELAY(10000);
}

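/*
 * Chatham returns garbage identify data, so overwrite the interesting
 * fields of the cached controller data with sane, hand-built values.
 */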
static void
nvme_chatham_populate_cdata(struct nvme_controller *ctrlr)
{
	struct nvme_controller_data *cdata;

	cdata = &ctrlr->cdata;

	cdata->vid = 0x8086;
	cdata->ssvid = 0x2011;

	/*
	 * Chatham2 puts garbage data in these fields when we
	 *  invoke IDENTIFY_CONTROLLER, so we need to re-zero
	 *  the fields before copying in the replacement strings.
	 */
	memset(cdata->sn, 0, sizeof(cdata->sn));
	memcpy(cdata->sn, "2012", strlen("2012"));
	memset(cdata->mn, 0, sizeof(cdata->mn));
	memcpy(cdata->mn, "CHATHAM2", strlen("CHATHAM2"));
	memset(cdata->fr, 0, sizeof(cdata->fr));
	memcpy(cdata->fr, "0", strlen("0"));
	cdata->rab = 8;
	cdata->aerl = 3;
	cdata->lpa.ns_smart = 1;
	/* SQ entries are 64 bytes (2^6), CQ entries are 16 bytes (2^4). */
	cdata->sqes.min = 6;
	cdata->sqes.max = 6;
	cdata->cqes.min = 4;
	cdata->cqes.max = 4;
	cdata->nn = 1;

	/* Chatham2 doesn't support the DSM command. */
	cdata->oncs.dsm = 0;

	cdata->vwc.present = 1;
}
#endif /* CHATHAM2 */

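/* Construct the admin queue pair (always qpair ID 0, interrupt vector 0). */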
static void
nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	uint32_t		num_entries;

	qpair = &ctrlr->adminq;

	num_entries = NVME_ADMIN_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
	/*
	 * If admin_entries was overridden to an invalid value, revert it
	 *  to our default value.
	 */
	if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
	    num_entries > NVME_MAX_ADMIN_ENTRIES) {
		printf("nvme: invalid hw.nvme.admin_entries=%u specified\n",
		    num_entries);
		num_entries = NVME_ADMIN_ENTRIES;
	}

	/*
	 * The admin queue's max xfer size is treated differently from the
	 *  max I/O xfer size.  16KB is sufficient here - maybe even less?
	 */
	nvme_qpair_construct(qpair,
			     0, /* qpair ID */
			     0, /* vector */
			     num_entries,
			     NVME_ADMIN_TRACKERS,
			     16*1024, /* max xfer size */
			     ctrlr);
}

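/*
 * Allocate and construct all I/O queue pairs.  Queue depth comes from a
 * tunable bounded by the controller's MQES limit; the per-queue tracker
 * count comes from a tunable bounded by driver limits.
 */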
static int
nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
{
	struct nvme_qpair	*qpair;
	union cap_lo_register	cap_lo;
	int			i, num_entries, num_trackers;

	num_entries = NVME_IO_ENTRIES;
	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);

	/*
	 * NVMe spec sets a hard limit of 64K max entries, but
	 *  devices may specify a smaller limit, so we need to check
	 *  the MQES field in the capabilities register.
	 */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	num_entries = min(num_entries, cap_lo.bits.mqes+1);

	num_trackers = NVME_IO_TRACKERS;
	TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);

	num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
	num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
	/*
	 * No need to have more trackers than entries in the submit queue.
	 *  Note also that for a queue size of N, we can only have (N-1)
	 *  commands outstanding, hence the "-1" here.
	 */
	num_trackers = min(num_trackers, (num_entries-1));

	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
	TUNABLE_INT_FETCH("hw.nvme.max_xfer_size", &ctrlr->max_xfer_size);
	/*
	 * Check that the tunable doesn't specify a size greater than what
	 *  our driver supports, and that it is an even multiple of PAGE_SIZE.
	 */
	if (ctrlr->max_xfer_size > NVME_MAX_XFER_SIZE ||
	    ctrlr->max_xfer_size % PAGE_SIZE)
		ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;

	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
	    M_NVME, M_ZERO | M_NOWAIT);

	if (ctrlr->ioq == NULL)
		return (ENOMEM);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		/*
		 * Admin queue has ID=0. IO queues start at ID=1 -
		 *  hence the 'i+1' here.
		 *
		 * For I/O queues, use the controller-wide max_xfer_size
		 *  calculated in nvme_attach().
		 */
		nvme_qpair_construct(qpair,
				     i+1, /* qpair ID */
				     ctrlr->msix_enabled ? i+1 : 0, /* vector */
				     num_entries,
				     num_trackers,
				     ctrlr->max_xfer_size,
				     ctrlr);

		if (ctrlr->per_cpu_io_queues)
			bus_bind_intr(ctrlr->dev, qpair->res, i);
	}

	return (0);
}

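/*
 * Poll CSTS.RDY once per millisecond until the controller reports ready, or
 * fail if the timeout derived from CAP.TO expires first.
 */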
static int
nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr)
{
	int ms_waited;
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (!cc.bits.en) {
		device_printf(ctrlr->dev, "%s called with cc.en = 0\n",
		    __func__);
		return (ENXIO);
	}

	ms_waited = 0;

	while (!csts.bits.rdy) {
		DELAY(1000);
		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
			device_printf(ctrlr->dev, "controller did not become "
			    "ready within %d ms\n", ctrlr->ready_timeout_in_ms);
			return (ENXIO);
		}
		csts.raw = nvme_mmio_read_4(ctrlr, csts);
	}

	return (0);
}

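/*
 * Clear CC.EN to shut the controller down.  If an enable is still in
 * progress, wait for CSTS.RDY to be set first so the controller is not
 * disabled in the middle of that transition.
 */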
static void
nvme_ctrlr_disable(struct nvme_controller *ctrlr)
{
	union cc_register cc;
	union csts_register csts;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1 && csts.bits.rdy == 0)
		nvme_ctrlr_wait_for_ready(ctrlr);

	cc.bits.en = 0;
	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);
}

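/*
 * Program the admin queue base addresses and sizes, set up the controller
 * configuration register, then set CC.EN and wait for the controller to
 * report ready.
 */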
static int
nvme_ctrlr_enable(struct nvme_controller *ctrlr)
{
	union cc_register	cc;
	union csts_register	csts;
	union aqa_register	aqa;

	cc.raw = nvme_mmio_read_4(ctrlr, cc);
	csts.raw = nvme_mmio_read_4(ctrlr, csts);

	if (cc.bits.en == 1) {
		if (csts.bits.rdy == 1)
			return (0);
		else
			return (nvme_ctrlr_wait_for_ready(ctrlr));
	}

	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
	DELAY(5000);
	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
	DELAY(5000);

	aqa.raw = 0;
	/* acqs and asqs are 0-based. */
	aqa.bits.acqs = ctrlr->adminq.num_entries-1;
	aqa.bits.asqs = ctrlr->adminq.num_entries-1;
	nvme_mmio_write_4(ctrlr, aqa, aqa.raw);
	DELAY(5000);

	cc.bits.en = 1;
	cc.bits.css = 0;
	cc.bits.ams = 0;
	cc.bits.shn = 0;
	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */

	/*
	 * CC.MPS encodes the memory page size as 2^(12+MPS).  For 4KB
	 *  pages this evaluates to 0, as the spec requires.
	 */
	cc.bits.mps = (PAGE_SIZE >> 13);

	nvme_mmio_write_4(ctrlr, cc, cc.raw);
	DELAY(5000);

	return (nvme_ctrlr_wait_for_ready(ctrlr));
}

int
nvme_ctrlr_reset(struct nvme_controller *ctrlr)
{

	nvme_ctrlr_disable(ctrlr);
	return (nvme_ctrlr_enable(ctrlr));
}

/*
 * Disable this code for now: since Chatham doesn't support
 *  AERs, I have no good way to test them.
 */
#if 0
static void
nvme_async_event_cb(void *arg, const struct nvme_completion *status)
{
	struct nvme_controller *ctrlr = arg;

	printf("Asynchronous event occurred.\n");

	/* TODO: decode async event type based on status */
	/* TODO: check status for any error bits */

	/*
	 * Repost an asynchronous event request so that it can be
	 *  used again by the controller.
	 */
	nvme_ctrlr_cmd_asynchronous_event_request(ctrlr, nvme_async_event_cb,
	    ctrlr);
}
#endif

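/*
 * Issue IDENTIFY CONTROLLER synchronously and cache the returned data in
 * ctrlr->cdata.
 */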
static int
nvme_ctrlr_identify(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_identify_controller failed!\n");
		return (ENXIO);
	}

#ifdef CHATHAM2
	if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID)
		nvme_chatham_populate_cdata(ctrlr);
#endif

	return (0);
}

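/*
 * Request the desired number of I/O queues via the Number of Queues
 * feature, and fall back to a single I/O queue if the controller grants
 * fewer than requested.
 */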
static int
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_completion	cpl;
	int			cq_allocated, sq_allocated, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	mtx_lock(mtx);
	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
	    nvme_ctrlr_cb, &cpl);
	status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
	mtx_unlock(mtx);
	if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
		printf("nvme_set_num_queues failed!\n");
		return (ENXIO);
	}

	/*
	 * Data in cdw0 is 0-based.
	 * Lower 16 bits indicate the number of submission queues allocated.
	 * Upper 16 bits indicate the number of completion queues allocated.
	 */
	sq_allocated = (cpl.cdw0 & 0xFFFF) + 1;
	cq_allocated = (cpl.cdw0 >> 16) + 1;

	/*
	 * Check that the controller was able to allocate the number of
	 *  queues we requested.  If not, revert to one IO queue.
	 */
	if (sq_allocated < ctrlr->num_io_queues ||
	    cq_allocated < ctrlr->num_io_queues) {
		ctrlr->num_io_queues = 1;
		ctrlr->per_cpu_io_queues = 0;

		/*
		 * TODO: destroy the extra queues that were created
		 *  previously but are no longer needed.
		 */
	}

	return (0);
}

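/*
 * Create the hardware completion and submission queues for each I/O queue
 * pair, issuing the admin commands synchronously.
 */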
static int
nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
{
	struct mtx		*mtx;
	struct nvme_qpair	*qpair;
	struct nvme_completion	cpl;
	int			i, status;

	mtx = mtx_pool_find(mtxpool_sleep, &cpl);

	for (i = 0; i < ctrlr->num_io_queues; i++) {
		qpair = &ctrlr->ioq[i];

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_cq failed!\n");
			return (ENXIO);
		}

		mtx_lock(mtx);
		nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
		    nvme_ctrlr_cb, &cpl);
		status = msleep(&cpl, mtx, PRIBIO, "nvme_start", hz*5);
		mtx_unlock(mtx);
		if ((status != 0) || cpl.sf_sc || cpl.sf_sct) {
			printf("nvme_create_io_sq failed!\n");
			return (ENXIO);
		}
	}

	return (0);
}

static int
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
{
	struct nvme_namespace	*ns;
	int			i, status;

	for (i = 0; i < ctrlr->cdata.nn; i++) {
		ns = &ctrlr->ns[i];
		status = nvme_ns_construct(ns, i+1, ctrlr);
		if (status != 0)
			return (status);
	}

	return (0);
}

static void
nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
{
	union nvme_critical_warning_state	state;
	uint8_t					num_async_events;

	state.raw = 0xFF;
	state.bits.reserved = 0;
	nvme_ctrlr_cmd_set_asynchronous_event_config(ctrlr, state, NULL, NULL);

	/* aerl is a zero-based value, so we need to add 1 here. */
	num_async_events = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));

	/*
	 * Disable this code for now: since Chatham doesn't support
	 *  AERs, I have no good way to test them.
	 */
#if 0
	for (int i = 0; i < num_async_events; i++)
		nvme_ctrlr_cmd_asynchronous_event_request(ctrlr,
		    nvme_async_event_cb, ctrlr);
#endif
}

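/*
 * Program interrupt coalescing from the hw.nvme.int_coal_* tunables; both
 * default to 0, which leaves coalescing disabled.
 */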
static void
nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
{

	ctrlr->int_coal_time = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
	    &ctrlr->int_coal_time);

	ctrlr->int_coal_threshold = 0;
	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
	    &ctrlr->int_coal_threshold);

	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
	    ctrlr->int_coal_threshold, NULL, NULL);
}

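/*
 * Second-stage startup, run via the config intrhook once interrupts are
 * available: identify the controller, set up the I/O queue pairs and
 * namespaces, and enable AER and interrupt coalescing.
 */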
void
nvme_ctrlr_start(void *ctrlr_arg)
{
	struct nvme_controller *ctrlr = ctrlr_arg;

	if (nvme_ctrlr_identify(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_create_qpairs(ctrlr) != 0)
		goto err;

	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
		goto err;

	nvme_ctrlr_configure_aer(ctrlr);
	nvme_ctrlr_configure_int_coalescing(ctrlr);

	ctrlr->is_started = TRUE;

err:

	/*
	 * Initialize sysctls, even if the controller failed to start, to
	 *  assist with debugging the admin queue pair.
	 */
	nvme_sysctl_initialize_ctrlr(ctrlr);
	config_intrhook_disestablish(&ctrlr->config_hook);
}

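/*
 * Deferred half of the legacy INTx handler: drain the admin and I/O
 * completion queues, then unmask the interrupt.
 */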
static void
nvme_ctrlr_intx_task(void *arg, int pending)
{
	struct nvme_controller *ctrlr = arg;

	nvme_qpair_process_completions(&ctrlr->adminq);

	if (ctrlr->ioq[0].cpl)
		nvme_qpair_process_completions(&ctrlr->ioq[0]);

	nvme_mmio_write_4(ctrlr, intmc, 1);
}

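/* Legacy INTx handler: mask the interrupt and defer processing to a task. */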
static void
nvme_ctrlr_intx_handler(void *arg)
{
	struct nvme_controller *ctrlr = arg;

	nvme_mmio_write_4(ctrlr, intms, 1);
	taskqueue_enqueue_fast(ctrlr->taskqueue, &ctrlr->task);
}

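/*
 * Fall back to a single shared legacy interrupt (and a single I/O queue)
 * when MSI-X is unavailable or disabled by the hw.nvme.force_intx tunable.
 */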
static int
nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
{

	ctrlr->num_io_queues = 1;
	ctrlr->per_cpu_io_queues = 0;
	ctrlr->rid = 0;
	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);

	if (ctrlr->res == NULL) {
		device_printf(ctrlr->dev, "unable to allocate shared IRQ\n");
		return (ENOMEM);
	}

	bus_setup_intr(ctrlr->dev, ctrlr->res,
	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
	    ctrlr, &ctrlr->tag);

	if (ctrlr->tag == NULL) {
		device_printf(ctrlr->dev,
		    "unable to set up legacy interrupt handler\n");
		return (ENOMEM);
	}

	TASK_INIT(&ctrlr->task, 0, nvme_ctrlr_intx_task, ctrlr);
	ctrlr->taskqueue = taskqueue_create_fast("nvme_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_NET,
	    "%s intx taskq", device_get_nameunit(ctrlr->dev));

	return (0);
}

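/*
 * Character device ioctl handler.  Only NVME_IDENTIFY_CONTROLLER is
 * supported so far.
 */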
static int
nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvme_controller	*ctrlr;
	struct nvme_completion	cpl;
	struct mtx		*mtx;

	ctrlr = cdev->si_drv1;

	switch (cmd) {
	case NVME_IDENTIFY_CONTROLLER:
#ifdef CHATHAM2
		/*
		 * Don't refresh data on Chatham, since Chatham returns
		 *  garbage on IDENTIFY anyway.
		 */
		if (pci_get_devid(ctrlr->dev) == CHATHAM_PCI_ID) {
			memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
			break;
		}
#endif
		/* Refresh data before returning to user. */
		mtx = mtx_pool_find(mtxpool_sleep, &cpl);
		mtx_lock(mtx);
		nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
		    nvme_ctrlr_cb, &cpl);
		msleep(&cpl, mtx, PRIBIO, "nvme_ioctl", 0);
		mtx_unlock(mtx);
		if (cpl.sf_sc || cpl.sf_sct)
			return (ENXIO);
		memcpy(arg, &ctrlr->cdata, sizeof(ctrlr->cdata));
		break;
	default:
		return (ENOTTY);
	}

	return (0);
}

static struct cdevsw nvme_ctrlr_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	0,
	.d_ioctl =	nvme_ctrlr_ioctl
};

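/*
 * One-time controller construction, called at attach time: map the register
 * BAR, check capabilities, choose between MSI-X and legacy INTx, construct
 * the admin and I/O queue pairs, and create the /dev/nvme%d node.
 */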
int
nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
{
	union cap_lo_register	cap_lo;
	union cap_hi_register	cap_hi;
	int			num_vectors, per_cpu_io_queues, status = 0;

	ctrlr->dev = dev;
	ctrlr->is_started = FALSE;

	status = nvme_ctrlr_allocate_bar(ctrlr);

	if (status != 0)
		return (status);

#ifdef CHATHAM2
	if (pci_get_devid(dev) == CHATHAM_PCI_ID) {
		status = nvme_ctrlr_allocate_chatham_bar(ctrlr);
		if (status != 0)
			return (status);
		nvme_ctrlr_setup_chatham(ctrlr);
	}
#endif

	/*
	 * Software emulators may set the doorbell stride to something
	 *  other than zero, but this driver is not set up to handle that.
	 */
	cap_hi.raw = nvme_mmio_read_4(ctrlr, cap_hi);
	if (cap_hi.bits.dstrd != 0)
		return (ENXIO);

	/* Get ready timeout value from controller, in units of 500ms. */
	cap_lo.raw = nvme_mmio_read_4(ctrlr, cap_lo);
	ctrlr->ready_timeout_in_ms = cap_lo.bits.to * 500;

	per_cpu_io_queues = 1;
	TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues);
	ctrlr->per_cpu_io_queues = per_cpu_io_queues ? TRUE : FALSE;

	if (ctrlr->per_cpu_io_queues)
		ctrlr->num_io_queues = mp_ncpus;
	else
		ctrlr->num_io_queues = 1;

	ctrlr->force_intx = 0;
	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

	ctrlr->msix_enabled = 1;

	if (ctrlr->force_intx) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	/* One vector per IO queue, plus one vector for admin queue. */
	num_vectors = ctrlr->num_io_queues + 1;

	if (pci_msix_count(dev) < num_vectors) {
		ctrlr->msix_enabled = 0;
		goto intx;
	}

	if (pci_alloc_msix(dev, &num_vectors) != 0)
		ctrlr->msix_enabled = 0;

intx:

	if (!ctrlr->msix_enabled)
		nvme_ctrlr_configure_intx(ctrlr);

	nvme_ctrlr_construct_admin_qpair(ctrlr);

	status = nvme_ctrlr_construct_io_qpairs(ctrlr);

	if (status != 0)
		return (status);

	ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
	    "nvme%d", device_get_unit(dev));

	if (ctrlr->cdev == NULL)
		return (ENXIO);

	ctrlr->cdev->si_drv1 = (void *)ctrlr;

	return (0);
}

void
nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{

	nvme_qpair_submit_request(&ctrlr->adminq, req);
}

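/*
 * Route the request to the submitting CPU's I/O queue when per-CPU queues
 * are enabled, otherwise to the single shared I/O queue.
 */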
void
nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
    struct nvme_request *req)
{
	struct nvme_qpair	*qpair;

	if (ctrlr->per_cpu_io_queues)
		qpair = &ctrlr->ioq[curcpu];
	else
		qpair = &ctrlr->ioq[0];

	nvme_qpair_submit_request(qpair, req);
}