xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 4b50c451720d8b427757a6da1dd2bb4c52cd9e35)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72 
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76 
77 #include <dev/nvme/nvme.h>
78 
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "debug.h"
82 #include "pci_emul.h"
83 
84 
/* Set to non-zero to enable the DPRINTF() trace output below. */
static int nvme_debug = 0;

/*
 * Logging helpers.  Both take a doubly-parenthesized argument list, e.g.
 * DPRINTF(("%s", __func__)), which is handed to PRINTLN unchanged.
 *
 * DPRINTF is wrapped in do { } while (0) so that it expands to exactly one
 * statement: a bare "if" would silently capture an "else" that follows the
 * macro invocation in the caller.
 */
#define	DPRINTF(params) do { if (nvme_debug) PRINTLN params; } while (0)
#define	WPRINTF(params) PRINTLN params
88 
/* Default values; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* Bits 13:4 of BAR0 are reserved by the NVMe spec */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Number of 64-bit entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* Synthetic status value meaning "no status has been set" */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
105 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
106 
107 /* helpers */
108 
/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features: the zero-based
 * submission queue count goes in bits 15:0 and the zero-based completion
 * queue count in bits 31:16.
 *
 * The expansion is fully parenthesized and carries no trailing semicolon so
 * the macro can be used inside any expression, not only as the entire
 * right-hand side of an assignment.
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
120 
/* Byte offsets of the controller registers handled by this emulation */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW	= 0x00,
	NVME_CR_CAP_HI	= 0x04,
	NVME_CR_VS	= 0x08,
	NVME_CR_INTMS	= 0x0c,
	NVME_CR_INTMC	= 0x10,
	NVME_CR_CC	= 0x14,
	/* 0x18 is not named here */
	NVME_CR_CSTS	= 0x1c,
	NVME_CR_NSSR	= 0x20,
	NVME_CR_AQA	= 0x24,
	NVME_CR_ASQ_LOW	= 0x28,
	NVME_CR_ASQ_HI	= 0x2c,
	NVME_CR_ACQ_LOW	= 0x30,
	NVME_CR_ACQ_HI	= 0x34,
};
136 
/* Masks applied to command dword 11 by the queue-creation handlers */
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,		/* bit 0 */
	NVME_CMD_CDW11_IEN = 0x0002,		/* bit 1 */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,	/* bits 31:16 */
};
142 
/* Flag values for nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

/* Emulation-side state of one completion queue */
struct nvme_completion_queue {
	struct nvme_completion *qbase;	/* host mapping of the queue memory */
	uint32_t	size;		/* number of entries */
	uint16_t	tail;		/* nvme progress */
	uint16_t	head;		/* guest progress */
	uint16_t	intr_vec;	/* interrupt vector for this queue */
	uint32_t	intr_en;	/* NVME_CQ_* flags */
	pthread_mutex_t	mtx;
};
155 
/* Emulation-side state of one submission queue */
struct nvme_submission_queue {
	struct nvme_command *qbase;	/* host mapping of the queue memory */
	uint32_t	size;		/* number of entries */
	uint16_t	head;		/* nvme progress */
	uint16_t	tail;		/* guest progress */
	uint16_t	cqid;		/* completion queue id */
	int		busy;		/* queue is being processed */
	int		qpriority;
};
165 
/* Kind of backing store behind the emulated namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

/* Description of the backing store */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;		/* backend-specific handle */
	uint64_t	size;		/* divided by sectsz to get the LBA count */
	uint32_t	sectsz;		/* sector size */
	uint32_t	sectsz_bits;	/* reported as the LBA data size exponent */
	uint64_t	eui64;
};
179 
180 struct pci_nvme_ioreq {
181 	struct pci_nvme_softc *sc;
182 	struct pci_nvme_ioreq *next;
183 	struct nvme_submission_queue *nvme_sq;
184 	uint16_t	sqid;
185 
186 	/* command information */
187 	uint16_t	opc;
188 	uint16_t	cid;
189 	uint32_t	nsid;
190 
191 	uint64_t	prev_gpaddr;
192 	size_t		prev_size;
193 
194 	/*
195 	 * lock if all iovs consumed (big IO);
196 	 * complete transaction before continuing
197 	 */
198 	pthread_mutex_t	mtx;
199 	pthread_cond_t	cv;
200 
201 	struct blockif_req io_req;
202 
203 	/* pad to fit up to 512 page descriptors from guest IO request */
204 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
205 };
206 
207 struct pci_nvme_softc {
208 	struct pci_devinst *nsc_pi;
209 
210 	pthread_mutex_t	mtx;
211 
212 	struct nvme_registers regs;
213 
214 	struct nvme_namespace_data  nsdata;
215 	struct nvme_controller_data ctrldata;
216 	struct nvme_error_information_entry err_log;
217 	struct nvme_health_information_page health_log;
218 	struct nvme_firmware_page fw_log;
219 
220 	struct pci_nvme_blockstore nvstore;
221 
222 	uint16_t	max_qentries;	/* max entries per queue */
223 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
224 	uint32_t	num_cqueues;
225 	uint32_t	num_squeues;
226 
227 	struct pci_nvme_ioreq *ioreqs;
228 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
229 	uint32_t	pending_ios;
230 	uint32_t	ioslots;
231 	sem_t		iosemlock;
232 
233 	/*
234 	 * Memory mapped Submission and Completion queues
235 	 * Each array includes both Admin and IO queues
236 	 */
237 	struct nvme_completion_queue *compl_queues;
238 	struct nvme_submission_queue *submit_queues;
239 
240 	/* controller features */
241 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
242 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
243 	uint32_t	async_ev_config;         /* 0x0B: async event config */
244 };
245 
246 
247 static void pci_nvme_io_partial(struct blockif_req *br, int err);
248 
/* Controller Configuration (CC) register: field extractors */
#define	NVME_CC_GET_EN(cc) \
	(((cc) >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	(((cc) >> NVME_CC_REG_CSS_SHIFT) & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	(((cc) >> NVME_CC_REG_SHN_SHIFT) & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	(((cc) >> NVME_CC_REG_IOSQES_SHIFT) & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	(((cc) >> NVME_CC_REG_IOCQES_SHIFT) & NVME_CC_REG_IOCQES_MASK)

/* Mask covering the EN, IOSQES and IOCQES fields of CC */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/* Mask covering the CSS, MPS and AMS fields of CC */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status (CSTS) register helpers */
#define	NVME_CSTS_GET_RDY(sts) \
	(((sts) >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion queue entry status word helpers */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
282 
/*
 * Fill dst (dst_size bytes) with the pad character, then copy in the leading
 * part of the string src, up to dst_size bytes.  The result is not
 * NUL-terminated.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t ncopy = 0;

	/* Length of src, bounded by the destination size */
	while (ncopy < dst_size && src[ncopy] != '\0')
		ncopy++;

	memset(dst, pad, dst_size);
	memcpy(dst, src, ncopy);
}
292 
293 static __inline void
294 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
295 {
296 
297 	*status &= ~NVME_STATUS_MASK;
298 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
299 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
300 }
301 
302 static __inline void
303 pci_nvme_status_genc(uint16_t *status, uint16_t code)
304 {
305 
306 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
307 }
308 
309 static __inline void
310 pci_nvme_toggle_phase(uint16_t *status, int prev)
311 {
312 
313 	if (prev)
314 		*status &= ~NVME_STATUS_P;
315 	else
316 		*status |= NVME_STATUS_P;
317 }
318 
319 static void
320 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
321 {
322 	struct nvme_controller_data *cd = &sc->ctrldata;
323 
324 	cd->vid = 0xFB5D;
325 	cd->ssvid = 0x0000;
326 
327 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
328 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
329 
330 	/* Num of submission commands that we can handle at a time (2^rab) */
331 	cd->rab   = 4;
332 
333 	/* FreeBSD OUI */
334 	cd->ieee[0] = 0x58;
335 	cd->ieee[1] = 0x9c;
336 	cd->ieee[2] = 0xfc;
337 
338 	cd->mic = 0;
339 
340 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
341 
342 	cd->ver = 0x00010300;
343 
344 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
345 	cd->acl = 2;
346 	cd->aerl = 4;
347 
348 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
349 	cd->elpe = 0;	/* max error log page entries */
350 	cd->npss = 1;	/* number of power states support */
351 
352 	/* Warning Composite Temperature Threshold */
353 	cd->wctemp = 0x0157;
354 
355 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
356 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
357 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
358 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
359 	cd->nn = 1;	/* number of namespaces */
360 
361 	cd->fna = 0x03;
362 
363 	cd->power_state[0].mp = 10;
364 }
365 
/*
 * Table-driven CRC-16 over "len" bytes of "buffer", continuing from the
 * running value "crc" (pass 0 to start a new checksum).
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const lut[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	const unsigned char *bytes = buffer;
	unsigned int i;

	for (i = 0; i < len; i++) {
		/* The table index is the low byte of (crc XOR next input byte) */
		unsigned int idx = (crc ^ bytes[i]) & 0xffU;

		crc = (uint16_t)(((crc >> 8) & 0xffU) ^ lut[idx]);
	}

	return crc;
}
415 
416 static void
417 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
418     struct nvme_namespace_data *nd, uint32_t nsid,
419     uint64_t eui64)
420 {
421 
422 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
423 	nd->ncap = nd->nsze;
424 	nd->nuse = nd->nsze;
425 
426 	/* Get LBA and backstore information from backing store */
427 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
428 	nd->flbas = 0;
429 
430 	/* Create an EUI-64 if user did not provide one */
431 	if (eui64 == 0) {
432 		char *data = NULL;
433 
434 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
435 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
436 
437 		if (data != NULL) {
438 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
439 			free(data);
440 		}
441 		eui64 = (eui64 << 16) | (nsid & 0xffff);
442 	}
443 	be64enc(nd->eui64, eui64);
444 
445 	/* LBA data-sz = 2^lbads */
446 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
447 }
448 
449 static void
450 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
451 {
452 
453 	memset(&sc->err_log, 0, sizeof(sc->err_log));
454 	memset(&sc->health_log, 0, sizeof(sc->health_log));
455 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
456 }
457 
458 static void
459 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
460 {
461 	DPRINTF(("%s", __func__));
462 
463 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
464 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
465 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
466 
467 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
468 
469 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
470 
471 	sc->regs.cc = 0;
472 	sc->regs.csts = 0;
473 
474 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
475 	if (sc->submit_queues != NULL) {
476 		for (int i = 0; i < sc->num_squeues + 1; i++) {
477 			/*
478 			 * The Admin Submission Queue is at index 0.
479 			 * It must not be changed at reset otherwise the
480 			 * emulation will be out of sync with the guest.
481 			 */
482 			if (i != 0) {
483 				sc->submit_queues[i].qbase = NULL;
484 				sc->submit_queues[i].size = 0;
485 				sc->submit_queues[i].cqid = 0;
486 			}
487 			sc->submit_queues[i].tail = 0;
488 			sc->submit_queues[i].head = 0;
489 			sc->submit_queues[i].busy = 0;
490 		}
491 	} else
492 		sc->submit_queues = calloc(sc->num_squeues + 1,
493 		                        sizeof(struct nvme_submission_queue));
494 
495 	if (sc->compl_queues != NULL) {
496 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
497 			/* See Admin Submission Queue note above */
498 			if (i != 0) {
499 				sc->compl_queues[i].qbase = NULL;
500 				sc->compl_queues[i].size = 0;
501 			}
502 
503 			sc->compl_queues[i].tail = 0;
504 			sc->compl_queues[i].head = 0;
505 		}
506 	} else {
507 		sc->compl_queues = calloc(sc->num_cqueues + 1,
508 		                        sizeof(struct nvme_completion_queue));
509 
510 		for (int i = 0; i < sc->num_cqueues + 1; i++)
511 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
512 	}
513 }
514 
515 static void
516 pci_nvme_reset(struct pci_nvme_softc *sc)
517 {
518 	pthread_mutex_lock(&sc->mtx);
519 	pci_nvme_reset_locked(sc);
520 	pthread_mutex_unlock(&sc->mtx);
521 }
522 
523 static void
524 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
525 {
526 	uint16_t acqs, asqs;
527 
528 	DPRINTF(("%s", __func__));
529 
530 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
531 	sc->submit_queues[0].size = asqs;
532 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
533 	            sizeof(struct nvme_command) * asqs);
534 
535 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
536 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
537 
538 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
539 	    NVME_AQA_REG_ACQS_MASK) + 1;
540 	sc->compl_queues[0].size = acqs;
541 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
542 	         sizeof(struct nvme_completion) * acqs);
543 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
544 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
545 }
546 
547 static int
548 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
549 	size_t len)
550 {
551 	uint8_t *dst;
552 	size_t bytes;
553 
554 	if (len > (8 * 1024)) {
555 		return (-1);
556 	}
557 
558 	/* Copy from the start of prp1 to the end of the physical page */
559 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
560 	bytes = MIN(bytes, len);
561 
562 	dst = vm_map_gpa(ctx, prp1, bytes);
563 	if (dst == NULL) {
564 		return (-1);
565 	}
566 
567 	memcpy(dst, src, bytes);
568 
569 	src += bytes;
570 
571 	len -= bytes;
572 	if (len == 0) {
573 		return (0);
574 	}
575 
576 	len = MIN(len, PAGE_SIZE);
577 
578 	dst = vm_map_gpa(ctx, prp2, len);
579 	if (dst == NULL) {
580 		return (-1);
581 	}
582 
583 	memcpy(dst, src, len);
584 
585 	return (0);
586 }
587 
588 static int
589 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
590 	struct nvme_completion* compl)
591 {
592 	uint16_t qid = command->cdw10 & 0xffff;
593 
594 	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
595 	if (qid == 0 || qid > sc->num_squeues) {
596 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
597 		        __func__, qid, sc->num_squeues));
598 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
599 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
600 		return (1);
601 	}
602 
603 	sc->submit_queues[qid].qbase = NULL;
604 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
605 	return (1);
606 }
607 
608 static int
609 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
610 	struct nvme_completion* compl)
611 {
612 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
613 		uint16_t qid = command->cdw10 & 0xffff;
614 		struct nvme_submission_queue *nsq;
615 
616 		if ((qid == 0) || (qid > sc->num_squeues)) {
617 			WPRINTF(("%s queue index %u > num_squeues %u",
618 			        __func__, qid, sc->num_squeues));
619 			pci_nvme_status_tc(&compl->status,
620 			    NVME_SCT_COMMAND_SPECIFIC,
621 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
622 			return (1);
623 		}
624 
625 		nsq = &sc->submit_queues[qid];
626 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
627 
628 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
629 		              sizeof(struct nvme_command) * (size_t)nsq->size);
630 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
631 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
632 
633 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
634 		        qid, nsq->size, nsq->qbase, nsq->cqid));
635 
636 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
637 
638 		DPRINTF(("%s completed creating IOSQ qid %u",
639 		         __func__, qid));
640 	} else {
641 		/*
642 		 * Guest sent non-cont submission queue request.
643 		 * This setting is unsupported by this emulation.
644 		 */
645 		WPRINTF(("%s unsupported non-contig (list-based) "
646 		         "create i/o submission queue", __func__));
647 
648 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
649 	}
650 	return (1);
651 }
652 
653 static int
654 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
655 	struct nvme_completion* compl)
656 {
657 	uint16_t qid = command->cdw10 & 0xffff;
658 
659 	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
660 	if (qid == 0 || qid > sc->num_cqueues) {
661 		WPRINTF(("%s queue index %u / num_cqueues %u",
662 		        __func__, qid, sc->num_cqueues));
663 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
664 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
665 		return (1);
666 	}
667 
668 	sc->compl_queues[qid].qbase = NULL;
669 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
670 	return (1);
671 }
672 
673 static int
674 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
675 	struct nvme_completion* compl)
676 {
677 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
678 		uint16_t qid = command->cdw10 & 0xffff;
679 		struct nvme_completion_queue *ncq;
680 
681 		if ((qid == 0) || (qid > sc->num_cqueues)) {
682 			WPRINTF(("%s queue index %u > num_cqueues %u",
683 			        __func__, qid, sc->num_cqueues));
684 			pci_nvme_status_tc(&compl->status,
685 			    NVME_SCT_COMMAND_SPECIFIC,
686 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
687 			return (1);
688 		}
689 
690 		ncq = &sc->compl_queues[qid];
691 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
692 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
693 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
694 
695 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
696 		             command->prp1,
697 		             sizeof(struct nvme_command) * (size_t)ncq->size);
698 
699 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
700 	} else {
701 		/*
702 		 * Non-contig completion queue unsupported.
703 		 */
704 		WPRINTF(("%s unsupported non-contig (list-based) "
705 		         "create i/o completion queue",
706 		         __func__));
707 
708 		/* 0x12 = Invalid Use of Controller Memory Buffer */
709 		pci_nvme_status_genc(&compl->status, 0x12);
710 	}
711 
712 	return (1);
713 }
714 
715 static int
716 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
717 	struct nvme_completion* compl)
718 {
719 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
720 	uint8_t logpage = command->cdw10 & 0xFF;
721 
722 	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
723 
724 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
725 
726 	switch (logpage) {
727 	case NVME_LOG_ERROR:
728 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
729 		    command->prp2, (uint8_t *)&sc->err_log, logsize);
730 		break;
731 	case NVME_LOG_HEALTH_INFORMATION:
732 		/* TODO: present some smart info */
733 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
734 		    command->prp2, (uint8_t *)&sc->health_log, logsize);
735 		break;
736 	case NVME_LOG_FIRMWARE_SLOT:
737 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
738 		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
739 		break;
740 	default:
741 		WPRINTF(("%s get log page %x command not supported",
742 		        __func__, logpage));
743 
744 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
745 		    NVME_SC_INVALID_LOG_PAGE);
746 	}
747 
748 	return (1);
749 }
750 
751 static int
752 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
753 	struct nvme_completion* compl)
754 {
755 	void *dest;
756 
757 	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
758 	        command->cdw10 & 0xFF, command->nsid));
759 
760 	switch (command->cdw10 & 0xFF) {
761 	case 0x00: /* return Identify Namespace data structure */
762 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
763 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
764 		break;
765 	case 0x01: /* return Identify Controller data structure */
766 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
767 		    command->prp2, (uint8_t *)&sc->ctrldata,
768 		    sizeof(sc->ctrldata));
769 		break;
770 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
771 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
772 		                  sizeof(uint32_t) * 1024);
773 		((uint32_t *)dest)[0] = 1;
774 		((uint32_t *)dest)[1] = 0;
775 		break;
776 	case 0x11:
777 		pci_nvme_status_genc(&compl->status,
778 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
779 		return (1);
780 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
781 	case 0x10:
782 	case 0x12:
783 	case 0x13:
784 	case 0x14:
785 	case 0x15:
786 	default:
787 		DPRINTF(("%s unsupported identify command requested 0x%x",
788 		         __func__, command->cdw10 & 0xFF));
789 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
790 		return (1);
791 	}
792 
793 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
794 	return (1);
795 }
796 
797 static int
798 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
799 	struct nvme_completion* compl)
800 {
801 	uint16_t nqr;	/* Number of Queues Requested */
802 
803 	nqr = command->cdw11 & 0xFFFF;
804 	if (nqr == 0xffff) {
805 		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
806 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
807 		return (-1);
808 	}
809 
810 	sc->num_squeues = ONE_BASED(nqr);
811 	if (sc->num_squeues > sc->max_queues) {
812 		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
813 					sc->max_queues));
814 		sc->num_squeues = sc->max_queues;
815 	}
816 
817 	nqr = (command->cdw11 >> 16) & 0xFFFF;
818 	if (nqr == 0xffff) {
819 		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
820 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
821 		return (-1);
822 	}
823 
824 	sc->num_cqueues = ONE_BASED(nqr);
825 	if (sc->num_cqueues > sc->max_queues) {
826 		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
827 					sc->max_queues));
828 		sc->num_cqueues = sc->max_queues;
829 	}
830 
831 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
832 
833 	return (0);
834 }
835 
836 static int
837 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
838 	struct nvme_completion* compl)
839 {
840 	int feature = command->cdw10 & 0xFF;
841 	uint32_t iv;
842 
843 	DPRINTF(("%s feature 0x%x", __func__, feature));
844 	compl->cdw0 = 0;
845 
846 	switch (feature) {
847 	case NVME_FEAT_ARBITRATION:
848 		DPRINTF(("  arbitration 0x%x", command->cdw11));
849 		break;
850 	case NVME_FEAT_POWER_MANAGEMENT:
851 		DPRINTF(("  power management 0x%x", command->cdw11));
852 		break;
853 	case NVME_FEAT_LBA_RANGE_TYPE:
854 		DPRINTF(("  lba range 0x%x", command->cdw11));
855 		break;
856 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
857 		DPRINTF(("  temperature threshold 0x%x", command->cdw11));
858 		break;
859 	case NVME_FEAT_ERROR_RECOVERY:
860 		DPRINTF(("  error recovery 0x%x", command->cdw11));
861 		break;
862 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
863 		DPRINTF(("  volatile write cache 0x%x", command->cdw11));
864 		break;
865 	case NVME_FEAT_NUMBER_OF_QUEUES:
866 		nvme_set_feature_queues(sc, command, compl);
867 		break;
868 	case NVME_FEAT_INTERRUPT_COALESCING:
869 		DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));
870 
871 		/* in uS */
872 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
873 
874 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
875 		break;
876 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
877 		iv = command->cdw11 & 0xFFFF;
878 
879 		DPRINTF(("  interrupt vector configuration 0x%x",
880 		        command->cdw11));
881 
882 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
883 			if (sc->compl_queues[i].intr_vec == iv) {
884 				if (command->cdw11 & (1 << 16))
885 					sc->compl_queues[i].intr_en |=
886 					                      NVME_CQ_INTCOAL;
887 				else
888 					sc->compl_queues[i].intr_en &=
889 					                     ~NVME_CQ_INTCOAL;
890 			}
891 		}
892 		break;
893 	case NVME_FEAT_WRITE_ATOMICITY:
894 		DPRINTF(("  write atomicity 0x%x", command->cdw11));
895 		break;
896 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
897 		DPRINTF(("  async event configuration 0x%x",
898 		        command->cdw11));
899 		sc->async_ev_config = command->cdw11;
900 		break;
901 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
902 		DPRINTF(("  software progress marker 0x%x",
903 		        command->cdw11));
904 		break;
905 	case 0x0C:
906 		DPRINTF(("  autonomous power state transition 0x%x",
907 		        command->cdw11));
908 		break;
909 	default:
910 		WPRINTF(("%s invalid feature", __func__));
911 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
912 		return (1);
913 	}
914 
915 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
916 	return (1);
917 }
918 
919 static int
920 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
921 	struct nvme_completion* compl)
922 {
923 	int feature = command->cdw10 & 0xFF;
924 
925 	DPRINTF(("%s feature 0x%x", __func__, feature));
926 
927 	compl->cdw0 = 0;
928 
929 	switch (feature) {
930 	case NVME_FEAT_ARBITRATION:
931 		DPRINTF(("  arbitration"));
932 		break;
933 	case NVME_FEAT_POWER_MANAGEMENT:
934 		DPRINTF(("  power management"));
935 		break;
936 	case NVME_FEAT_LBA_RANGE_TYPE:
937 		DPRINTF(("  lba range"));
938 		break;
939 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
940 		DPRINTF(("  temperature threshold"));
941 		switch ((command->cdw11 >> 20) & 0x3) {
942 		case 0:
943 			/* Over temp threshold */
944 			compl->cdw0 = 0xFFFF;
945 			break;
946 		case 1:
947 			/* Under temp threshold */
948 			compl->cdw0 = 0;
949 			break;
950 		default:
951 			WPRINTF(("  invalid threshold type select"));
952 			pci_nvme_status_genc(&compl->status,
953 			    NVME_SC_INVALID_FIELD);
954 			return (1);
955 		}
956 		break;
957 	case NVME_FEAT_ERROR_RECOVERY:
958 		DPRINTF(("  error recovery"));
959 		break;
960 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
961 		DPRINTF(("  volatile write cache"));
962 		break;
963 	case NVME_FEAT_NUMBER_OF_QUEUES:
964 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
965 
966 		DPRINTF(("  number of queues (submit %u, completion %u)",
967 		        compl->cdw0 & 0xFFFF,
968 		        (compl->cdw0 >> 16) & 0xFFFF));
969 
970 		break;
971 	case NVME_FEAT_INTERRUPT_COALESCING:
972 		DPRINTF(("  interrupt coalescing"));
973 		break;
974 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
975 		DPRINTF(("  interrupt vector configuration"));
976 		break;
977 	case NVME_FEAT_WRITE_ATOMICITY:
978 		DPRINTF(("  write atomicity"));
979 		break;
980 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
981 		DPRINTF(("  async event configuration"));
982 		sc->async_ev_config = command->cdw11;
983 		break;
984 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
985 		DPRINTF(("  software progress marker"));
986 		break;
987 	case 0x0C:
988 		DPRINTF(("  autonomous power state transition"));
989 		break;
990 	default:
991 		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
992 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
993 		return (1);
994 	}
995 
996 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
997 	return (1);
998 }
999 
1000 static int
1001 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1002 	struct nvme_completion* compl)
1003 {
1004 	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
1005 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1006 
1007 	/* TODO: search for the command ID and abort it */
1008 
1009 	compl->cdw0 = 1;
1010 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1011 	return (1);
1012 }
1013 
1014 static int
1015 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1016 	struct nvme_command* command, struct nvme_completion* compl)
1017 {
1018 	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
1019 
1020 	/*
1021 	 * TODO: raise events when they happen based on the Set Features cmd.
1022 	 * These events happen async, so only set completion successful if
1023 	 * there is an event reflective of the request to get event.
1024 	 */
1025 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1026 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1027 	return (0);
1028 }
1029 
1030 static void
1031 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1032 {
1033 	struct nvme_completion compl;
1034 	struct nvme_command *cmd;
1035 	struct nvme_submission_queue *sq;
1036 	struct nvme_completion_queue *cq;
1037 	int do_intr = 0;
1038 	uint16_t sqhead;
1039 
1040 	DPRINTF(("%s index %u", __func__, (uint32_t)value));
1041 
1042 	sq = &sc->submit_queues[0];
1043 
1044 	sqhead = atomic_load_acq_short(&sq->head);
1045 
1046 	if (atomic_testandset_int(&sq->busy, 1)) {
1047 		DPRINTF(("%s SQ busy, head %u, tail %u",
1048 		        __func__, sqhead, sq->tail));
1049 		return;
1050 	}
1051 
1052 	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
1053 
1054 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1055 		cmd = &(sq->qbase)[sqhead];
1056 		compl.cdw0 = 0;
1057 		compl.status = 0;
1058 
1059 		switch (cmd->opc) {
1060 		case NVME_OPC_DELETE_IO_SQ:
1061 			DPRINTF(("%s command DELETE_IO_SQ", __func__));
1062 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1063 			break;
1064 		case NVME_OPC_CREATE_IO_SQ:
1065 			DPRINTF(("%s command CREATE_IO_SQ", __func__));
1066 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1067 			break;
1068 		case NVME_OPC_DELETE_IO_CQ:
1069 			DPRINTF(("%s command DELETE_IO_CQ", __func__));
1070 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1071 			break;
1072 		case NVME_OPC_CREATE_IO_CQ:
1073 			DPRINTF(("%s command CREATE_IO_CQ", __func__));
1074 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1075 			break;
1076 		case NVME_OPC_GET_LOG_PAGE:
1077 			DPRINTF(("%s command GET_LOG_PAGE", __func__));
1078 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1079 			break;
1080 		case NVME_OPC_IDENTIFY:
1081 			DPRINTF(("%s command IDENTIFY", __func__));
1082 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
1083 			break;
1084 		case NVME_OPC_ABORT:
1085 			DPRINTF(("%s command ABORT", __func__));
1086 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
1087 			break;
1088 		case NVME_OPC_SET_FEATURES:
1089 			DPRINTF(("%s command SET_FEATURES", __func__));
1090 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1091 			break;
1092 		case NVME_OPC_GET_FEATURES:
1093 			DPRINTF(("%s command GET_FEATURES", __func__));
1094 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1095 			break;
1096 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1097 			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
1098 			/* XXX dont care, unhandled for now
1099 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
1100 			*/
1101 			compl.status = NVME_NO_STATUS;
1102 			break;
1103 		default:
1104 			WPRINTF(("0x%x command is not implemented",
1105 			    cmd->opc));
1106 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1107 			do_intr |= 1;
1108 		}
1109 
1110 		if (NVME_COMPLETION_VALID(compl)) {
1111 			struct nvme_completion *cp;
1112 			int phase;
1113 
1114 			cq = &sc->compl_queues[0];
1115 
1116 			cp = &(cq->qbase)[cq->tail];
1117 			cp->cdw0 = compl.cdw0;
1118 			cp->sqid = 0;
1119 			cp->sqhd = sqhead;
1120 			cp->cid = cmd->cid;
1121 
1122 			phase = NVME_STATUS_GET_P(cp->status);
1123 			cp->status = compl.status;
1124 			pci_nvme_toggle_phase(&cp->status, phase);
1125 
1126 			cq->tail = (cq->tail + 1) % cq->size;
1127 		}
1128 		sqhead = (sqhead + 1) % sq->size;
1129 	}
1130 
1131 	DPRINTF(("setting sqhead %u", sqhead));
1132 	atomic_store_short(&sq->head, sqhead);
1133 	atomic_store_int(&sq->busy, 0);
1134 
1135 	if (do_intr)
1136 		pci_generate_msix(sc->nsc_pi, 0);
1137 
1138 }
1139 
1140 static int
1141 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1142 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1143 {
1144 	int iovidx;
1145 
1146 	if (req != NULL) {
1147 		/* concatenate contig block-iovs to minimize number of iovs */
1148 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1149 			iovidx = req->io_req.br_iovcnt - 1;
1150 
1151 			req->io_req.br_iov[iovidx].iov_base =
1152 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1153 			                     req->prev_gpaddr, size);
1154 
1155 			req->prev_size += size;
1156 			req->io_req.br_resid += size;
1157 
1158 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1159 		} else {
1160 			pthread_mutex_lock(&req->mtx);
1161 
1162 			iovidx = req->io_req.br_iovcnt;
1163 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1164 				int err = 0;
1165 
1166 				DPRINTF(("large I/O, doing partial req"));
1167 
1168 				iovidx = 0;
1169 				req->io_req.br_iovcnt = 0;
1170 
1171 				req->io_req.br_callback = pci_nvme_io_partial;
1172 
1173 				if (!do_write)
1174 					err = blockif_read(sc->nvstore.ctx,
1175 					                   &req->io_req);
1176 				else
1177 					err = blockif_write(sc->nvstore.ctx,
1178 					                    &req->io_req);
1179 
1180 				/* wait until req completes before cont */
1181 				if (err == 0)
1182 					pthread_cond_wait(&req->cv, &req->mtx);
1183 			}
1184 			if (iovidx == 0) {
1185 				req->io_req.br_offset = lba;
1186 				req->io_req.br_resid = 0;
1187 				req->io_req.br_param = req;
1188 			}
1189 
1190 			req->io_req.br_iov[iovidx].iov_base =
1191 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1192 			                     gpaddr, size);
1193 
1194 			req->io_req.br_iov[iovidx].iov_len = size;
1195 
1196 			req->prev_gpaddr = gpaddr;
1197 			req->prev_size = size;
1198 			req->io_req.br_resid += size;
1199 
1200 			req->io_req.br_iovcnt++;
1201 
1202 			pthread_mutex_unlock(&req->mtx);
1203 		}
1204 	} else {
1205 		/* RAM buffer: read/write directly */
1206 		void *p = sc->nvstore.ctx;
1207 		void *gptr;
1208 
1209 		if ((lba + size) > sc->nvstore.size) {
1210 			WPRINTF(("%s write would overflow RAM", __func__));
1211 			return (-1);
1212 		}
1213 
1214 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1215 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1216 		if (do_write)
1217 			memcpy(p, gptr, size);
1218 		else
1219 			memcpy(gptr, p, size);
1220 	}
1221 	return (0);
1222 }
1223 
1224 static void
1225 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1226 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1227 	uint32_t cdw0, uint16_t status, int ignore_busy)
1228 {
1229 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1230 	struct nvme_completion *compl;
1231 	int do_intr = 0;
1232 	int phase;
1233 
1234 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1235 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1236 		 NVME_STATUS_GET_SC(status)));
1237 
1238 	pthread_mutex_lock(&cq->mtx);
1239 
1240 	assert(cq->qbase != NULL);
1241 
1242 	compl = &cq->qbase[cq->tail];
1243 
1244 	compl->sqhd = atomic_load_acq_short(&sq->head);
1245 	compl->sqid = sqid;
1246 	compl->cid = cid;
1247 
1248 	// toggle phase
1249 	phase = NVME_STATUS_GET_P(compl->status);
1250 	compl->status = status;
1251 	pci_nvme_toggle_phase(&compl->status, phase);
1252 
1253 	cq->tail = (cq->tail + 1) % cq->size;
1254 
1255 	if (cq->intr_en & NVME_CQ_INTEN)
1256 		do_intr = 1;
1257 
1258 	pthread_mutex_unlock(&cq->mtx);
1259 
1260 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1261 		if (do_intr)
1262 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1263 }
1264 
1265 static void
1266 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1267 {
1268 	req->sc = NULL;
1269 	req->nvme_sq = NULL;
1270 	req->sqid = 0;
1271 
1272 	pthread_mutex_lock(&sc->mtx);
1273 
1274 	req->next = sc->ioreqs_free;
1275 	sc->ioreqs_free = req;
1276 	sc->pending_ios--;
1277 
1278 	/* when no more IO pending, can set to ready if device reset/enabled */
1279 	if (sc->pending_ios == 0 &&
1280 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1281 		sc->regs.csts |= NVME_CSTS_RDY;
1282 
1283 	pthread_mutex_unlock(&sc->mtx);
1284 
1285 	sem_post(&sc->iosemlock);
1286 }
1287 
1288 static struct pci_nvme_ioreq *
1289 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1290 {
1291 	struct pci_nvme_ioreq *req = NULL;;
1292 
1293 	sem_wait(&sc->iosemlock);
1294 	pthread_mutex_lock(&sc->mtx);
1295 
1296 	req = sc->ioreqs_free;
1297 	assert(req != NULL);
1298 
1299 	sc->ioreqs_free = req->next;
1300 
1301 	req->next = NULL;
1302 	req->sc = sc;
1303 
1304 	sc->pending_ios++;
1305 
1306 	pthread_mutex_unlock(&sc->mtx);
1307 
1308 	req->io_req.br_iovcnt = 0;
1309 	req->io_req.br_offset = 0;
1310 	req->io_req.br_resid = 0;
1311 	req->io_req.br_param = req;
1312 	req->prev_gpaddr = 0;
1313 	req->prev_size = 0;
1314 
1315 	return req;
1316 }
1317 
1318 static void
1319 pci_nvme_io_done(struct blockif_req *br, int err)
1320 {
1321 	struct pci_nvme_ioreq *req = br->br_param;
1322 	struct nvme_submission_queue *sq = req->nvme_sq;
1323 	uint16_t code, status;
1324 
1325 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1326 
1327 	/* TODO return correct error */
1328 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1329 	pci_nvme_status_genc(&status, code);
1330 
1331 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1332 	pci_nvme_release_ioreq(req->sc, req);
1333 }
1334 
1335 static void
1336 pci_nvme_io_partial(struct blockif_req *br, int err)
1337 {
1338 	struct pci_nvme_ioreq *req = br->br_param;
1339 
1340 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1341 
1342 	pthread_cond_signal(&req->cv);
1343 }
1344 
1345 
1346 static void
1347 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1348 {
1349 	struct nvme_submission_queue *sq;
1350 	uint16_t status;
1351 	uint16_t sqhead;
1352 	int err;
1353 
1354 	/* handle all submissions up to sq->tail index */
1355 	sq = &sc->submit_queues[idx];
1356 
1357 	if (atomic_testandset_int(&sq->busy, 1)) {
1358 		DPRINTF(("%s sqid %u busy", __func__, idx));
1359 		return;
1360 	}
1361 
1362 	sqhead = atomic_load_acq_short(&sq->head);
1363 
1364 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1365 	         idx, sqhead, sq->tail, sq->qbase));
1366 
1367 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1368 		struct nvme_command *cmd;
1369 		struct pci_nvme_ioreq *req = NULL;
1370 		uint64_t lba;
1371 		uint64_t nblocks, bytes, size, cpsz;
1372 
1373 		/* TODO: support scatter gather list handling */
1374 
1375 		cmd = &sq->qbase[sqhead];
1376 		sqhead = (sqhead + 1) % sq->size;
1377 
1378 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1379 
1380 		if (cmd->opc == NVME_OPC_FLUSH) {
1381 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1382 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1383 			                        status, 1);
1384 
1385 			continue;
1386 		} else if (cmd->opc == 0x08) {
1387 			/* TODO: write zeroes */
1388 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1389 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1390 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1391 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1392 			                        status, 1);
1393 
1394 			continue;
1395 		}
1396 
1397 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1398 
1399 		bytes = nblocks * sc->nvstore.sectsz;
1400 
1401 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1402 			req = pci_nvme_get_ioreq(sc);
1403 			req->nvme_sq = sq;
1404 			req->sqid = idx;
1405 		}
1406 
1407 		/*
1408 		 * If data starts mid-page and flows into the next page, then
1409 		 * increase page count
1410 		 */
1411 
1412 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1413 		         "(%lu-bytes)",
1414 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1415 		         cmd->opc == NVME_OPC_WRITE ?
1416 			     "WRITE" : "READ",
1417 		         lba, nblocks, bytes));
1418 
1419 		cmd->prp1 &= ~(0x03UL);
1420 		cmd->prp2 &= ~(0x03UL);
1421 
1422 		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1423 
1424 		size = bytes;
1425 		lba *= sc->nvstore.sectsz;
1426 
1427 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1428 
1429 		if (cpsz > bytes)
1430 			cpsz = bytes;
1431 
1432 		if (req != NULL) {
1433 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1434 			                        cmd->cdw10;
1435 			req->opc = cmd->opc;
1436 			req->cid = cmd->cid;
1437 			req->nsid = cmd->nsid;
1438 		}
1439 
1440 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1441 		    cmd->opc == NVME_OPC_WRITE, lba);
1442 		lba += cpsz;
1443 		size -= cpsz;
1444 
1445 		if (size == 0)
1446 			goto iodone;
1447 
1448 		if (size <= PAGE_SIZE) {
1449 			/* prp2 is second (and final) page in transfer */
1450 
1451 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1452 			    size,
1453 			    cmd->opc == NVME_OPC_WRITE,
1454 			    lba);
1455 		} else {
1456 			uint64_t *prp_list;
1457 			int i;
1458 
1459 			/* prp2 is pointer to a physical region page list */
1460 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1461 			                            cmd->prp2, PAGE_SIZE);
1462 
1463 			i = 0;
1464 			while (size != 0) {
1465 				cpsz = MIN(size, PAGE_SIZE);
1466 
1467 				/*
1468 				 * Move to linked physical region page list
1469 				 * in last item.
1470 				 */
1471 				if (i == (NVME_PRP2_ITEMS-1) &&
1472 				    size > PAGE_SIZE) {
1473 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1474 					prp_list = paddr_guest2host(
1475 					              sc->nsc_pi->pi_vmctx,
1476 					              prp_list[i], PAGE_SIZE);
1477 					i = 0;
1478 				}
1479 				if (prp_list[i] == 0) {
1480 					WPRINTF(("PRP2[%d] = 0 !!!", i));
1481 					err = 1;
1482 					break;
1483 				}
1484 
1485 				err = pci_nvme_append_iov_req(sc, req,
1486 				    prp_list[i], cpsz,
1487 				    cmd->opc == NVME_OPC_WRITE, lba);
1488 				if (err)
1489 					break;
1490 
1491 				lba += cpsz;
1492 				size -= cpsz;
1493 				i++;
1494 			}
1495 		}
1496 
1497 iodone:
1498 		if (sc->nvstore.type == NVME_STOR_RAM) {
1499 			uint16_t code, status;
1500 
1501 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1502 			    NVME_SC_SUCCESS;
1503 			pci_nvme_status_genc(&status, code);
1504 
1505 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1506 			                        status, 1);
1507 
1508 			continue;
1509 		}
1510 
1511 
1512 		if (err)
1513 			goto do_error;
1514 
1515 		req->io_req.br_callback = pci_nvme_io_done;
1516 
1517 		err = 0;
1518 		switch (cmd->opc) {
1519 		case NVME_OPC_READ:
1520 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1521 			break;
1522 		case NVME_OPC_WRITE:
1523 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1524 			break;
1525 		default:
1526 			WPRINTF(("%s unhandled io command 0x%x",
1527 				 __func__, cmd->opc));
1528 			err = 1;
1529 		}
1530 
1531 do_error:
1532 		if (err) {
1533 			uint16_t status;
1534 
1535 			pci_nvme_status_genc(&status,
1536 			    NVME_SC_DATA_TRANSFER_ERROR);
1537 
1538 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1539 			                        status, 1);
1540 			pci_nvme_release_ioreq(sc, req);
1541 		}
1542 	}
1543 
1544 	atomic_store_short(&sq->head, sqhead);
1545 	atomic_store_int(&sq->busy, 0);
1546 }
1547 
1548 static void
1549 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1550 	uint64_t idx, int is_sq, uint64_t value)
1551 {
1552 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1553 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1554 
1555 	if (is_sq) {
1556 		atomic_store_short(&sc->submit_queues[idx].tail,
1557 		                   (uint16_t)value);
1558 
1559 		if (idx == 0) {
1560 			pci_nvme_handle_admin_cmd(sc, value);
1561 		} else {
1562 			/* submission queue; handle new entries in SQ */
1563 			if (idx > sc->num_squeues) {
1564 				WPRINTF(("%s SQ index %lu overflow from "
1565 				         "guest (max %u)",
1566 				         __func__, idx, sc->num_squeues));
1567 				return;
1568 			}
1569 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1570 		}
1571 	} else {
1572 		if (idx > sc->num_cqueues) {
1573 			WPRINTF(("%s queue index %lu overflow from "
1574 			         "guest (max %u)",
1575 			         __func__, idx, sc->num_cqueues));
1576 			return;
1577 		}
1578 
1579 		sc->compl_queues[idx].head = (uint16_t)value;
1580 	}
1581 }
1582 
1583 static void
1584 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1585 {
1586 	const char *s = iswrite ? "WRITE" : "READ";
1587 
1588 	switch (offset) {
1589 	case NVME_CR_CAP_LOW:
1590 		DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1591 		break;
1592 	case NVME_CR_CAP_HI:
1593 		DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1594 		break;
1595 	case NVME_CR_VS:
1596 		DPRINTF(("%s %s NVME_CR_VS", func, s));
1597 		break;
1598 	case NVME_CR_INTMS:
1599 		DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1600 		break;
1601 	case NVME_CR_INTMC:
1602 		DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1603 		break;
1604 	case NVME_CR_CC:
1605 		DPRINTF(("%s %s NVME_CR_CC", func, s));
1606 		break;
1607 	case NVME_CR_CSTS:
1608 		DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1609 		break;
1610 	case NVME_CR_NSSR:
1611 		DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1612 		break;
1613 	case NVME_CR_AQA:
1614 		DPRINTF(("%s %s NVME_CR_AQA", func, s));
1615 		break;
1616 	case NVME_CR_ASQ_LOW:
1617 		DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1618 		break;
1619 	case NVME_CR_ASQ_HI:
1620 		DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1621 		break;
1622 	case NVME_CR_ACQ_LOW:
1623 		DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1624 		break;
1625 	case NVME_CR_ACQ_HI:
1626 		DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1627 		break;
1628 	default:
1629 		DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1630 	}
1631 
1632 }
1633 
1634 static void
1635 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1636 	uint64_t offset, int size, uint64_t value)
1637 {
1638 	uint32_t ccreg;
1639 
1640 	if (offset >= NVME_DOORBELL_OFFSET) {
1641 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1642 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1643 		int is_sq = (belloffset % 8) < 4;
1644 
1645 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1646 			WPRINTF(("guest attempted an overflow write offset "
1647 			         "0x%lx, val 0x%lx in %s",
1648 			         offset, value, __func__));
1649 			return;
1650 		}
1651 
1652 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1653 		return;
1654 	}
1655 
1656 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1657 	        offset, size, value));
1658 
1659 	if (size != 4) {
1660 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1661 		         "val 0x%lx) to bar0 in %s",
1662 		         size, offset, value, __func__));
1663 		/* TODO: shutdown device */
1664 		return;
1665 	}
1666 
1667 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1668 
1669 	pthread_mutex_lock(&sc->mtx);
1670 
1671 	switch (offset) {
1672 	case NVME_CR_CAP_LOW:
1673 	case NVME_CR_CAP_HI:
1674 		/* readonly */
1675 		break;
1676 	case NVME_CR_VS:
1677 		/* readonly */
1678 		break;
1679 	case NVME_CR_INTMS:
1680 		/* MSI-X, so ignore */
1681 		break;
1682 	case NVME_CR_INTMC:
1683 		/* MSI-X, so ignore */
1684 		break;
1685 	case NVME_CR_CC:
1686 		ccreg = (uint32_t)value;
1687 
1688 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1689 		         "iocqes %u",
1690 		        __func__,
1691 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1692 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1693 			 NVME_CC_GET_IOCQES(ccreg)));
1694 
1695 		if (NVME_CC_GET_SHN(ccreg)) {
1696 			/* perform shutdown - flush out data to backend */
1697 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1698 			    NVME_CSTS_REG_SHST_SHIFT);
1699 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1700 			    NVME_CSTS_REG_SHST_SHIFT;
1701 		}
1702 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1703 			if (NVME_CC_GET_EN(ccreg) == 0)
1704 				/* transition 1-> causes controller reset */
1705 				pci_nvme_reset_locked(sc);
1706 			else
1707 				pci_nvme_init_controller(ctx, sc);
1708 		}
1709 
1710 		/* Insert the iocqes, iosqes and en bits from the write */
1711 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1712 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1713 		if (NVME_CC_GET_EN(ccreg) == 0) {
1714 			/* Insert the ams, mps and css bit fields */
1715 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1716 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1717 			sc->regs.csts &= ~NVME_CSTS_RDY;
1718 		} else if (sc->pending_ios == 0) {
1719 			sc->regs.csts |= NVME_CSTS_RDY;
1720 		}
1721 		break;
1722 	case NVME_CR_CSTS:
1723 		break;
1724 	case NVME_CR_NSSR:
1725 		/* ignore writes; don't support subsystem reset */
1726 		break;
1727 	case NVME_CR_AQA:
1728 		sc->regs.aqa = (uint32_t)value;
1729 		break;
1730 	case NVME_CR_ASQ_LOW:
1731 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1732 		               (0xFFFFF000 & value);
1733 		break;
1734 	case NVME_CR_ASQ_HI:
1735 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1736 		               (value << 32);
1737 		break;
1738 	case NVME_CR_ACQ_LOW:
1739 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1740 		               (0xFFFFF000 & value);
1741 		break;
1742 	case NVME_CR_ACQ_HI:
1743 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1744 		               (value << 32);
1745 		break;
1746 	default:
1747 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1748 		         __func__, offset, value, size));
1749 	}
1750 	pthread_mutex_unlock(&sc->mtx);
1751 }
1752 
1753 static void
1754 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1755                 int baridx, uint64_t offset, int size, uint64_t value)
1756 {
1757 	struct pci_nvme_softc* sc = pi->pi_arg;
1758 
1759 	if (baridx == pci_msix_table_bar(pi) ||
1760 	    baridx == pci_msix_pba_bar(pi)) {
1761 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1762 		         " value 0x%lx", baridx, offset, size, value));
1763 
1764 		pci_emul_msix_twrite(pi, offset, size, value);
1765 		return;
1766 	}
1767 
1768 	switch (baridx) {
1769 	case 0:
1770 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1771 		break;
1772 
1773 	default:
1774 		DPRINTF(("%s unknown baridx %d, val 0x%lx",
1775 		         __func__, baridx, value));
1776 	}
1777 }
1778 
1779 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1780 	uint64_t offset, int size)
1781 {
1782 	uint64_t value;
1783 
1784 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1785 
1786 	if (offset < NVME_DOORBELL_OFFSET) {
1787 		void *p = &(sc->regs);
1788 		pthread_mutex_lock(&sc->mtx);
1789 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1790 		pthread_mutex_unlock(&sc->mtx);
1791 	} else {
1792 		value = 0;
1793                 WPRINTF(("pci_nvme: read invalid offset %ld", offset));
1794 	}
1795 
1796 	switch (size) {
1797 	case 1:
1798 		value &= 0xFF;
1799 		break;
1800 	case 2:
1801 		value &= 0xFFFF;
1802 		break;
1803 	case 4:
1804 		value &= 0xFFFFFFFF;
1805 		break;
1806 	}
1807 
1808 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
1809 	         offset, size, (uint32_t)value));
1810 
1811 	return (value);
1812 }
1813 
1814 
1815 
1816 static uint64_t
1817 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1818     uint64_t offset, int size)
1819 {
1820 	struct pci_nvme_softc* sc = pi->pi_arg;
1821 
1822 	if (baridx == pci_msix_table_bar(pi) ||
1823 	    baridx == pci_msix_pba_bar(pi)) {
1824 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
1825 		        baridx, offset, size));
1826 
1827 		return pci_emul_msix_tread(pi, offset, size);
1828 	}
1829 
1830 	switch (baridx) {
1831 	case 0:
1832        		return pci_nvme_read_bar_0(sc, offset, size);
1833 
1834 	default:
1835 		DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
1836 	}
1837 
1838 	return (0);
1839 }
1840 
1841 
1842 static int
1843 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1844 {
1845 	char bident[sizeof("XX:X:X")];
1846 	char	*uopt, *xopts, *config;
1847 	uint32_t sectsz;
1848 	int optidx;
1849 
1850 	sc->max_queues = NVME_QUEUES;
1851 	sc->max_qentries = NVME_MAX_QENTRIES;
1852 	sc->ioslots = NVME_IOSLOTS;
1853 	sc->num_squeues = sc->max_queues;
1854 	sc->num_cqueues = sc->max_queues;
1855 	sectsz = 0;
1856 
1857 	uopt = strdup(opts);
1858 	optidx = 0;
1859 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1860 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1861 	for (xopts = strtok(uopt, ",");
1862 	     xopts != NULL;
1863 	     xopts = strtok(NULL, ",")) {
1864 
1865 		if ((config = strchr(xopts, '=')) != NULL)
1866 			*config++ = '\0';
1867 
1868 		if (!strcmp("maxq", xopts)) {
1869 			sc->max_queues = atoi(config);
1870 		} else if (!strcmp("qsz", xopts)) {
1871 			sc->max_qentries = atoi(config);
1872 		} else if (!strcmp("ioslots", xopts)) {
1873 			sc->ioslots = atoi(config);
1874 		} else if (!strcmp("sectsz", xopts)) {
1875 			sectsz = atoi(config);
1876 		} else if (!strcmp("ser", xopts)) {
1877 			/*
1878 			 * This field indicates the Product Serial Number in
1879 			 * 7-bit ASCII, unused bytes should be space characters.
1880 			 * Ref: NVMe v1.3c.
1881 			 */
1882 			cpywithpad((char *)sc->ctrldata.sn,
1883 			           sizeof(sc->ctrldata.sn), config, ' ');
1884 		} else if (!strcmp("ram", xopts)) {
1885 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
1886 
1887 			sc->nvstore.type = NVME_STOR_RAM;
1888 			sc->nvstore.size = sz * 1024 * 1024;
1889 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1890 			sc->nvstore.sectsz = 4096;
1891 			sc->nvstore.sectsz_bits = 12;
1892 			if (sc->nvstore.ctx == NULL) {
1893 				perror("Unable to allocate RAM");
1894 				free(uopt);
1895 				return (-1);
1896 			}
1897 		} else if (!strcmp("eui64", xopts)) {
1898 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1899 		} else if (optidx == 0) {
1900 			snprintf(bident, sizeof(bident), "%d:%d",
1901 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1902 			sc->nvstore.ctx = blockif_open(xopts, bident);
1903 			if (sc->nvstore.ctx == NULL) {
1904 				perror("Could not open backing file");
1905 				free(uopt);
1906 				return (-1);
1907 			}
1908 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1909 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1910 		} else {
1911 			EPRINTLN("Invalid option %s", xopts);
1912 			free(uopt);
1913 			return (-1);
1914 		}
1915 
1916 		optidx++;
1917 	}
1918 	free(uopt);
1919 
1920 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1921 		EPRINTLN("backing store not specified");
1922 		return (-1);
1923 	}
1924 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1925 		sc->nvstore.sectsz = sectsz;
1926 	else if (sc->nvstore.type != NVME_STOR_RAM)
1927 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1928 	for (sc->nvstore.sectsz_bits = 9;
1929 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1930 	     sc->nvstore.sectsz_bits++);
1931 
1932 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1933 		sc->max_queues = NVME_QUEUES;
1934 
1935 	if (sc->max_qentries <= 0) {
1936 		EPRINTLN("Invalid qsz option");
1937 		return (-1);
1938 	}
1939 	if (sc->ioslots <= 0) {
1940 		EPRINTLN("Invalid ioslots option");
1941 		return (-1);
1942 	}
1943 
1944 	return (0);
1945 }
1946 
1947 static int
1948 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1949 {
1950 	struct pci_nvme_softc *sc;
1951 	uint32_t pci_membar_sz;
1952 	int	error;
1953 
1954 	error = 0;
1955 
1956 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1957 	pi->pi_arg = sc;
1958 	sc->nsc_pi = pi;
1959 
1960 	error = pci_nvme_parse_opts(sc, opts);
1961 	if (error < 0)
1962 		goto done;
1963 	else
1964 		error = 0;
1965 
1966 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1967 	for (int i = 0; i < sc->ioslots; i++) {
1968 		if (i < (sc->ioslots-1))
1969 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1970 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1971 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1972 	}
1973 	sc->ioreqs_free = sc->ioreqs;
1974 	sc->intr_coales_aggr_thresh = 1;
1975 
1976 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1977 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1978 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1979 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1980 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1981 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1982 
1983 	/*
1984 	 * Allocate size of NVMe registers + doorbell space for all queues.
1985 	 *
1986 	 * The specification requires a minimum memory I/O window size of 16K.
1987 	 * The Windows driver will refuse to start a device with a smaller
1988 	 * window.
1989 	 */
1990 	pci_membar_sz = sizeof(struct nvme_registers) +
1991 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
1992 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1993 
1994 	DPRINTF(("nvme membar size: %u", pci_membar_sz));
1995 
1996 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1997 	if (error) {
1998 		WPRINTF(("%s pci alloc mem bar failed", __func__));
1999 		goto done;
2000 	}
2001 
2002 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2003 	if (error) {
2004 		WPRINTF(("%s pci add msixcap failed", __func__));
2005 		goto done;
2006 	}
2007 
2008 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2009 	if (error) {
2010 		WPRINTF(("%s pci add Express capability failed", __func__));
2011 		goto done;
2012 	}
2013 
2014 	pthread_mutex_init(&sc->mtx, NULL);
2015 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2016 
2017 	pci_nvme_reset(sc);
2018 	pci_nvme_init_ctrldata(sc);
2019 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2020 	pci_nvme_init_logpages(sc);
2021 
2022 	pci_lintr_request(pi);
2023 
2024 done:
2025 	return (error);
2026 }
2027 
2028 
2029 struct pci_devemu pci_de_nvme = {
2030 	.pe_emu =	"nvme",
2031 	.pe_init =	pci_nvme_init,
2032 	.pe_barwrite =	pci_nvme_write,
2033 	.pe_barread =	pci_nvme_read
2034 };
2035 PCI_EMUL_SET(pci_de_nvme);
2036