xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision d15d17d4231f87f1571fa6d585377206f360f667)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
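
/*
 * Example invocation (illustrative only; the slot number and backing
 * device path below are placeholders, not part of this source):
 *
 *   bhyve ... -s 4,nvme,/path/to/image,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=BHYVE001 ...
 *
 * or, with a 1 GiB RAM-backed namespace:
 *
 *   bhyve ... -s 4,nvme,ram=1024 ...
 */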
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 #ifndef __FreeBSD__
63 #include <endian.h>
64 #endif
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "pci_emul.h"
85 
86 
87 static int nvme_debug = 0;
88 #define	DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
89 #define	WPRINTF(params) printf params
90 
91 /* defaults; can be overridden */
92 #define	NVME_MSIX_BAR		4
93 
94 #define	NVME_IOSLOTS		8
95 
96 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
97 #define NVME_MMIO_SPACE_MIN	(1 << 14)
98 
99 #define	NVME_QUEUES		16
100 #define	NVME_MAX_QENTRIES	2048
101 
102 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
103 #define	NVME_MAX_BLOCKIOVS	512
104 
105 /* This is a synthetic status code to indicate there is no status */
106 #define NVME_NO_STATUS		0xffff
107 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
108 
109 /* helpers */
110 
111 /* Convert a zero-based value into a one-based value */
112 #define ONE_BASED(zero)		((zero) + 1)
113 /* Convert a one-based value into a zero-based value */
114 #define ZERO_BASED(one)		((one)  - 1)
115 
116 /* Encode number of SQ's and CQ's for Set/Get Features */
117 #define NVME_FEATURE_NUM_QUEUES(sc) \
118 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
119 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
120 
121 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
122 
123 enum nvme_controller_register_offsets {
124 	NVME_CR_CAP_LOW = 0x00,
125 	NVME_CR_CAP_HI  = 0x04,
126 	NVME_CR_VS      = 0x08,
127 	NVME_CR_INTMS   = 0x0c,
128 	NVME_CR_INTMC   = 0x10,
129 	NVME_CR_CC      = 0x14,
130 	NVME_CR_CSTS    = 0x1c,
131 	NVME_CR_NSSR    = 0x20,
132 	NVME_CR_AQA     = 0x24,
133 	NVME_CR_ASQ_LOW = 0x28,
134 	NVME_CR_ASQ_HI  = 0x2c,
135 	NVME_CR_ACQ_LOW = 0x30,
136 	NVME_CR_ACQ_HI  = 0x34,
137 };
138 
139 enum nvme_cmd_cdw11 {
140 	NVME_CMD_CDW11_PC  = 0x0001,
141 	NVME_CMD_CDW11_IEN = 0x0002,
142 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
143 };
144 
145 #define	NVME_CQ_INTEN	0x01
146 #define	NVME_CQ_INTCOAL	0x02
147 
148 struct nvme_completion_queue {
149 	struct nvme_completion *qbase;
150 	uint32_t	size;
151 	uint16_t	tail; /* nvme progress */
152 	uint16_t	head; /* guest progress */
153 	uint16_t	intr_vec;
154 	uint32_t	intr_en;
155 	pthread_mutex_t	mtx;
156 };
157 
158 struct nvme_submission_queue {
159 	struct nvme_command *qbase;
160 	uint32_t	size;
161 	uint16_t	head; /* nvme progress */
162 	uint16_t	tail; /* guest progress */
163 	uint16_t	cqid; /* completion queue id */
164 	int		busy; /* queue is being processed */
165 	int		qpriority;
166 };
167 
168 enum nvme_storage_type {
169 	NVME_STOR_BLOCKIF = 0,
170 	NVME_STOR_RAM = 1,
171 };
172 
173 struct pci_nvme_blockstore {
174 	enum nvme_storage_type type;
175 	void		*ctx;
176 	uint64_t	size;
177 	uint32_t	sectsz;
178 	uint32_t	sectsz_bits;
179 	uint64_t	eui64;
180 };
181 
182 struct pci_nvme_ioreq {
183 	struct pci_nvme_softc *sc;
184 	struct pci_nvme_ioreq *next;
185 	struct nvme_submission_queue *nvme_sq;
186 	uint16_t	sqid;
187 
188 	/* command information */
189 	uint16_t	opc;
190 	uint16_t	cid;
191 	uint32_t	nsid;
192 
193 	uint64_t	prev_gpaddr;
194 	size_t		prev_size;
195 
196 	/*
197 	 * lock if all iovs consumed (big IO);
198 	 * complete transaction before continuing
199 	 */
200 	pthread_mutex_t	mtx;
201 	pthread_cond_t	cv;
202 
203 	struct blockif_req io_req;
204 
205 	/* pad to fit up to 512 page descriptors from guest IO request */
206 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
207 };
208 
209 struct pci_nvme_softc {
210 	struct pci_devinst *nsc_pi;
211 
212 	pthread_mutex_t	mtx;
213 
214 	struct nvme_registers regs;
215 
216 	struct nvme_namespace_data  nsdata;
217 	struct nvme_controller_data ctrldata;
218 	struct nvme_error_information_entry err_log;
219 	struct nvme_health_information_page health_log;
220 	struct nvme_firmware_page fw_log;
221 
222 	struct pci_nvme_blockstore nvstore;
223 
224 	uint16_t	max_qentries;	/* max entries per queue */
225 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
226 	uint32_t	num_cqueues;
227 	uint32_t	num_squeues;
228 
229 	struct pci_nvme_ioreq *ioreqs;
230 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
231 	uint32_t	pending_ios;
232 	uint32_t	ioslots;
233 	sem_t		iosemlock;
234 
235 	/*
236 	 * Memory mapped Submission and Completion queues
237 	 * Each array includes both Admin and IO queues
238 	 */
239 	struct nvme_completion_queue *compl_queues;
240 	struct nvme_submission_queue *submit_queues;
241 
242 	/* controller features */
243 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
244 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
245 	uint32_t	async_ev_config;         /* 0x0B: async event config */
246 };
247 
248 
249 static void pci_nvme_io_partial(struct blockif_req *br, int err);
250 
251 /* Controller Configuration utils */
252 #define	NVME_CC_GET_EN(cc) \
253 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
254 #define	NVME_CC_GET_CSS(cc) \
255 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
256 #define	NVME_CC_GET_SHN(cc) \
257 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
258 #define	NVME_CC_GET_IOSQES(cc) \
259 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
260 #define	NVME_CC_GET_IOCQES(cc) \
261 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
262 
263 #define	NVME_CC_WRITE_MASK \
264 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
265 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
266 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
267 
268 #define	NVME_CC_NEN_WRITE_MASK \
269 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
270 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
271 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
272 
273 /* Controller Status utils */
274 #define	NVME_CSTS_GET_RDY(sts) \
275 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
276 
277 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
278 
279 /* Completion Queue status word utils */
280 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
281 #define	NVME_STATUS_MASK \
282 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
283 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
284 
285 static __inline void
286 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
287 {
288 	size_t len;
289 
290 	len = strnlen(src, dst_size);
291 	memset(dst, pad, dst_size);
292 	memcpy(dst, src, len);
293 }
294 
295 static __inline void
296 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
297 {
298 
299 	*status &= ~NVME_STATUS_MASK;
300 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
301 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
302 }
303 
304 static __inline void
305 pci_nvme_status_genc(uint16_t *status, uint16_t code)
306 {
307 
308 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
309 }
310 
311 static __inline void
312 pci_nvme_toggle_phase(uint16_t *status, int prev)
313 {
314 
315 	if (prev)
316 		*status &= ~NVME_STATUS_P;
317 	else
318 		*status |= NVME_STATUS_P;
319 }
320 
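/*
 * Populate the Identify Controller data structure reported to the guest:
 * vendor/model/firmware strings, transfer and queue-entry size limits, and
 * the other static capability fields below.
 */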
321 static void
322 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
323 {
324 	struct nvme_controller_data *cd = &sc->ctrldata;
325 
326 	cd->vid = 0xFB5D;
327 	cd->ssvid = 0x0000;
328 
329 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
330 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
331 
332 	/* Recommended Arbitration Burst: 2^rab commands per arbitration round */
333 	cd->rab   = 4;
334 
335 	/* FreeBSD OUI */
336 	cd->ieee[0] = 0x58;
337 	cd->ieee[1] = 0x9c;
338 	cd->ieee[2] = 0xfc;
339 
340 	cd->mic = 0;
341 
342 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
343 
344 	cd->ver = 0x00010300;
345 
346 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
347 	cd->acl = 2;
348 	cd->aerl = 4;
349 
350 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
351 	cd->elpe = 0;	/* max error log page entries */
352 	cd->npss = 1;	/* number of power states supported */
353 
354 	/* Warning Composite Temperature Threshold */
355 	cd->wctemp = 0x0157;
356 
357 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
358 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
359 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
360 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
361 	cd->nn = 1;	/* number of namespaces */
362 
363 	cd->fna = 0x03;
364 
365 	cd->power_state[0].mp = 10;
366 }
367 
368 /*
369  * Calculate the CRC-16 of the given buffer
370  * See copyright attribution at top of file
371  */
372 static uint16_t
373 crc16(uint16_t crc, const void *buffer, unsigned int len)
374 {
375 	const unsigned char *cp = buffer;
376 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
377 	static uint16_t const crc16_table[256] = {
378 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
379 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
380 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
381 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
382 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
383 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
384 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
385 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
386 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
387 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
388 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
389 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
390 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
391 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
392 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
393 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
394 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
395 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
396 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
397 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
398 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
399 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
400 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
401 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
402 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
403 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
404 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
405 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
406 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
407 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
408 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
409 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
410 	};
411 
412 	while (len--)
413 		crc = (((crc >> 8) & 0xffU) ^
414 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
415 	return crc;
416 }
417 
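/*
 * Populate the Identify Namespace data structure for the backing store:
 * size/capacity in LBAs, the single supported LBA format, and an EUI-64
 * (synthesized from the VM name and PCI address if none was supplied).
 */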
418 static void
419 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
420     struct nvme_namespace_data *nd, uint32_t nsid,
421     uint64_t eui64)
422 {
423 
424 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
425 	nd->ncap = nd->nsze;
426 	nd->nuse = nd->nsze;
427 
428 	/* Get LBA and backstore information from backing store */
429 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
430 	nd->flbas = 0;
431 
432 	/* Create an EUI-64 if user did not provide one */
433 	if (eui64 == 0) {
434 		char *data = NULL;
435 
436 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
437 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
438 
439 		if (data != NULL) {
440 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
441 			free(data);
442 		}
443 		eui64 = (eui64 << 16) | (nsid & 0xffff);
444 	}
445 	be64enc(nd->eui64, eui64);
446 
447 	/* LBA data-sz = 2^lbads */
448 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
449 }
450 
451 static void
452 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
453 {
454 
455 	memset(&sc->err_log, 0, sizeof(sc->err_log));
456 	memset(&sc->health_log, 0, sizeof(sc->health_log));
457 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
458 }
459 
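/*
 * Reset controller state: CAP/VS/CC/CSTS and the I/O queue bookkeeping are
 * re-initialized, while the Admin queue (index 0) registration is preserved
 * so the emulation stays in sync with the guest. Called with sc->mtx held.
 */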
460 static void
461 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
462 {
463 	DPRINTF(("%s\r\n", __func__));
464 
465 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
466 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
467 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
468 
469 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
470 
471 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
472 
473 	sc->regs.cc = 0;
474 	sc->regs.csts = 0;
475 
476 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
477 	if (sc->submit_queues != NULL) {
478 		for (int i = 0; i < sc->num_squeues + 1; i++) {
479 			/*
480 			 * The Admin Submission Queue is at index 0.
481 			 * It must not be changed at reset otherwise the
482 			 * emulation will be out of sync with the guest.
483 			 */
484 			if (i != 0) {
485 				sc->submit_queues[i].qbase = NULL;
486 				sc->submit_queues[i].size = 0;
487 				sc->submit_queues[i].cqid = 0;
488 			}
489 			sc->submit_queues[i].tail = 0;
490 			sc->submit_queues[i].head = 0;
491 			sc->submit_queues[i].busy = 0;
492 		}
493 	} else
494 		sc->submit_queues = calloc(sc->num_squeues + 1,
495 		                        sizeof(struct nvme_submission_queue));
496 
497 	if (sc->compl_queues != NULL) {
498 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
499 			/* See Admin Submission Queue note above */
500 			if (i != 0) {
501 				sc->compl_queues[i].qbase = NULL;
502 				sc->compl_queues[i].size = 0;
503 			}
504 
505 			sc->compl_queues[i].tail = 0;
506 			sc->compl_queues[i].head = 0;
507 		}
508 	} else {
509 		sc->compl_queues = calloc(sc->num_cqueues + 1,
510 		                        sizeof(struct nvme_completion_queue));
511 
512 		for (int i = 0; i < sc->num_cqueues + 1; i++)
513 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
514 	}
515 }
516 
517 static void
518 pci_nvme_reset(struct pci_nvme_softc *sc)
519 {
520 	pthread_mutex_lock(&sc->mtx);
521 	pci_nvme_reset_locked(sc);
522 	pthread_mutex_unlock(&sc->mtx);
523 }
524 
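/*
 * Controller enable (CC.EN 0->1): map the guest's Admin Submission and
 * Completion Queues into host memory using the AQA/ASQ/ACQ register values.
 */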
525 static void
526 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
527 {
528 	uint16_t acqs, asqs;
529 
530 	DPRINTF(("%s\r\n", __func__));
531 
532 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
533 	sc->submit_queues[0].size = asqs;
534 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
535 	            sizeof(struct nvme_command) * asqs);
536 
537 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
538 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
539 
540 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
541 	    NVME_AQA_REG_ACQS_MASK) + 1;
542 	sc->compl_queues[0].size = acqs;
543 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
544 	         sizeof(struct nvme_completion) * acqs);
545 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
546 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
547 }
548 
549 static int
550 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src,
551 	size_t len)
552 {
553 	uint8_t *dst;
554 	size_t bytes;
555 
556 	if (len > (8 * 1024)) {
557 		return (-1);
558 	}
559 
560 	/* Copy from the start of prp1 to the end of the physical page */
561 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
562 	bytes = MIN(bytes, len);
563 
564 	dst = vm_map_gpa(ctx, prp1, bytes);
565 	if (dst == NULL) {
566 		return (-1);
567 	}
568 
569 	memcpy(dst, src, bytes);
570 
571 	src += bytes;
572 
573 	len -= bytes;
574 	if (len == 0) {
575 		return (0);
576 	}
577 
578 	len = MIN(len, PAGE_SIZE);
579 
580 	dst = vm_map_gpa(ctx, prp2, len);
581 	if (dst == NULL) {
582 		return (-1);
583 	}
584 
585 	memcpy(dst, src, len);
586 
587 	return (0);
588 }
589 
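/*
 * Copy 'len' bytes from 'src' into guest memory described by a PRP pair:
 * from prp1 to the end of its page, then the remainder into prp2. Used for
 * small Admin command payloads; transfers larger than 8KiB are rejected.
 */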
590 static int
591 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
592 	struct nvme_completion* compl)
593 {
594 	uint16_t qid = command->cdw10 & 0xffff;
595 
596 	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
597 	if (qid == 0 || qid > sc->num_squeues) {
598 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
599 		        __func__, qid, sc->num_squeues));
600 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
601 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
602 		return (1);
603 	}
604 
605 	sc->submit_queues[qid].qbase = NULL;
606 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
607 	return (1);
608 }
609 
610 static int
611 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
612 	struct nvme_completion* compl)
613 {
614 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
615 		uint16_t qid = command->cdw10 & 0xffff;
616 		struct nvme_submission_queue *nsq;
617 
618 		if ((qid == 0) || (qid > sc->num_squeues)) {
619 			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
620 			        __func__, qid, sc->num_squeues));
621 			pci_nvme_status_tc(&compl->status,
622 			    NVME_SCT_COMMAND_SPECIFIC,
623 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
624 			return (1);
625 		}
626 
627 		nsq = &sc->submit_queues[qid];
628 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
629 
630 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
631 		              sizeof(struct nvme_command) * (size_t)nsq->size);
632 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
633 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
634 
635 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
636 		        qid, nsq->size, nsq->qbase, nsq->cqid));
637 
638 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
639 
640 		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
641 		         __func__, qid));
642 	} else {
643 		/*
644 		 * Guest sent a non-contiguous submission queue request.
645 		 * This setting is unsupported by this emulation.
646 		 */
647 		WPRINTF(("%s unsupported non-contig (list-based) "
648 		         "create i/o submission queue\r\n", __func__));
649 
650 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
651 	}
652 	return (1);
653 }
654 
655 static int
656 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
657 	struct nvme_completion* compl)
658 {
659 	uint16_t qid = command->cdw10 & 0xffff;
660 
661 	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
662 	if (qid == 0 || qid > sc->num_cqueues) {
663 		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
664 		        __func__, qid, sc->num_cqueues));
665 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
666 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
667 		return (1);
668 	}
669 
670 	sc->compl_queues[qid].qbase = NULL;
671 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
672 	return (1);
673 }
674 
675 static int
676 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
677 	struct nvme_completion* compl)
678 {
679 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
680 		uint16_t qid = command->cdw10 & 0xffff;
681 		struct nvme_completion_queue *ncq;
682 
683 		if ((qid == 0) || (qid > sc->num_cqueues)) {
684 			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
685 			        __func__, qid, sc->num_cqueues));
686 			pci_nvme_status_tc(&compl->status,
687 			    NVME_SCT_COMMAND_SPECIFIC,
688 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
689 			return (1);
690 		}
691 
692 		ncq = &sc->compl_queues[qid];
693 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
694 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
695 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
696 
697 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
698 		             command->prp1,
699 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
700 
701 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
702 	} else {
703 		/*
704 		 * Non-contig completion queue unsupported.
705 		 */
706 		WPRINTF(("%s unsupported non-contig (list-based) "
707 		         "create i/o completion queue\r\n",
708 		         __func__));
709 
710 		/* 0x12 = Invalid Use of Controller Memory Buffer */
711 		pci_nvme_status_genc(&compl->status, 0x12);
712 	}
713 
714 	return (1);
715 }
716 
717 static int
718 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
719 	struct nvme_completion* compl)
720 {
721 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;	/* NUMD counts dwords */
722 	uint8_t logpage = command->cdw10 & 0xFF;
723 
724 	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
725 
726 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
727 
728 	switch (logpage) {
729 	case NVME_LOG_ERROR:
730 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
731 		    command->prp2, (uint8_t *)&sc->err_log, logsize);
732 		break;
733 	case NVME_LOG_HEALTH_INFORMATION:
734 		/* TODO: present some smart info */
735 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
736 		    command->prp2, (uint8_t *)&sc->health_log, logsize);
737 		break;
738 	case NVME_LOG_FIRMWARE_SLOT:
739 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
740 		    command->prp2, (uint8_t *)&sc->fw_log, logsize);
741 		break;
742 	default:
743 		WPRINTF(("%s get log page %x command not supported\r\n",
744 		        __func__, logpage));
745 
746 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
747 		    NVME_SC_INVALID_LOG_PAGE);
748 	}
749 
750 	return (1);
751 }
752 
753 static int
754 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
755 	struct nvme_completion* compl)
756 {
757 	void *dest;
758 
759 	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
760 	        command->cdw10 & 0xFF, command->nsid));
761 
762 	switch (command->cdw10 & 0xFF) {
763 	case 0x00: /* return Identify Namespace data structure */
764 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
765 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata));
766 		break;
767 	case 0x01: /* return Identify Controller data structure */
768 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
769 		    command->prp2, (uint8_t *)&sc->ctrldata,
770 		    sizeof(sc->ctrldata));
771 		break;
772 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
773 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
774 		                  sizeof(uint32_t) * 1024);
775 		((uint32_t *)dest)[0] = 1;
776 		((uint32_t *)dest)[1] = 0;
777 		break;
778 	case 0x11:
779 		pci_nvme_status_genc(&compl->status,
780 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
781 		return (1);
782 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
783 	case 0x10:
784 	case 0x12:
785 	case 0x13:
786 	case 0x14:
787 	case 0x15:
788 	default:
789 		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
790 		         __func__, command->cdw10 & 0xFF));
791 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
792 		return (1);
793 	}
794 
795 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
796 	return (1);
797 }
798 
799 static int
800 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
801 	struct nvme_completion* compl)
802 {
803 	uint16_t nqr;	/* Number of Queues Requested */
804 
805 	nqr = command->cdw11 & 0xFFFF;
806 	if (nqr == 0xffff) {
807 		WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr));
808 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
809 		return (-1);
810 	}
811 
812 	sc->num_squeues = ONE_BASED(nqr);
813 	if (sc->num_squeues > sc->max_queues) {
814 		DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues,
815 					sc->max_queues));
816 		sc->num_squeues = sc->max_queues;
817 	}
818 
819 	nqr = (command->cdw11 >> 16) & 0xFFFF;
820 	if (nqr == 0xffff) {
821 		WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr));
822 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
823 		return (-1);
824 	}
825 
826 	sc->num_cqueues = ONE_BASED(nqr);
827 	if (sc->num_cqueues > sc->max_queues) {
828 		DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues,
829 					sc->max_queues));
830 		sc->num_cqueues = sc->max_queues;
831 	}
832 
833 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
834 
835 	return (0);
836 }
837 
838 static int
839 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
840 	struct nvme_completion* compl)
841 {
842 	int feature = command->cdw10 & 0xFF;
843 	uint32_t iv;
844 
845 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
846 	compl->cdw0 = 0;
847 
848 	switch (feature) {
849 	case NVME_FEAT_ARBITRATION:
850 		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
851 		break;
852 	case NVME_FEAT_POWER_MANAGEMENT:
853 		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
854 		break;
855 	case NVME_FEAT_LBA_RANGE_TYPE:
856 		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
857 		break;
858 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
859 		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
860 		break;
861 	case NVME_FEAT_ERROR_RECOVERY:
862 		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
863 		break;
864 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
865 		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
866 		break;
867 	case NVME_FEAT_NUMBER_OF_QUEUES:
868 		nvme_set_feature_queues(sc, command, compl);
869 		break;
870 	case NVME_FEAT_INTERRUPT_COALESCING:
871 		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
872 
873 		/* in uS */
874 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
875 
876 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
877 		break;
878 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
879 		iv = command->cdw11 & 0xFFFF;
880 
881 		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
882 		        command->cdw11));
883 
884 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
885 			if (sc->compl_queues[i].intr_vec == iv) {
886 				if (command->cdw11 & (1 << 16))
887 					sc->compl_queues[i].intr_en |=
888 					                      NVME_CQ_INTCOAL;
889 				else
890 					sc->compl_queues[i].intr_en &=
891 					                     ~NVME_CQ_INTCOAL;
892 			}
893 		}
894 		break;
895 	case NVME_FEAT_WRITE_ATOMICITY:
896 		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
897 		break;
898 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
899 		DPRINTF(("  async event configuration 0x%x\r\n",
900 		        command->cdw11));
901 		sc->async_ev_config = command->cdw11;
902 		break;
903 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
904 		DPRINTF(("  software progress marker 0x%x\r\n",
905 		        command->cdw11));
906 		break;
907 	case 0x0C:
908 		DPRINTF(("  autonomous power state transition 0x%x\r\n",
909 		        command->cdw11));
910 		break;
911 	default:
912 		WPRINTF(("%s invalid feature\r\n", __func__));
913 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
914 		return (1);
915 	}
916 
917 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
918 	return (1);
919 }
920 
921 static int
922 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
923 	struct nvme_completion* compl)
924 {
925 	int feature = command->cdw10 & 0xFF;
926 
927 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
928 
929 	compl->cdw0 = 0;
930 
931 	switch (feature) {
932 	case NVME_FEAT_ARBITRATION:
933 		DPRINTF(("  arbitration\r\n"));
934 		break;
935 	case NVME_FEAT_POWER_MANAGEMENT:
936 		DPRINTF(("  power management\r\n"));
937 		break;
938 	case NVME_FEAT_LBA_RANGE_TYPE:
939 		DPRINTF(("  lba range\r\n"));
940 		break;
941 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
942 		DPRINTF(("  temperature threshold\r\n"));
943 		switch ((command->cdw11 >> 20) & 0x3) {
944 		case 0:
945 			/* Over temp threshold */
946 			compl->cdw0 = 0xFFFF;
947 			break;
948 		case 1:
949 			/* Under temp threshold */
950 			compl->cdw0 = 0;
951 			break;
952 		default:
953 			WPRINTF(("  invalid threshold type select\r\n"));
954 			pci_nvme_status_genc(&compl->status,
955 			    NVME_SC_INVALID_FIELD);
956 			return (1);
957 		}
958 		break;
959 	case NVME_FEAT_ERROR_RECOVERY:
960 		DPRINTF(("  error recovery\r\n"));
961 		break;
962 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
963 		DPRINTF(("  volatile write cache\r\n"));
964 		break;
965 	case NVME_FEAT_NUMBER_OF_QUEUES:
966 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
967 
968 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
969 		        compl->cdw0 & 0xFFFF,
970 		        (compl->cdw0 >> 16) & 0xFFFF));
971 
972 		break;
973 	case NVME_FEAT_INTERRUPT_COALESCING:
974 		DPRINTF(("  interrupt coalescing\r\n"));
975 		break;
976 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
977 		DPRINTF(("  interrupt vector configuration\r\n"));
978 		break;
979 	case NVME_FEAT_WRITE_ATOMICITY:
980 		DPRINTF(("  write atomicity\r\n"));
981 		break;
982 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
983 		DPRINTF(("  async event configuration\r\n"));
984 		sc->async_ev_config = command->cdw11;
985 		break;
986 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
987 		DPRINTF(("  software progress marker\r\n"));
988 		break;
989 	case 0x0C:
990 		DPRINTF(("  autonomous power state transition\r\n"));
991 		break;
992 	default:
993 		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
994 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
995 		return (1);
996 	}
997 
998 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
999 	return (1);
1000 }
1001 
1002 static int
1003 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1004 	struct nvme_completion* compl)
1005 {
1006 	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
1007 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1008 
1009 	/* TODO: search for the command ID and abort it */
1010 
1011 	compl->cdw0 = 1;
1012 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1013 	return (1);
1014 }
1015 
1016 #ifdef __FreeBSD__
1017 static int
1018 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1019 	struct nvme_command* command, struct nvme_completion* compl)
1020 {
1021 	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
1022 
1023 	/*
1024 	 * TODO: raise events when they happen based on the Set Features cmd.
1025 	 * These events happen async, so only set completion successful if
1026 	 * there is an event reflective of the request to get event.
1027 	 */
1028 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1029 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1030 	return (0);
1031 }
1032 #else
1033 /* This is kept behind an ifdef while it's unused to appease the compiler. */
1034 #endif /* __FreeBSD__ */
1035 
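/*
 * Consume new entries on the Admin Submission Queue (queue 0), post a
 * completion for each command to Admin CQ 0, and raise MSI-X vector 0 if
 * any handler requested an interrupt.
 */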
1036 static void
1037 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1038 {
1039 	struct nvme_completion compl;
1040 	struct nvme_command *cmd;
1041 	struct nvme_submission_queue *sq;
1042 	struct nvme_completion_queue *cq;
1043 	int do_intr = 0;
1044 	uint16_t sqhead;
1045 
1046 	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
1047 
1048 	sq = &sc->submit_queues[0];
1049 
1050 	sqhead = atomic_load_acq_short(&sq->head);
1051 
1052 	if (atomic_testandset_int(&sq->busy, 1)) {
1053 		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
1054 		        __func__, sqhead, sq->tail));
1055 		return;
1056 	}
1057 
1058 	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
1059 
1060 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1061 		cmd = &(sq->qbase)[sqhead];
1062 		compl.cdw0 = 0;
1063 		compl.status = 0;
1064 
1065 		switch (cmd->opc) {
1066 		case NVME_OPC_DELETE_IO_SQ:
1067 			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
1068 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
1069 			break;
1070 		case NVME_OPC_CREATE_IO_SQ:
1071 			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
1072 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
1073 			break;
1074 		case NVME_OPC_DELETE_IO_CQ:
1075 			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
1076 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
1077 			break;
1078 		case NVME_OPC_CREATE_IO_CQ:
1079 			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
1080 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
1081 			break;
1082 		case NVME_OPC_GET_LOG_PAGE:
1083 			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
1084 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
1085 			break;
1086 		case NVME_OPC_IDENTIFY:
1087 			DPRINTF(("%s command IDENTIFY\r\n", __func__));
1088 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
1089 			break;
1090 		case NVME_OPC_ABORT:
1091 			DPRINTF(("%s command ABORT\r\n", __func__));
1092 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
1093 			break;
1094 		case NVME_OPC_SET_FEATURES:
1095 			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
1096 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
1097 			break;
1098 		case NVME_OPC_GET_FEATURES:
1099 			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
1100 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
1101 			break;
1102 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1103 			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
1104 			/* XXX don't care, unhandled for now
1105 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
1106 			*/
1107 			compl.status = NVME_NO_STATUS;
1108 			break;
1109 		default:
1110 			WPRINTF(("0x%x command is not implemented\r\n",
1111 			    cmd->opc));
1112 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1113 			do_intr |= 1;
1114 		}
1115 
1116 		if (NVME_COMPLETION_VALID(compl)) {
1117 			struct nvme_completion *cp;
1118 			int phase;
1119 
1120 			cq = &sc->compl_queues[0];
1121 
1122 			cp = &(cq->qbase)[cq->tail];
1123 			cp->cdw0 = compl.cdw0;
1124 			cp->sqid = 0;
1125 			cp->sqhd = sqhead;
1126 			cp->cid = cmd->cid;
1127 
1128 			phase = NVME_STATUS_GET_P(cp->status);
1129 			cp->status = compl.status;
1130 			pci_nvme_toggle_phase(&cp->status, phase);
1131 
1132 			cq->tail = (cq->tail + 1) % cq->size;
1133 		}
1134 		sqhead = (sqhead + 1) % sq->size;
1135 	}
1136 
1137 	DPRINTF(("setting sqhead %u\r\n", sqhead));
1138 	atomic_store_short(&sq->head, sqhead);
1139 	atomic_store_int(&sq->busy, 0);
1140 
1141 	if (do_intr)
1142 		pci_generate_msix(sc->nsc_pi, 0);
1143 
1144 }
1145 
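/*
 * Append a guest-physical data segment to the blockif request in 'req',
 * merging it with the previous iov when contiguous. If the iov list fills
 * up, the partial request is issued and the calling thread waits for it to
 * complete before continuing. For RAM-backed storage (req == NULL) the data
 * is copied directly between guest memory and the RAM buffer.
 */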
1146 static int
1147 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1148 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1149 {
1150 	int iovidx;
1151 
1152 	if (req != NULL) {
1153 		/* concatenate contig block-iovs to minimize number of iovs */
1154 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1155 			iovidx = req->io_req.br_iovcnt - 1;
1156 
1157 			req->io_req.br_iov[iovidx].iov_base =
1158 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1159 			                     req->prev_gpaddr, size);
1160 
1161 			req->prev_size += size;
1162 			req->io_req.br_resid += size;
1163 
1164 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1165 		} else {
1166 			pthread_mutex_lock(&req->mtx);
1167 
1168 			iovidx = req->io_req.br_iovcnt;
1169 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1170 				int err = 0;
1171 
1172 				DPRINTF(("large I/O, doing partial req\r\n"));
1173 
1174 				iovidx = 0;
1175 				req->io_req.br_iovcnt = 0;
1176 
1177 				req->io_req.br_callback = pci_nvme_io_partial;
1178 
1179 				if (!do_write)
1180 					err = blockif_read(sc->nvstore.ctx,
1181 					                   &req->io_req);
1182 				else
1183 					err = blockif_write(sc->nvstore.ctx,
1184 					                    &req->io_req);
1185 
1186 				/* wait until req completes before cont */
1187 				if (err == 0)
1188 					pthread_cond_wait(&req->cv, &req->mtx);
1189 			}
1190 			if (iovidx == 0) {
1191 				req->io_req.br_offset = lba;
1192 				req->io_req.br_resid = 0;
1193 				req->io_req.br_param = req;
1194 			}
1195 
1196 			req->io_req.br_iov[iovidx].iov_base =
1197 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1198 			                     gpaddr, size);
1199 
1200 			req->io_req.br_iov[iovidx].iov_len = size;
1201 
1202 			req->prev_gpaddr = gpaddr;
1203 			req->prev_size = size;
1204 			req->io_req.br_resid += size;
1205 
1206 			req->io_req.br_iovcnt++;
1207 
1208 			pthread_mutex_unlock(&req->mtx);
1209 		}
1210 	} else {
1211 		/* RAM buffer: read/write directly */
1212 		void *p = sc->nvstore.ctx;
1213 		void *gptr;
1214 
1215 		if ((lba + size) > sc->nvstore.size) {
1216 			WPRINTF(("%s write would overflow RAM\r\n", __func__));
1217 			return (-1);
1218 		}
1219 
1220 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1221 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1222 		if (do_write)
1223 			memcpy(p, gptr, size);
1224 		else
1225 			memcpy(gptr, p, size);
1226 	}
1227 	return (0);
1228 }
1229 
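/*
 * Post a completion entry (with the correct phase tag) to the completion
 * queue associated with 'sq'. An MSI-X interrupt is raised if the queue has
 * interrupts enabled and the submission queue is no longer busy (or
 * ignore_busy is set).
 */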
1230 static void
1231 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1232 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1233 	uint32_t cdw0, uint16_t status, int ignore_busy)
1234 {
1235 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1236 	struct nvme_completion *compl;
1237 	int do_intr = 0;
1238 	int phase;
1239 
1240 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1241 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1242 		 NVME_STATUS_GET_SC(status)));
1243 
1244 	pthread_mutex_lock(&cq->mtx);
1245 
1246 	assert(cq->qbase != NULL);
1247 
1248 	compl = &cq->qbase[cq->tail];
1249 
1250 	compl->sqhd = atomic_load_acq_short(&sq->head);
1251 	compl->sqid = sqid;
1252 	compl->cid = cid;
1253 
1254 	/* toggle phase */
1255 	phase = NVME_STATUS_GET_P(compl->status);
1256 	compl->status = status;
1257 	pci_nvme_toggle_phase(&compl->status, phase);
1258 
1259 	cq->tail = (cq->tail + 1) % cq->size;
1260 
1261 	if (cq->intr_en & NVME_CQ_INTEN)
1262 		do_intr = 1;
1263 
1264 	pthread_mutex_unlock(&cq->mtx);
1265 
1266 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1267 		if (do_intr)
1268 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1269 }
1270 
1271 static void
1272 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1273 {
1274 	req->sc = NULL;
1275 	req->nvme_sq = NULL;
1276 	req->sqid = 0;
1277 
1278 	pthread_mutex_lock(&sc->mtx);
1279 
1280 	req->next = sc->ioreqs_free;
1281 	sc->ioreqs_free = req;
1282 	sc->pending_ios--;
1283 
1284 	/* when no more IO pending, can set to ready if device reset/enabled */
1285 	if (sc->pending_ios == 0 &&
1286 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1287 		sc->regs.csts |= NVME_CSTS_RDY;
1288 
1289 	pthread_mutex_unlock(&sc->mtx);
1290 
1291 	sem_post(&sc->iosemlock);
1292 }
1293 
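/*
 * Take an I/O request structure off the free list, blocking on the ioslots
 * semaphore until one becomes available.
 */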
1294 static struct pci_nvme_ioreq *
1295 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1296 {
1297 	struct pci_nvme_ioreq *req = NULL;
1298 
1299 	sem_wait(&sc->iosemlock);
1300 	pthread_mutex_lock(&sc->mtx);
1301 
1302 	req = sc->ioreqs_free;
1303 	assert(req != NULL);
1304 
1305 	sc->ioreqs_free = req->next;
1306 
1307 	req->next = NULL;
1308 	req->sc = sc;
1309 
1310 	sc->pending_ios++;
1311 
1312 	pthread_mutex_unlock(&sc->mtx);
1313 
1314 	req->io_req.br_iovcnt = 0;
1315 	req->io_req.br_offset = 0;
1316 	req->io_req.br_resid = 0;
1317 	req->io_req.br_param = req;
1318 	req->prev_gpaddr = 0;
1319 	req->prev_size = 0;
1320 
1321 	return req;
1322 }
1323 
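/*
 * blockif completion callback: translate the blockif error into an NVMe
 * status, post the completion, and return the ioreq to the free list.
 */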
1324 static void
1325 pci_nvme_io_done(struct blockif_req *br, int err)
1326 {
1327 	struct pci_nvme_ioreq *req = br->br_param;
1328 	struct nvme_submission_queue *sq = req->nvme_sq;
1329 	uint16_t code, status = 0;
1330 
1331 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1332 
1333 	/* TODO return correct error */
1334 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1335 	pci_nvme_status_genc(&status, code);
1336 
1337 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1338 	pci_nvme_release_ioreq(req->sc, req);
1339 }
1340 
1341 static void
1342 pci_nvme_io_partial(struct blockif_req *br, int err)
1343 {
1344 	struct pci_nvme_ioreq *req = br->br_param;
1345 
1346 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1347 
1348 	pthread_cond_signal(&req->cv);
1349 }
1350 
1351 
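/*
 * Consume new entries on an I/O Submission Queue: walk each command's PRP
 * entries (or PRP list), build a blockif request (or copy directly for
 * RAM-backed storage), and post completions for commands that finish
 * synchronously (FLUSH, write zeroes, RAM I/O, and errors).
 */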
1352 static void
1353 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1354 {
1355 	struct nvme_submission_queue *sq;
1356 	uint16_t status = 0;
1357 	uint16_t sqhead;
1358 	int err;
1359 
1360 	/* handle all submissions up to sq->tail index */
1361 	sq = &sc->submit_queues[idx];
1362 
1363 	if (atomic_testandset_int(&sq->busy, 1)) {
1364 		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1365 		return;
1366 	}
1367 
1368 	sqhead = atomic_load_acq_short(&sq->head);
1369 
1370 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1371 	         idx, sqhead, sq->tail, sq->qbase));
1372 
1373 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1374 		struct nvme_command *cmd;
1375 		struct pci_nvme_ioreq *req = NULL;
1376 		uint64_t lba;
1377 		uint64_t nblocks, bytes, size, cpsz;
1378 
1379 		/* TODO: support scatter gather list handling */
1380 
1381 		cmd = &sq->qbase[sqhead];
1382 		sqhead = (sqhead + 1) % sq->size;
1383 
1384 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1385 
1386 		if (cmd->opc == NVME_OPC_FLUSH) {
1387 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1388 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1389 			                        status, 1);
1390 
1391 			continue;
1392 		} else if (cmd->opc == 0x08) {
1393 			/* TODO: write zeroes */
1394 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1395 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1396 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1397 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1398 			                        status, 1);
1399 
1400 			continue;
1401 		}
1402 
1403 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1404 
1405 		bytes = nblocks * sc->nvstore.sectsz;
1406 
1407 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1408 			req = pci_nvme_get_ioreq(sc);
1409 			req->nvme_sq = sq;
1410 			req->sqid = idx;
1411 		}
1412 
1413 		/*
1414 		 * If data starts mid-page and flows into the next page, then
1415 		 * increase page count
1416 		 */
1417 
1418 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1419 		         "(%lu-bytes)\r\n",
1420 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1421 		         cmd->opc == NVME_OPC_WRITE ?
1422 			     "WRITE" : "READ",
1423 		         lba, nblocks, bytes));
1424 
1425 		cmd->prp1 &= ~(0x03UL);
1426 		cmd->prp2 &= ~(0x03UL);
1427 
1428 		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1429 
1430 		size = bytes;
1431 		lba *= sc->nvstore.sectsz;
1432 
1433 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1434 
1435 		if (cpsz > bytes)
1436 			cpsz = bytes;
1437 
1438 		if (req != NULL) {
1439 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1440 			                        cmd->cdw10;
1441 			req->opc = cmd->opc;
1442 			req->cid = cmd->cid;
1443 			req->nsid = cmd->nsid;
1444 		}
1445 
1446 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1447 		    cmd->opc == NVME_OPC_WRITE, lba);
1448 		lba += cpsz;
1449 		size -= cpsz;
1450 
1451 		if (size == 0)
1452 			goto iodone;
1453 
1454 		if (size <= PAGE_SIZE) {
1455 			/* prp2 is second (and final) page in transfer */
1456 
1457 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1458 			    size,
1459 			    cmd->opc == NVME_OPC_WRITE,
1460 			    lba);
1461 		} else {
1462 			uint64_t *prp_list;
1463 			int i;
1464 
1465 			/* prp2 is pointer to a physical region page list */
1466 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1467 			                            cmd->prp2, PAGE_SIZE);
1468 
1469 			i = 0;
1470 			while (size != 0) {
1471 				cpsz = MIN(size, PAGE_SIZE);
1472 
1473 				/*
1474 				 * Move to linked physical region page list
1475 				 * in last item.
1476 				 */
1477 				if (i == (NVME_PRP2_ITEMS-1) &&
1478 				    size > PAGE_SIZE) {
1479 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1480 					prp_list = paddr_guest2host(
1481 					              sc->nsc_pi->pi_vmctx,
1482 					              prp_list[i], PAGE_SIZE);
1483 					i = 0;
1484 				}
1485 				if (prp_list[i] == 0) {
1486 					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1487 					err = 1;
1488 					break;
1489 				}
1490 
1491 				err = pci_nvme_append_iov_req(sc, req,
1492 				    prp_list[i], cpsz,
1493 				    cmd->opc == NVME_OPC_WRITE, lba);
1494 				if (err)
1495 					break;
1496 
1497 				lba += cpsz;
1498 				size -= cpsz;
1499 				i++;
1500 			}
1501 		}
1502 
1503 iodone:
1504 		if (sc->nvstore.type == NVME_STOR_RAM) {
1505 			uint16_t code, status = 0;
1506 
1507 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1508 			    NVME_SC_SUCCESS;
1509 			pci_nvme_status_genc(&status, code);
1510 
1511 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1512 			                        status, 1);
1513 
1514 			continue;
1515 		}
1516 
1517 
1518 		if (err)
1519 			goto do_error;
1520 
1521 		req->io_req.br_callback = pci_nvme_io_done;
1522 
1523 		err = 0;
1524 		switch (cmd->opc) {
1525 		case NVME_OPC_READ:
1526 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1527 			break;
1528 		case NVME_OPC_WRITE:
1529 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1530 			break;
1531 		default:
1532 			WPRINTF(("%s unhandled io command 0x%x\r\n",
1533 				 __func__, cmd->opc));
1534 			err = 1;
1535 		}
1536 
1537 do_error:
1538 		if (err) {
1539 			uint16_t status = 0;
1540 
1541 			pci_nvme_status_genc(&status,
1542 			    NVME_SC_DATA_TRANSFER_ERROR);
1543 
1544 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1545 			                        status, 1);
1546 			pci_nvme_release_ioreq(sc, req);
1547 		}
1548 	}
1549 
1550 	atomic_store_short(&sq->head, sqhead);
1551 	atomic_store_int(&sq->busy, 0);
1552 }
1553 
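/*
 * Doorbell write: for a submission queue, record the new tail and kick
 * command processing; for a completion queue, record the guest's new head.
 */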
1554 static void
1555 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1556 	uint64_t idx, int is_sq, uint64_t value)
1557 {
1558 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1559 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1560 
1561 	if (is_sq) {
1562 		atomic_store_short(&sc->submit_queues[idx].tail,
1563 		                   (uint16_t)value);
1564 
1565 		if (idx == 0) {
1566 			pci_nvme_handle_admin_cmd(sc, value);
1567 		} else {
1568 			/* submission queue; handle new entries in SQ */
1569 			if (idx > sc->num_squeues) {
1570 				WPRINTF(("%s SQ index %lu overflow from "
1571 				         "guest (max %u)\r\n",
1572 				         __func__, idx, sc->num_squeues));
1573 				return;
1574 			}
1575 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1576 		}
1577 	} else {
1578 		if (idx > sc->num_cqueues) {
1579 			WPRINTF(("%s queue index %lu overflow from "
1580 			         "guest (max %u)\r\n",
1581 			         __func__, idx, sc->num_cqueues));
1582 			return;
1583 		}
1584 
1585 		sc->compl_queues[idx].head = (uint16_t)value;
1586 	}
1587 }
1588 
1589 static void
1590 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1591 {
1592 	const char *s = iswrite ? "WRITE" : "READ";
1593 
1594 	switch (offset) {
1595 	case NVME_CR_CAP_LOW:
1596 		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1597 		break;
1598 	case NVME_CR_CAP_HI:
1599 		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1600 		break;
1601 	case NVME_CR_VS:
1602 		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1603 		break;
1604 	case NVME_CR_INTMS:
1605 		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1606 		break;
1607 	case NVME_CR_INTMC:
1608 		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1609 		break;
1610 	case NVME_CR_CC:
1611 		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1612 		break;
1613 	case NVME_CR_CSTS:
1614 		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1615 		break;
1616 	case NVME_CR_NSSR:
1617 		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1618 		break;
1619 	case NVME_CR_AQA:
1620 		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1621 		break;
1622 	case NVME_CR_ASQ_LOW:
1623 		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1624 		break;
1625 	case NVME_CR_ASQ_HI:
1626 		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1627 		break;
1628 	case NVME_CR_ACQ_LOW:
1629 		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1630 		break;
1631 	case NVME_CR_ACQ_HI:
1632 		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1633 		break;
1634 	default:
1635 		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1636 	}
1637 
1638 }
1639 
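/*
 * Guest write to BAR0: doorbell writes are dispatched to the queue
 * handlers; controller register writes (CC, AQA, ASQ, ACQ, ...) update the
 * emulated register state under sc->mtx.
 */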
1640 static void
1641 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1642 	uint64_t offset, int size, uint64_t value)
1643 {
1644 	uint32_t ccreg;
1645 
1646 	if (offset >= NVME_DOORBELL_OFFSET) {
1647 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1648 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1649 		int is_sq = (belloffset % 8) < 4;
1650 
1651 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1652 			WPRINTF(("guest attempted an overflow write offset "
1653 			         "0x%lx, val 0x%lx in %s",
1654 			         offset, value, __func__));
1655 			return;
1656 		}
1657 
1658 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1659 		return;
1660 	}
1661 
1662 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1663 	        offset, size, value));
1664 
1665 	if (size != 4) {
1666 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1667 		         "val 0x%lx) to bar0 in %s",
1668 		         size, offset, value, __func__));
1669 		/* TODO: shutdown device */
1670 		return;
1671 	}
1672 
1673 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1674 
1675 	pthread_mutex_lock(&sc->mtx);
1676 
1677 	switch (offset) {
1678 	case NVME_CR_CAP_LOW:
1679 	case NVME_CR_CAP_HI:
1680 		/* readonly */
1681 		break;
1682 	case NVME_CR_VS:
1683 		/* readonly */
1684 		break;
1685 	case NVME_CR_INTMS:
1686 		/* MSI-X, so ignore */
1687 		break;
1688 	case NVME_CR_INTMC:
1689 		/* MSI-X, so ignore */
1690 		break;
1691 	case NVME_CR_CC:
1692 		ccreg = (uint32_t)value;
1693 
1694 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1695 		         "iocqes %u\r\n",
1696 		        __func__,
1697 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1698 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1699 			 NVME_CC_GET_IOCQES(ccreg)));
1700 
1701 		if (NVME_CC_GET_SHN(ccreg)) {
1702 			/* perform shutdown - flush out data to backend */
1703 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1704 			    NVME_CSTS_REG_SHST_SHIFT);
1705 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1706 			    NVME_CSTS_REG_SHST_SHIFT;
1707 		}
1708 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1709 			if (NVME_CC_GET_EN(ccreg) == 0)
1710 				/* transition 1->0 causes controller reset */
1711 				pci_nvme_reset_locked(sc);
1712 			else
1713 				pci_nvme_init_controller(ctx, sc);
1714 		}
1715 
1716 		/* Insert the iocqes, iosqes and en bits from the write */
1717 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1718 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1719 		if (NVME_CC_GET_EN(ccreg) == 0) {
1720 			/* Insert the ams, mps and css bit fields */
1721 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1722 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1723 			sc->regs.csts &= ~NVME_CSTS_RDY;
1724 		} else if (sc->pending_ios == 0) {
1725 			sc->regs.csts |= NVME_CSTS_RDY;
1726 		}
1727 		break;
1728 	case NVME_CR_CSTS:
1729 		break;
1730 	case NVME_CR_NSSR:
1731 		/* ignore writes; don't support subsystem reset */
1732 		break;
1733 	case NVME_CR_AQA:
1734 		sc->regs.aqa = (uint32_t)value;
1735 		break;
1736 	case NVME_CR_ASQ_LOW:
1737 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1738 		               (0xFFFFF000 & value);
1739 		break;
1740 	case NVME_CR_ASQ_HI:
1741 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1742 		               (value << 32);
1743 		break;
1744 	case NVME_CR_ACQ_LOW:
1745 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1746 		               (0xFFFFF000 & value);
1747 		break;
1748 	case NVME_CR_ACQ_HI:
1749 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1750 		               (value << 32);
1751 		break;
1752 	default:
1753 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1754 		         __func__, offset, value, size));
1755 	}
1756 	pthread_mutex_unlock(&sc->mtx);
1757 }
1758 
1759 static void
1760 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1761                 int baridx, uint64_t offset, int size, uint64_t value)
1762 {
1763 	struct pci_nvme_softc* sc = pi->pi_arg;
1764 
1765 	if (baridx == pci_msix_table_bar(pi) ||
1766 	    baridx == pci_msix_pba_bar(pi)) {
1767 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1768 		         " value 0x%lx\r\n", baridx, offset, size, value));
1769 
1770 		pci_emul_msix_twrite(pi, offset, size, value);
1771 		return;
1772 	}
1773 
1774 	switch (baridx) {
1775 	case 0:
1776 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1777 		break;
1778 
1779 	default:
1780 		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1781 		         __func__, baridx, value));
1782 	}
1783 }
1784 
1785 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1786 	uint64_t offset, int size)
1787 {
1788 	uint64_t value;
1789 
1790 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1791 
1792 	if (offset < NVME_DOORBELL_OFFSET) {
1793 		void *p = &(sc->regs);
1794 		pthread_mutex_lock(&sc->mtx);
1795 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1796 		pthread_mutex_unlock(&sc->mtx);
1797 	} else {
1798 		value = 0;
1799 		WPRINTF(("pci_nvme: read invalid offset %lu\r\n", offset));
1800 	}
1801 
1802 	switch (size) {
1803 	case 1:
1804 		value &= 0xFF;
1805 		break;
1806 	case 2:
1807 		value &= 0xFFFF;
1808 		break;
1809 	case 4:
1810 		value &= 0xFFFFFFFF;
1811 		break;
1812 	}
1813 
1814 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1815 	         offset, size, (uint32_t)value));
1816 
1817 	return (value);
1818 }
1819 
1820 
1821 
1822 static uint64_t
1823 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1824     uint64_t offset, int size)
1825 {
1826 	struct pci_nvme_softc* sc = pi->pi_arg;
1827 
1828 	if (baridx == pci_msix_table_bar(pi) ||
1829 	    baridx == pci_msix_pba_bar(pi)) {
1830 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1831 		        baridx, offset, size));
1832 
1833 		return pci_emul_msix_tread(pi, offset, size);
1834 	}
1835 
1836 	switch (baridx) {
1837 	case 0:
1838 		return pci_nvme_read_bar_0(sc, offset, size);
1839 
1840 	default:
1841 		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1842 	}
1843 
1844 	return (0);
1845 }
1846 
1847 
1848 static int
1849 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1850 {
1851 	char bident[sizeof("XX:X:X")];
1852 	char	*uopt, *xopts, *config;
1853 	uint32_t sectsz;
1854 	int optidx;
1855 
1856 	sc->max_queues = NVME_QUEUES;
1857 	sc->max_qentries = NVME_MAX_QENTRIES;
1858 	sc->ioslots = NVME_IOSLOTS;
1859 	sc->num_squeues = sc->max_queues;
1860 	sc->num_cqueues = sc->max_queues;
1861 	sectsz = 0;
1862 
1863 	uopt = strdup(opts);
1864 	optidx = 0;
1865 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1866 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
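	/*
	 * Options are comma-separated name=value pairs; an unrecognized
	 * option in the first position is treated as the backing store
	 * devpath.
	 */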
1867 	for (xopts = strtok(uopt, ",");
1868 	     xopts != NULL;
1869 	     xopts = strtok(NULL, ",")) {
1870 
1871 		if ((config = strchr(xopts, '=')) != NULL)
1872 			*config++ = '\0';
1873 
1874 		if (!strcmp("maxq", xopts)) {
1875 			sc->max_queues = atoi(config);
1876 		} else if (!strcmp("qsz", xopts)) {
1877 			sc->max_qentries = atoi(config);
1878 		} else if (!strcmp("ioslots", xopts)) {
1879 			sc->ioslots = atoi(config);
1880 		} else if (!strcmp("sectsz", xopts)) {
1881 			sectsz = atoi(config);
1882 		} else if (!strcmp("ser", xopts)) {
1883 			/*
1884 			 * This field indicates the Product Serial Number in
1885 			 * 7-bit ASCII, unused bytes should be space characters.
1886 			 * Ref: NVMe v1.3c.
1887 			 */
1888 			cpywithpad((char *)sc->ctrldata.sn,
1889 			           sizeof(sc->ctrldata.sn), config, ' ');
1890 		} else if (!strcmp("ram", xopts)) {
1891 			uint64_t sz = strtoull(config, NULL, 10);
1892 
1893 			sc->nvstore.type = NVME_STOR_RAM;
1894 			sc->nvstore.size = sz * 1024 * 1024;
1895 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1896 			sc->nvstore.sectsz = 4096;
1897 			sc->nvstore.sectsz_bits = 12;
1898 			if (sc->nvstore.ctx == NULL) {
1899 				perror("Unable to allocate RAM");
1900 				free(uopt);
1901 				return (-1);
1902 			}
1903 		} else if (!strcmp("eui64", xopts)) {
1904 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
1905 		} else if (optidx == 0) {
1906 			snprintf(bident, sizeof(bident), "%d:%d",
1907 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1908 			sc->nvstore.ctx = blockif_open(xopts, bident);
1909 			if (sc->nvstore.ctx == NULL) {
1910 				perror("Could not open backing file");
1911 				free(uopt);
1912 				return (-1);
1913 			}
1914 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1915 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1916 		} else {
1917 			fprintf(stderr, "Invalid option %s\n", xopts);
1918 			free(uopt);
1919 			return (-1);
1920 		}
1921 
1922 		optidx++;
1923 	}
1924 	free(uopt);
1925 
1926 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1927 		fprintf(stderr, "backing store not specified\n");
1928 		return (-1);
1929 	}
1930 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1931 		sc->nvstore.sectsz = sectsz;
1932 	else if (sc->nvstore.type != NVME_STOR_RAM)
1933 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
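	/* Derive log2 of the sector size, starting at 512 (2^9). */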
1934 	for (sc->nvstore.sectsz_bits = 9;
1935 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1936 	     sc->nvstore.sectsz_bits++);
1937 
1938 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1939 		sc->max_queues = NVME_QUEUES;
1940 
1941 	if (sc->max_qentries <= 0) {
1942 		fprintf(stderr, "Invalid qsz option\n");
1943 		return (-1);
1944 	}
1945 	if (sc->ioslots <= 0) {
1946 		fprintf(stderr, "Invalid ioslots option\n");
1947 		return (-1);
1948 	}
1949 
1950 	return (0);
1951 }
1952 
1953 static int
1954 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1955 {
1956 	struct pci_nvme_softc *sc;
1957 	uint32_t pci_membar_sz;
1958 	int	error;
1959 
1960 	error = 0;
1961 
1962 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1963 	pi->pi_arg = sc;
1964 	sc->nsc_pi = pi;
1965 
1966 	error = pci_nvme_parse_opts(sc, opts);
1967 	if (error < 0)
1968 		goto done;
1969 	else
1970 		error = 0;
1971 
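	/*
	 * Pre-allocate the I/O request slots and chain them into a singly
	 * linked free list headed by ioreqs_free.
	 */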
1972 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1973 	for (int i = 0; i < sc->ioslots; i++) {
1974 		if (i < (sc->ioslots-1))
1975 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1976 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1977 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1978 	}
1979 	sc->ioreqs_free = sc->ioreqs;
1980 	sc->intr_coales_aggr_thresh = 1;
1981 
1982 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1983 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1984 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1985 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1986 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1987 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1988 
1989 	/*
1990 	 * Allocate size of NVMe registers + doorbell space for all queues.
1991 	 *
1992 	 * The specification requires a minimum memory I/O window size of 16K.
1993 	 * The Windows driver will refuse to start a device with a smaller
1994 	 * window.
1995 	 */
1996 	pci_membar_sz = sizeof(struct nvme_registers) +
1997 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
1998 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
1999 
2000 	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
2001 
2002 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2003 	if (error) {
2004 		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
2005 		goto done;
2006 	}
2007 
2008 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2009 	if (error) {
2010 		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
2011 		goto done;
2012 	}
2013 
2014 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2015 	if (error) {
2016 		WPRINTF(("%s pci add Express capability failed\r\n", __func__));
2017 		goto done;
2018 	}
2019 
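	/*
	 * iosemlock starts at ioslots so that at most ioslots I/O requests
	 * can be in flight at once.
	 */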
2020 	pthread_mutex_init(&sc->mtx, NULL);
2021 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2022 
2023 	pci_nvme_reset(sc);
2024 	pci_nvme_init_ctrldata(sc);
2025 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64);
2026 	pci_nvme_init_logpages(sc);
2027 
2028 	pci_lintr_request(pi);
2029 
2030 done:
2031 	return (error);
2032 }
2033 
2034 
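/*
 * PCI_EMUL_SET() registers this emulation under the name "nvme" so the
 * bhyve device model can attach it through these callbacks.
 */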
2035 struct pci_devemu pci_de_nvme = {
2036 	.pe_emu =	"nvme",
2037 	.pe_init =	pci_nvme_init,
2038 	.pe_barwrite =	pci_nvme_write,
2039 	.pe_barread =	pci_nvme_read
2040 };
2041 PCI_EMUL_SET(pci_de_nvme);
2042