xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 99282790b7d01ec3c4072621d46a0d7302517ad4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
50  */
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 
63 #include <assert.h>
64 #include <pthread.h>
65 #include <semaphore.h>
66 #include <stdbool.h>
67 #include <stddef.h>
68 #include <stdint.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72 
73 #include <machine/atomic.h>
74 #include <machine/vmm.h>
75 #include <vmmapi.h>
76 
77 #include <dev/nvme/nvme.h>
78 
79 #include "bhyverun.h"
80 #include "block_if.h"
81 #include "debug.h"
82 #include "pci_emul.h"
83 
84 
/*
 * Debug logging helpers.
 *
 * nvme_debug gates DPRINTF output at run time; WPRINTF always prints.
 * Both take a single parenthesized PRINTLN argument list, e.g.
 *	DPRINTF(("%s value %d", __func__, v));
 *
 * DPRINTF is wrapped in do { } while (0) so that it always expands to
 * exactly one statement. A bare "if" would silently capture the "else"
 * of an enclosing unbraced if/else.
 */
static int nvme_debug = 0;
#define	DPRINTF(params)	do { if (nvme_debug) PRINTLN params; } while (0)
#define	WPRINTF(params) PRINTLN params
88 
/* defaults; can be overridden */
/* NOTE(review): presumably the BAR index holding the MSI-X table; usage is outside this section */
#define	NVME_MSIX_BAR		4

/* NOTE(review): presumably the default for the "ioslots" option (max concurrent io requests) */
#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

/* NOTE(review): presumably the defaults for the "maxq" and "qsz" options; confirm at the option parser */
#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* Number of 64-bit entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Upper bound on iovecs per request; sizes the padding in struct pci_nvme_ioreq */
#define	NVME_MAX_BLOCKIOVS	512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
/* True when completion (c) carries a real status, i.e. not NVME_NO_STATUS */
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
106 
/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features
 *
 * Bits 15:0 hold the zero-based submission queue count and bits 31:16
 * the zero-based completion queue count, both taken from (sc).
 *
 * The expansion is fully parenthesized and carries no trailing semicolon
 * so the macro can be used inside a larger expression as well as on the
 * right-hand side of an assignment.
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
118 
/* Byte offset of the doorbell member within struct nvme_registers */
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

/*
 * Offsets of the individual controller registers.
 * NOTE(review): presumably byte offsets from the start of the register
 * BAR, used to decode guest MMIO accesses; the decoder is outside this
 * section.
 */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
136 
/*
 * Bit masks applied to command dword 11 of the Create I/O queue commands.
 */
enum nvme_cmd_cdw11 {
	/* Tested by the create-queue handlers; when clear the request is rejected as non-contiguous */
	NVME_CMD_CDW11_PC  = 0x0001,
	/* Interrupt enable; stored (shifted down) into the completion queue's intr_en */
	NVME_CMD_CDW11_IEN = 0x0002,
	/* Upper 16 bits of the dword. NOTE(review): presumably the interrupt vector field */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
142 
/* Transfer direction for nvme_prp_memcpy() */
enum nvme_copy_dir {
	/* Copy from the caller's buffer into the guest memory named by the PRPs */
	NVME_COPY_TO_PRP,
	/* Copy from the guest memory named by the PRPs into the caller's buffer */
	NVME_COPY_FROM_PRP,
};
147 
/* Flag bits kept in nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

/*
 * Emulation-side state for one completion queue (Admin or IO).
 */
struct nvme_completion_queue {
	/* Host mapping of the guest's queue memory; set to NULL on delete/reset */
	struct nvme_completion *qbase;
	/* Number of entries (one-based) */
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	/* Interrupt vector taken from the Create I/O CQ command */
	uint16_t	intr_vec;
	/* NVME_CQ_INTEN / NVME_CQ_INTCOAL flags */
	uint32_t	intr_en;
	/* NOTE(review): presumably serializes posting of completions; users are outside this section */
	pthread_mutex_t	mtx;
};
160 
/*
 * Emulation-side state for one submission queue (Admin or IO).
 */
struct nvme_submission_queue {
	/* Host mapping of the guest's queue memory; set to NULL on delete/reset */
	struct nvme_command *qbase;
	/* Number of entries (one-based) */
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		busy; /* queue is being processed */
	/* Two-bit priority field taken from the Create I/O SQ command */
	int		qpriority;
};
170 
/*
 * Kind of backing store behind the namespace.
 * NOTE(review): presumably BLOCKIF covers the /dev/blockdev and image-file
 * device paths and RAM the "ram=size_in_MiB" form from the options comment.
 */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};
175 
/*
 * Description of the storage backing the emulated namespace.
 */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	/* Backend handle; passed to blockif_candelete() when type is NVME_STOR_BLOCKIF */
	void		*ctx;
	/* Total size; divided by sectsz to obtain the namespace size in blocks */
	uint64_t	size;
	/* Sector size used as the divisor for the block count */
	uint32_t	sectsz;
	/* Value reported as LBADS (LBA data size is 2^lbads) */
	uint32_t	sectsz_bits;
	/* EUI-64; when 0, pci_nvme_init_nsdata() generates one */
	uint64_t	eui64;
	/* Set from blockif_candelete(); drives the DSM bit in ONCS in AUTO mode */
	uint32_t	deallocate:1;
};
185 
/*
 * Per-request bookkeeping for an IO command.
 * NOTE(review): the code that fills in and consumes these fields is
 * outside this section; field notes below are limited to what the
 * declarations themselves show.
 */
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	/* Linkage for the softc's ioreqs_free list */
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	/* NOTE(review): presumably the previous guest segment, kept to merge adjacent ranges — confirm */
	uint64_t	prev_gpaddr;
	size_t		prev_size;

	/*
	 * lock if all iovs consumed (big IO);
	 * complete transaction before continuing
	 */
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;

	struct blockif_req io_req;

	/* pad to fit up to 512 page descriptors from guest IO request */
	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
};
212 
/*
 * Policy for advertising Dataset Management support in the controller's
 * ONCS field; consumed by pci_nvme_init_ctrldata().
 */
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
221 
/*
 * Per-device state of the emulated NVMe controller.
 */
struct pci_nvme_softc {
	/* Owning PCI device instance; supplies the VM context and bus/slot/func */
	struct pci_devinst *nsc_pi;

	/* Held by pci_nvme_reset() around pci_nvme_reset_locked() */
	pthread_mutex_t	mtx;

	/* Controller register file */
	struct nvme_registers regs;

	/* Data returned by the Identify and Get Log Page admin commands */
	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	/* Current IO queue counts; reset to max_queues, lowered by Set Features */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* controller features */
	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
	uint32_t	async_ev_config;         /* 0x0B: async event config */

	enum nvme_dsm_type dataset_management;
};
262 
263 
/* Forward declaration; the definition lies outside this section */
static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
/* Each getter shifts the named field of a CC register value down and masks it */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/*
 * In-place mask covering the EN, IOSQES and IOCQES fields.
 * NOTE(review): presumably the CC bits a guest may always write; the
 * register write handler is outside this section.
 */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/*
 * In-place mask covering the CSS, MPS and AMS fields.
 * NOTE(review): presumably the CC bits writable only while the controller
 * is not enabled ("NEN") — confirm against the register write handler.
 */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
/* Extract the RDY field from a CSTS register value */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

/* RDY bit in its register position */
#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
/* Phase bit in its position within the status word */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
/* Status Code Type and Status Code fields, in place; cleared before a new status is written */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/* Dataset Management bit of the controller's ONCS field, in place */
#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
302 
/*
 * Fill the fixed-width field dst (dst_size bytes) with the pad character,
 * then overlay the leading bytes of the C string src. At most dst_size
 * bytes of src are examined or copied, and no NUL terminator is written.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	const char *nul;
	size_t ncopy;

	/* Length of src, capped at the width of the destination field */
	nul = memchr(src, '\0', dst_size);
	ncopy = (nul != NULL) ? (size_t)(nul - src) : dst_size;

	memset(dst, pad, dst_size);
	memcpy(dst, src, ncopy);
}
312 
313 static __inline void
314 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
315 {
316 
317 	*status &= ~NVME_STATUS_MASK;
318 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
319 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
320 }
321 
/*
 * Set a status of type NVME_SCT_GENERIC with the given status code in
 * *status; shorthand for pci_nvme_status_tc().
 */
static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
328 
329 static __inline void
330 pci_nvme_toggle_phase(uint16_t *status, int prev)
331 {
332 
333 	if (prev)
334 		*status &= ~NVME_STATUS_P;
335 	else
336 		*status |= NVME_STATUS_P;
337 }
338 
339 static void
340 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
341 {
342 	struct nvme_controller_data *cd = &sc->ctrldata;
343 
344 	cd->vid = 0xFB5D;
345 	cd->ssvid = 0x0000;
346 
347 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
348 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
349 
350 	/* Num of submission commands that we can handle at a time (2^rab) */
351 	cd->rab   = 4;
352 
353 	/* FreeBSD OUI */
354 	cd->ieee[0] = 0x58;
355 	cd->ieee[1] = 0x9c;
356 	cd->ieee[2] = 0xfc;
357 
358 	cd->mic = 0;
359 
360 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
361 
362 	cd->ver = 0x00010300;
363 
364 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
365 	cd->acl = 2;
366 	cd->aerl = 4;
367 
368 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
369 	cd->elpe = 0;	/* max error log page entries */
370 	cd->npss = 1;	/* number of power states support */
371 
372 	/* Warning Composite Temperature Threshold */
373 	cd->wctemp = 0x0157;
374 
375 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
376 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
377 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
378 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
379 	cd->nn = 1;	/* number of namespaces */
380 
381 	cd->oncs = 0;
382 	switch (sc->dataset_management) {
383 	case NVME_DATASET_MANAGEMENT_AUTO:
384 		if (sc->nvstore.deallocate)
385 			cd->oncs |= NVME_ONCS_DSM;
386 		break;
387 	case NVME_DATASET_MANAGEMENT_ENABLE:
388 		cd->oncs |= NVME_ONCS_DSM;
389 		break;
390 	default:
391 		break;
392 	}
393 
394 	cd->fna = 0x03;
395 
396 	cd->power_state[0].mp = 10;
397 }
398 
/*
 * Fold len bytes of buffer into the running CRC-16 value crc using a
 * byte-wise table lookup, and return the updated value.
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *bytes = buffer;
	unsigned int pos;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	for (pos = 0; pos < len; pos++) {
		/* Low byte of the running CRC mixed with the next input byte */
		unsigned int slot = (crc ^ bytes[pos]) & 0xffU;

		crc = (((crc >> 8) & 0xffU) ^ crc16_table[slot]) & 0x0000ffffU;
	}

	return crc;
}
448 
449 static void
450 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
451     struct nvme_namespace_data *nd, uint32_t nsid,
452     struct pci_nvme_blockstore *nvstore)
453 {
454 
455 	/* Get capacity and block size information from backing store */
456 	nd->nsze = nvstore->size / nvstore->sectsz;
457 	nd->ncap = nd->nsze;
458 	nd->nuse = nd->nsze;
459 
460 	if (nvstore->type == NVME_STOR_BLOCKIF)
461 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
462 
463 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
464 	nd->flbas = 0;
465 
466 	/* Create an EUI-64 if user did not provide one */
467 	if (nvstore->eui64 == 0) {
468 		char *data = NULL;
469 		uint64_t eui64 = nvstore->eui64;
470 
471 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
472 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
473 
474 		if (data != NULL) {
475 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
476 			free(data);
477 		}
478 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
479 	}
480 	be64enc(nd->eui64, nvstore->eui64);
481 
482 	/* LBA data-sz = 2^lbads */
483 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
484 }
485 
/*
 * Zero the error, health and firmware log pages held in the softc.
 */
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}
494 
495 static void
496 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
497 {
498 	DPRINTF(("%s", __func__));
499 
500 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
501 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
502 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
503 
504 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
505 
506 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
507 
508 	sc->regs.cc = 0;
509 	sc->regs.csts = 0;
510 
511 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
512 	if (sc->submit_queues != NULL) {
513 		for (int i = 0; i < sc->num_squeues + 1; i++) {
514 			/*
515 			 * The Admin Submission Queue is at index 0.
516 			 * It must not be changed at reset otherwise the
517 			 * emulation will be out of sync with the guest.
518 			 */
519 			if (i != 0) {
520 				sc->submit_queues[i].qbase = NULL;
521 				sc->submit_queues[i].size = 0;
522 				sc->submit_queues[i].cqid = 0;
523 			}
524 			sc->submit_queues[i].tail = 0;
525 			sc->submit_queues[i].head = 0;
526 			sc->submit_queues[i].busy = 0;
527 		}
528 	} else
529 		sc->submit_queues = calloc(sc->num_squeues + 1,
530 		                        sizeof(struct nvme_submission_queue));
531 
532 	if (sc->compl_queues != NULL) {
533 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
534 			/* See Admin Submission Queue note above */
535 			if (i != 0) {
536 				sc->compl_queues[i].qbase = NULL;
537 				sc->compl_queues[i].size = 0;
538 			}
539 
540 			sc->compl_queues[i].tail = 0;
541 			sc->compl_queues[i].head = 0;
542 		}
543 	} else {
544 		sc->compl_queues = calloc(sc->num_cqueues + 1,
545 		                        sizeof(struct nvme_completion_queue));
546 
547 		for (int i = 0; i < sc->num_cqueues + 1; i++)
548 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
549 	}
550 }
551 
/*
 * Reset the controller: takes sc->mtx, runs pci_nvme_reset_locked() and
 * releases the mutex again.
 */
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
559 
560 static void
561 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
562 {
563 	uint16_t acqs, asqs;
564 
565 	DPRINTF(("%s", __func__));
566 
567 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
568 	sc->submit_queues[0].size = asqs;
569 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
570 	            sizeof(struct nvme_command) * asqs);
571 
572 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
573 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
574 
575 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
576 	    NVME_AQA_REG_ACQS_MASK) + 1;
577 	sc->compl_queues[0].size = acqs;
578 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
579 	         sizeof(struct nvme_completion) * acqs);
580 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
581 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
582 }
583 
584 static int
585 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
586 	size_t len, enum nvme_copy_dir dir)
587 {
588 	uint8_t *p;
589 	size_t bytes;
590 
591 	if (len > (8 * 1024)) {
592 		return (-1);
593 	}
594 
595 	/* Copy from the start of prp1 to the end of the physical page */
596 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
597 	bytes = MIN(bytes, len);
598 
599 	p = vm_map_gpa(ctx, prp1, bytes);
600 	if (p == NULL) {
601 		return (-1);
602 	}
603 
604 	if (dir == NVME_COPY_TO_PRP)
605 		memcpy(p, b, bytes);
606 	else
607 		memcpy(b, p, bytes);
608 
609 	b += bytes;
610 
611 	len -= bytes;
612 	if (len == 0) {
613 		return (0);
614 	}
615 
616 	len = MIN(len, PAGE_SIZE);
617 
618 	p = vm_map_gpa(ctx, prp2, len);
619 	if (p == NULL) {
620 		return (-1);
621 	}
622 
623 	if (dir == NVME_COPY_TO_PRP)
624 		memcpy(p, b, len);
625 	else
626 		memcpy(b, p, len);
627 
628 	return (0);
629 }
630 
631 static int
632 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
633 	struct nvme_completion* compl)
634 {
635 	uint16_t qid = command->cdw10 & 0xffff;
636 
637 	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
638 	if (qid == 0 || qid > sc->num_squeues) {
639 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
640 		        __func__, qid, sc->num_squeues));
641 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
642 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
643 		return (1);
644 	}
645 
646 	sc->submit_queues[qid].qbase = NULL;
647 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
648 	return (1);
649 }
650 
651 static int
652 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
653 	struct nvme_completion* compl)
654 {
655 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
656 		uint16_t qid = command->cdw10 & 0xffff;
657 		struct nvme_submission_queue *nsq;
658 
659 		if ((qid == 0) || (qid > sc->num_squeues)) {
660 			WPRINTF(("%s queue index %u > num_squeues %u",
661 			        __func__, qid, sc->num_squeues));
662 			pci_nvme_status_tc(&compl->status,
663 			    NVME_SCT_COMMAND_SPECIFIC,
664 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
665 			return (1);
666 		}
667 
668 		nsq = &sc->submit_queues[qid];
669 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
670 
671 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
672 		              sizeof(struct nvme_command) * (size_t)nsq->size);
673 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
674 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
675 
676 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
677 		        qid, nsq->size, nsq->qbase, nsq->cqid));
678 
679 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
680 
681 		DPRINTF(("%s completed creating IOSQ qid %u",
682 		         __func__, qid));
683 	} else {
684 		/*
685 		 * Guest sent non-cont submission queue request.
686 		 * This setting is unsupported by this emulation.
687 		 */
688 		WPRINTF(("%s unsupported non-contig (list-based) "
689 		         "create i/o submission queue", __func__));
690 
691 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
692 	}
693 	return (1);
694 }
695 
696 static int
697 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
698 	struct nvme_completion* compl)
699 {
700 	uint16_t qid = command->cdw10 & 0xffff;
701 
702 	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
703 	if (qid == 0 || qid > sc->num_cqueues) {
704 		WPRINTF(("%s queue index %u / num_cqueues %u",
705 		        __func__, qid, sc->num_cqueues));
706 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
707 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
708 		return (1);
709 	}
710 
711 	sc->compl_queues[qid].qbase = NULL;
712 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
713 	return (1);
714 }
715 
716 static int
717 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
718 	struct nvme_completion* compl)
719 {
720 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
721 		uint16_t qid = command->cdw10 & 0xffff;
722 		struct nvme_completion_queue *ncq;
723 
724 		if ((qid == 0) || (qid > sc->num_cqueues)) {
725 			WPRINTF(("%s queue index %u > num_cqueues %u",
726 			        __func__, qid, sc->num_cqueues));
727 			pci_nvme_status_tc(&compl->status,
728 			    NVME_SCT_COMMAND_SPECIFIC,
729 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
730 			return (1);
731 		}
732 
733 		ncq = &sc->compl_queues[qid];
734 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
735 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
736 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
737 
738 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
739 		             command->prp1,
740 		             sizeof(struct nvme_command) * (size_t)ncq->size);
741 
742 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
743 	} else {
744 		/*
745 		 * Non-contig completion queue unsupported.
746 		 */
747 		WPRINTF(("%s unsupported non-contig (list-based) "
748 		         "create i/o completion queue",
749 		         __func__));
750 
751 		/* 0x12 = Invalid Use of Controller Memory Buffer */
752 		pci_nvme_status_genc(&compl->status, 0x12);
753 	}
754 
755 	return (1);
756 }
757 
758 static int
759 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
760 	struct nvme_completion* compl)
761 {
762 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2;
763 	uint8_t logpage = command->cdw10 & 0xFF;
764 
765 	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
766 
767 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
768 
769 	switch (logpage) {
770 	case NVME_LOG_ERROR:
771 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
772 		    command->prp2, (uint8_t *)&sc->err_log, logsize,
773 		    NVME_COPY_TO_PRP);
774 		break;
775 	case NVME_LOG_HEALTH_INFORMATION:
776 		/* TODO: present some smart info */
777 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
778 		    command->prp2, (uint8_t *)&sc->health_log, logsize,
779 		    NVME_COPY_TO_PRP);
780 		break;
781 	case NVME_LOG_FIRMWARE_SLOT:
782 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
783 		    command->prp2, (uint8_t *)&sc->fw_log, logsize,
784 		    NVME_COPY_TO_PRP);
785 		break;
786 	default:
787 		WPRINTF(("%s get log page %x command not supported",
788 		        __func__, logpage));
789 
790 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
791 		    NVME_SC_INVALID_LOG_PAGE);
792 	}
793 
794 	return (1);
795 }
796 
797 static int
798 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
799 	struct nvme_completion* compl)
800 {
801 	void *dest;
802 
803 	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
804 	        command->cdw10 & 0xFF, command->nsid));
805 
806 	switch (command->cdw10 & 0xFF) {
807 	case 0x00: /* return Identify Namespace data structure */
808 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
809 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
810 		    NVME_COPY_TO_PRP);
811 		break;
812 	case 0x01: /* return Identify Controller data structure */
813 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
814 		    command->prp2, (uint8_t *)&sc->ctrldata,
815 		    sizeof(sc->ctrldata),
816 		    NVME_COPY_TO_PRP);
817 		break;
818 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
819 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
820 		                  sizeof(uint32_t) * 1024);
821 		((uint32_t *)dest)[0] = 1;
822 		((uint32_t *)dest)[1] = 0;
823 		break;
824 	case 0x11:
825 		pci_nvme_status_genc(&compl->status,
826 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
827 		return (1);
828 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
829 	case 0x10:
830 	case 0x12:
831 	case 0x13:
832 	case 0x14:
833 	case 0x15:
834 	default:
835 		DPRINTF(("%s unsupported identify command requested 0x%x",
836 		         __func__, command->cdw10 & 0xFF));
837 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
838 		return (1);
839 	}
840 
841 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
842 	return (1);
843 }
844 
845 static int
846 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
847 	struct nvme_completion* compl)
848 {
849 	uint16_t nqr;	/* Number of Queues Requested */
850 
851 	nqr = command->cdw11 & 0xFFFF;
852 	if (nqr == 0xffff) {
853 		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
854 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
855 		return (-1);
856 	}
857 
858 	sc->num_squeues = ONE_BASED(nqr);
859 	if (sc->num_squeues > sc->max_queues) {
860 		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
861 					sc->max_queues));
862 		sc->num_squeues = sc->max_queues;
863 	}
864 
865 	nqr = (command->cdw11 >> 16) & 0xFFFF;
866 	if (nqr == 0xffff) {
867 		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
868 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
869 		return (-1);
870 	}
871 
872 	sc->num_cqueues = ONE_BASED(nqr);
873 	if (sc->num_cqueues > sc->max_queues) {
874 		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
875 					sc->max_queues));
876 		sc->num_cqueues = sc->max_queues;
877 	}
878 
879 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
880 
881 	return (0);
882 }
883 
884 static int
885 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
886 	struct nvme_completion* compl)
887 {
888 	int feature = command->cdw10 & 0xFF;
889 	uint32_t iv;
890 
891 	DPRINTF(("%s feature 0x%x", __func__, feature));
892 	compl->cdw0 = 0;
893 
894 	switch (feature) {
895 	case NVME_FEAT_ARBITRATION:
896 		DPRINTF(("  arbitration 0x%x", command->cdw11));
897 		break;
898 	case NVME_FEAT_POWER_MANAGEMENT:
899 		DPRINTF(("  power management 0x%x", command->cdw11));
900 		break;
901 	case NVME_FEAT_LBA_RANGE_TYPE:
902 		DPRINTF(("  lba range 0x%x", command->cdw11));
903 		break;
904 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
905 		DPRINTF(("  temperature threshold 0x%x", command->cdw11));
906 		break;
907 	case NVME_FEAT_ERROR_RECOVERY:
908 		DPRINTF(("  error recovery 0x%x", command->cdw11));
909 		break;
910 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
911 		DPRINTF(("  volatile write cache 0x%x", command->cdw11));
912 		break;
913 	case NVME_FEAT_NUMBER_OF_QUEUES:
914 		nvme_set_feature_queues(sc, command, compl);
915 		break;
916 	case NVME_FEAT_INTERRUPT_COALESCING:
917 		DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));
918 
919 		/* in uS */
920 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
921 
922 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
923 		break;
924 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
925 		iv = command->cdw11 & 0xFFFF;
926 
927 		DPRINTF(("  interrupt vector configuration 0x%x",
928 		        command->cdw11));
929 
930 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
931 			if (sc->compl_queues[i].intr_vec == iv) {
932 				if (command->cdw11 & (1 << 16))
933 					sc->compl_queues[i].intr_en |=
934 					                      NVME_CQ_INTCOAL;
935 				else
936 					sc->compl_queues[i].intr_en &=
937 					                     ~NVME_CQ_INTCOAL;
938 			}
939 		}
940 		break;
941 	case NVME_FEAT_WRITE_ATOMICITY:
942 		DPRINTF(("  write atomicity 0x%x", command->cdw11));
943 		break;
944 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
945 		DPRINTF(("  async event configuration 0x%x",
946 		        command->cdw11));
947 		sc->async_ev_config = command->cdw11;
948 		break;
949 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
950 		DPRINTF(("  software progress marker 0x%x",
951 		        command->cdw11));
952 		break;
953 	case 0x0C:
954 		DPRINTF(("  autonomous power state transition 0x%x",
955 		        command->cdw11));
956 		break;
957 	default:
958 		WPRINTF(("%s invalid feature", __func__));
959 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
960 		return (1);
961 	}
962 
963 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
964 	return (1);
965 }
966 
967 static int
968 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
969 	struct nvme_completion* compl)
970 {
971 	int feature = command->cdw10 & 0xFF;
972 
973 	DPRINTF(("%s feature 0x%x", __func__, feature));
974 
975 	compl->cdw0 = 0;
976 
977 	switch (feature) {
978 	case NVME_FEAT_ARBITRATION:
979 		DPRINTF(("  arbitration"));
980 		break;
981 	case NVME_FEAT_POWER_MANAGEMENT:
982 		DPRINTF(("  power management"));
983 		break;
984 	case NVME_FEAT_LBA_RANGE_TYPE:
985 		DPRINTF(("  lba range"));
986 		break;
987 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
988 		DPRINTF(("  temperature threshold"));
989 		switch ((command->cdw11 >> 20) & 0x3) {
990 		case 0:
991 			/* Over temp threshold */
992 			compl->cdw0 = 0xFFFF;
993 			break;
994 		case 1:
995 			/* Under temp threshold */
996 			compl->cdw0 = 0;
997 			break;
998 		default:
999 			WPRINTF(("  invalid threshold type select"));
1000 			pci_nvme_status_genc(&compl->status,
1001 			    NVME_SC_INVALID_FIELD);
1002 			return (1);
1003 		}
1004 		break;
1005 	case NVME_FEAT_ERROR_RECOVERY:
1006 		DPRINTF(("  error recovery"));
1007 		break;
1008 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1009 		DPRINTF(("  volatile write cache"));
1010 		break;
1011 	case NVME_FEAT_NUMBER_OF_QUEUES:
1012 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1013 
1014 		DPRINTF(("  number of queues (submit %u, completion %u)",
1015 		        compl->cdw0 & 0xFFFF,
1016 		        (compl->cdw0 >> 16) & 0xFFFF));
1017 
1018 		break;
1019 	case NVME_FEAT_INTERRUPT_COALESCING:
1020 		DPRINTF(("  interrupt coalescing"));
1021 		break;
1022 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1023 		DPRINTF(("  interrupt vector configuration"));
1024 		break;
1025 	case NVME_FEAT_WRITE_ATOMICITY:
1026 		DPRINTF(("  write atomicity"));
1027 		break;
1028 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1029 		DPRINTF(("  async event configuration"));
1030 		sc->async_ev_config = command->cdw11;
1031 		break;
1032 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1033 		DPRINTF(("  software progress marker"));
1034 		break;
1035 	case 0x0C:
1036 		DPRINTF(("  autonomous power state transition"));
1037 		break;
1038 	default:
1039 		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
1040 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1041 		return (1);
1042 	}
1043 
1044 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1045 	return (1);
1046 }
1047 
1048 static int
1049 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1050 	struct nvme_completion* compl)
1051 {
1052 	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
1053 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1054 
1055 	/* TODO: search for the command ID and abort it */
1056 
1057 	compl->cdw0 = 1;
1058 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1059 	return (1);
1060 }
1061 
1062 static int
1063 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1064 	struct nvme_command* command, struct nvme_completion* compl)
1065 {
1066 	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
1067 
1068 	/*
1069 	 * TODO: raise events when they happen based on the Set Features cmd.
1070 	 * These events happen async, so only set completion successful if
1071 	 * there is an event reflective of the request to get event.
1072 	 */
1073 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1074 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1075 	return (0);
1076 }
1077 
1078 static void
1079 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1080 {
1081 	struct nvme_completion compl;
1082 	struct nvme_command *cmd;
1083 	struct nvme_submission_queue *sq;
1084 	struct nvme_completion_queue *cq;
1085 	uint16_t sqhead;
1086 
1087 	DPRINTF(("%s index %u", __func__, (uint32_t)value));
1088 
1089 	sq = &sc->submit_queues[0];
1090 	cq = &sc->compl_queues[0];
1091 
1092 	sqhead = atomic_load_acq_short(&sq->head);
1093 
1094 	if (atomic_testandset_int(&sq->busy, 1)) {
1095 		DPRINTF(("%s SQ busy, head %u, tail %u",
1096 		        __func__, sqhead, sq->tail));
1097 		return;
1098 	}
1099 
1100 	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
1101 
1102 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1103 		cmd = &(sq->qbase)[sqhead];
1104 		compl.cdw0 = 0;
1105 		compl.status = 0;
1106 
1107 		switch (cmd->opc) {
1108 		case NVME_OPC_DELETE_IO_SQ:
1109 			DPRINTF(("%s command DELETE_IO_SQ", __func__));
1110 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1111 			break;
1112 		case NVME_OPC_CREATE_IO_SQ:
1113 			DPRINTF(("%s command CREATE_IO_SQ", __func__));
1114 			nvme_opc_create_io_sq(sc, cmd, &compl);
1115 			break;
1116 		case NVME_OPC_DELETE_IO_CQ:
1117 			DPRINTF(("%s command DELETE_IO_CQ", __func__));
1118 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1119 			break;
1120 		case NVME_OPC_CREATE_IO_CQ:
1121 			DPRINTF(("%s command CREATE_IO_CQ", __func__));
1122 			nvme_opc_create_io_cq(sc, cmd, &compl);
1123 			break;
1124 		case NVME_OPC_GET_LOG_PAGE:
1125 			DPRINTF(("%s command GET_LOG_PAGE", __func__));
1126 			nvme_opc_get_log_page(sc, cmd, &compl);
1127 			break;
1128 		case NVME_OPC_IDENTIFY:
1129 			DPRINTF(("%s command IDENTIFY", __func__));
1130 			nvme_opc_identify(sc, cmd, &compl);
1131 			break;
1132 		case NVME_OPC_ABORT:
1133 			DPRINTF(("%s command ABORT", __func__));
1134 			nvme_opc_abort(sc, cmd, &compl);
1135 			break;
1136 		case NVME_OPC_SET_FEATURES:
1137 			DPRINTF(("%s command SET_FEATURES", __func__));
1138 			nvme_opc_set_features(sc, cmd, &compl);
1139 			break;
1140 		case NVME_OPC_GET_FEATURES:
1141 			DPRINTF(("%s command GET_FEATURES", __func__));
1142 			nvme_opc_get_features(sc, cmd, &compl);
1143 			break;
1144 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1145 			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
1146 			/* XXX dont care, unhandled for now
1147 			nvme_opc_async_event_req(sc, cmd, &compl);
1148 			*/
1149 			compl.status = NVME_NO_STATUS;
1150 			break;
1151 		default:
1152 			WPRINTF(("0x%x command is not implemented",
1153 			    cmd->opc));
1154 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1155 		}
1156 		sqhead = (sqhead + 1) % sq->size;
1157 
1158 		if (NVME_COMPLETION_VALID(compl)) {
1159 			struct nvme_completion *cp;
1160 			int phase;
1161 
1162 			cp = &(cq->qbase)[cq->tail];
1163 			cp->cdw0 = compl.cdw0;
1164 			cp->sqid = 0;
1165 			cp->sqhd = sqhead;
1166 			cp->cid = cmd->cid;
1167 
1168 			phase = NVME_STATUS_GET_P(cp->status);
1169 			cp->status = compl.status;
1170 			pci_nvme_toggle_phase(&cp->status, phase);
1171 
1172 			cq->tail = (cq->tail + 1) % cq->size;
1173 		}
1174 	}
1175 
1176 	DPRINTF(("setting sqhead %u", sqhead));
1177 	atomic_store_short(&sq->head, sqhead);
1178 	atomic_store_int(&sq->busy, 0);
1179 
1180 	if (cq->head != cq->tail)
1181 		pci_generate_msix(sc->nsc_pi, 0);
1182 
1183 }
1184 
1185 static int
1186 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1187 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1188 {
1189 	int iovidx;
1190 
1191 	if (req != NULL) {
1192 		/* concatenate contig block-iovs to minimize number of iovs */
1193 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1194 			iovidx = req->io_req.br_iovcnt - 1;
1195 
1196 			req->io_req.br_iov[iovidx].iov_base =
1197 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1198 			                     req->prev_gpaddr, size);
1199 
1200 			req->prev_size += size;
1201 			req->io_req.br_resid += size;
1202 
1203 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1204 		} else {
1205 			pthread_mutex_lock(&req->mtx);
1206 
1207 			iovidx = req->io_req.br_iovcnt;
1208 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1209 				int err = 0;
1210 
1211 				DPRINTF(("large I/O, doing partial req"));
1212 
1213 				iovidx = 0;
1214 				req->io_req.br_iovcnt = 0;
1215 
1216 				req->io_req.br_callback = pci_nvme_io_partial;
1217 
1218 				if (!do_write)
1219 					err = blockif_read(sc->nvstore.ctx,
1220 					                   &req->io_req);
1221 				else
1222 					err = blockif_write(sc->nvstore.ctx,
1223 					                    &req->io_req);
1224 
1225 				/* wait until req completes before cont */
1226 				if (err == 0)
1227 					pthread_cond_wait(&req->cv, &req->mtx);
1228 			}
1229 			if (iovidx == 0) {
1230 				req->io_req.br_offset = lba;
1231 				req->io_req.br_resid = 0;
1232 				req->io_req.br_param = req;
1233 			}
1234 
1235 			req->io_req.br_iov[iovidx].iov_base =
1236 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1237 			                     gpaddr, size);
1238 
1239 			req->io_req.br_iov[iovidx].iov_len = size;
1240 
1241 			req->prev_gpaddr = gpaddr;
1242 			req->prev_size = size;
1243 			req->io_req.br_resid += size;
1244 
1245 			req->io_req.br_iovcnt++;
1246 
1247 			pthread_mutex_unlock(&req->mtx);
1248 		}
1249 	} else {
1250 		/* RAM buffer: read/write directly */
1251 		void *p = sc->nvstore.ctx;
1252 		void *gptr;
1253 
1254 		if ((lba + size) > sc->nvstore.size) {
1255 			WPRINTF(("%s write would overflow RAM", __func__));
1256 			return (-1);
1257 		}
1258 
1259 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1260 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1261 		if (do_write)
1262 			memcpy(p, gptr, size);
1263 		else
1264 			memcpy(gptr, p, size);
1265 	}
1266 	return (0);
1267 }
1268 
1269 static void
1270 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1271 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1272 	uint32_t cdw0, uint16_t status, int ignore_busy)
1273 {
1274 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1275 	struct nvme_completion *compl;
1276 	int phase;
1277 
1278 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1279 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1280 		 NVME_STATUS_GET_SC(status)));
1281 
1282 	pthread_mutex_lock(&cq->mtx);
1283 
1284 	assert(cq->qbase != NULL);
1285 
1286 	compl = &cq->qbase[cq->tail];
1287 
1288 	compl->cdw0 = cdw0;
1289 	compl->sqid = sqid;
1290 	compl->sqhd = atomic_load_acq_short(&sq->head);
1291 	compl->cid = cid;
1292 
1293 	// toggle phase
1294 	phase = NVME_STATUS_GET_P(compl->status);
1295 	compl->status = status;
1296 	pci_nvme_toggle_phase(&compl->status, phase);
1297 
1298 	cq->tail = (cq->tail + 1) % cq->size;
1299 
1300 	pthread_mutex_unlock(&cq->mtx);
1301 
1302 	if (cq->head != cq->tail) {
1303 		if (cq->intr_en & NVME_CQ_INTEN) {
1304 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1305 		} else {
1306 			DPRINTF(("%s: CQ%u interrupt disabled\n",
1307 						__func__, sq->cqid));
1308 		}
1309 	}
1310 }
1311 
1312 static void
1313 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1314 {
1315 	req->sc = NULL;
1316 	req->nvme_sq = NULL;
1317 	req->sqid = 0;
1318 
1319 	pthread_mutex_lock(&sc->mtx);
1320 
1321 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1322 	sc->pending_ios--;
1323 
1324 	/* when no more IO pending, can set to ready if device reset/enabled */
1325 	if (sc->pending_ios == 0 &&
1326 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1327 		sc->regs.csts |= NVME_CSTS_RDY;
1328 
1329 	pthread_mutex_unlock(&sc->mtx);
1330 
1331 	sem_post(&sc->iosemlock);
1332 }
1333 
1334 static struct pci_nvme_ioreq *
1335 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1336 {
1337 	struct pci_nvme_ioreq *req = NULL;;
1338 
1339 	sem_wait(&sc->iosemlock);
1340 	pthread_mutex_lock(&sc->mtx);
1341 
1342 	req = STAILQ_FIRST(&sc->ioreqs_free);
1343 	assert(req != NULL);
1344 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1345 
1346 	req->sc = sc;
1347 
1348 	sc->pending_ios++;
1349 
1350 	pthread_mutex_unlock(&sc->mtx);
1351 
1352 	req->io_req.br_iovcnt = 0;
1353 	req->io_req.br_offset = 0;
1354 	req->io_req.br_resid = 0;
1355 	req->io_req.br_param = req;
1356 	req->prev_gpaddr = 0;
1357 	req->prev_size = 0;
1358 
1359 	return req;
1360 }
1361 
1362 static void
1363 pci_nvme_io_done(struct blockif_req *br, int err)
1364 {
1365 	struct pci_nvme_ioreq *req = br->br_param;
1366 	struct nvme_submission_queue *sq = req->nvme_sq;
1367 	uint16_t code, status;
1368 
1369 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1370 
1371 	/* TODO return correct error */
1372 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1373 	pci_nvme_status_genc(&status, code);
1374 
1375 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1376 	pci_nvme_release_ioreq(req->sc, req);
1377 }
1378 
1379 static void
1380 pci_nvme_io_partial(struct blockif_req *br, int err)
1381 {
1382 	struct pci_nvme_ioreq *req = br->br_param;
1383 
1384 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1385 
1386 	pthread_cond_signal(&req->cv);
1387 }
1388 
1389 static void
1390 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1391 {
1392 	struct pci_nvme_ioreq *req = br->br_param;
1393 	struct pci_nvme_softc *sc = req->sc;
1394 	bool done = true;
1395 	uint16_t status;
1396 
1397 	if (err) {
1398 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1399 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1400 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1401 	} else {
1402 		struct iovec *iov = req->io_req.br_iov;
1403 
1404 		req->prev_gpaddr++;
1405 		iov += req->prev_gpaddr;
1406 
1407 		/* The iov_* values already include the sector size */
1408 		req->io_req.br_offset = (off_t)iov->iov_base;
1409 		req->io_req.br_resid = iov->iov_len;
1410 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1411 			pci_nvme_status_genc(&status,
1412 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1413 		} else
1414 			done = false;
1415 	}
1416 
1417 	if (done) {
1418 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1419 		    req->cid, 0, status, 0);
1420 		pci_nvme_release_ioreq(sc, req);
1421 	}
1422 }
1423 
1424 static int
1425 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1426     struct nvme_command *cmd,
1427     struct pci_nvme_blockstore *nvstore,
1428     struct pci_nvme_ioreq *req,
1429     uint16_t *status)
1430 {
1431 	int err = -1;
1432 
1433 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1434 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1435 		goto out;
1436 	}
1437 
1438 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1439 		struct nvme_dsm_range *range;
1440 		uint32_t nr, r;
1441 		int sectsz = sc->nvstore.sectsz;
1442 
1443 		/*
1444 		 * DSM calls are advisory only, and compliant controllers
1445 		 * may choose to take no actions (i.e. return Success).
1446 		 */
1447 		if (!nvstore->deallocate) {
1448 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1449 			goto out;
1450 		}
1451 
1452 		if (req == NULL) {
1453 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1454 			goto out;
1455 		}
1456 
1457 		/* copy locally because a range entry could straddle PRPs */
1458 		range = calloc(1, NVME_MAX_DSM_TRIM);
1459 		if (range == NULL) {
1460 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1461 			goto out;
1462 		}
1463 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1464 		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1465 
1466 		req->opc = cmd->opc;
1467 		req->cid = cmd->cid;
1468 		req->nsid = cmd->nsid;
1469 		/*
1470 		 * If the request is for more than a single range, store
1471 		 * the ranges in the br_iov. Optimize for the common case
1472 		 * of a single range.
1473 		 *
1474 		 * Note that NVMe Number of Ranges is a zero based value
1475 		 */
1476 		nr = cmd->cdw10 & 0xff;
1477 
1478 		req->io_req.br_iovcnt = 0;
1479 		req->io_req.br_offset = range[0].starting_lba * sectsz;
1480 		req->io_req.br_resid = range[0].length * sectsz;
1481 
1482 		if (nr == 0) {
1483 			req->io_req.br_callback = pci_nvme_io_done;
1484 		} else {
1485 			struct iovec *iov = req->io_req.br_iov;
1486 
1487 			for (r = 0; r <= nr; r++) {
1488 				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1489 				iov[r].iov_len = range[r].length * sectsz;
1490 			}
1491 			req->io_req.br_callback = pci_nvme_dealloc_sm;
1492 
1493 			/*
1494 			 * Use prev_gpaddr to track the current entry and
1495 			 * prev_size to track the number of entries
1496 			 */
1497 			req->prev_gpaddr = 0;
1498 			req->prev_size = r;
1499 		}
1500 
1501 		err = blockif_delete(nvstore->ctx, &req->io_req);
1502 		if (err)
1503 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1504 
1505 		free(range);
1506 	}
1507 out:
1508 	return (err);
1509 }
1510 
1511 static void
1512 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1513 {
1514 	struct nvme_submission_queue *sq;
1515 	uint16_t status;
1516 	uint16_t sqhead;
1517 	int err;
1518 
1519 	/* handle all submissions up to sq->tail index */
1520 	sq = &sc->submit_queues[idx];
1521 
1522 	if (atomic_testandset_int(&sq->busy, 1)) {
1523 		DPRINTF(("%s sqid %u busy", __func__, idx));
1524 		return;
1525 	}
1526 
1527 	sqhead = atomic_load_acq_short(&sq->head);
1528 
1529 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1530 	         idx, sqhead, sq->tail, sq->qbase));
1531 
1532 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1533 		struct nvme_command *cmd;
1534 		struct pci_nvme_ioreq *req = NULL;
1535 		uint64_t lba;
1536 		uint64_t nblocks, bytes, size, cpsz;
1537 
1538 		/* TODO: support scatter gather list handling */
1539 
1540 		cmd = &sq->qbase[sqhead];
1541 		sqhead = (sqhead + 1) % sq->size;
1542 
1543 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1544 
1545 		if (cmd->opc == NVME_OPC_FLUSH) {
1546 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1547 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1548 			                        status, 1);
1549 
1550 			continue;
1551 		} else if (cmd->opc == 0x08) {
1552 			/* TODO: write zeroes */
1553 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1554 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1555 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1556 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1557 			                        status, 1);
1558 
1559 			continue;
1560 		}
1561 
1562 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1563 			req = pci_nvme_get_ioreq(sc);
1564 			req->nvme_sq = sq;
1565 			req->sqid = idx;
1566 		}
1567 
1568 		if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
1569 			if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
1570 			    &status)) {
1571 				pci_nvme_set_completion(sc, sq, idx, cmd->cid,
1572 				    0, status, 1);
1573 				if (req)
1574 					pci_nvme_release_ioreq(sc, req);
1575 			}
1576 			continue;
1577 		}
1578 
1579 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1580 
1581 		bytes = nblocks * sc->nvstore.sectsz;
1582 
1583 		/*
1584 		 * If data starts mid-page and flows into the next page, then
1585 		 * increase page count
1586 		 */
1587 
1588 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1589 		         "(%lu-bytes)",
1590 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1591 		         cmd->opc == NVME_OPC_WRITE ?
1592 			     "WRITE" : "READ",
1593 		         lba, nblocks, bytes));
1594 
1595 		cmd->prp1 &= ~(0x03UL);
1596 		cmd->prp2 &= ~(0x03UL);
1597 
1598 		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1599 
1600 		size = bytes;
1601 		lba *= sc->nvstore.sectsz;
1602 
1603 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1604 
1605 		if (cpsz > bytes)
1606 			cpsz = bytes;
1607 
1608 		if (req != NULL) {
1609 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1610 			                        cmd->cdw10;
1611 			req->opc = cmd->opc;
1612 			req->cid = cmd->cid;
1613 			req->nsid = cmd->nsid;
1614 		}
1615 
1616 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1617 		    cmd->opc == NVME_OPC_WRITE, lba);
1618 		lba += cpsz;
1619 		size -= cpsz;
1620 
1621 		if (size == 0)
1622 			goto iodone;
1623 
1624 		if (size <= PAGE_SIZE) {
1625 			/* prp2 is second (and final) page in transfer */
1626 
1627 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1628 			    size,
1629 			    cmd->opc == NVME_OPC_WRITE,
1630 			    lba);
1631 		} else {
1632 			uint64_t *prp_list;
1633 			int i;
1634 
1635 			/* prp2 is pointer to a physical region page list */
1636 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1637 			                            cmd->prp2, PAGE_SIZE);
1638 
1639 			i = 0;
1640 			while (size != 0) {
1641 				cpsz = MIN(size, PAGE_SIZE);
1642 
1643 				/*
1644 				 * Move to linked physical region page list
1645 				 * in last item.
1646 				 */
1647 				if (i == (NVME_PRP2_ITEMS-1) &&
1648 				    size > PAGE_SIZE) {
1649 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1650 					prp_list = paddr_guest2host(
1651 					              sc->nsc_pi->pi_vmctx,
1652 					              prp_list[i], PAGE_SIZE);
1653 					i = 0;
1654 				}
1655 				if (prp_list[i] == 0) {
1656 					WPRINTF(("PRP2[%d] = 0 !!!", i));
1657 					err = 1;
1658 					break;
1659 				}
1660 
1661 				err = pci_nvme_append_iov_req(sc, req,
1662 				    prp_list[i], cpsz,
1663 				    cmd->opc == NVME_OPC_WRITE, lba);
1664 				if (err)
1665 					break;
1666 
1667 				lba += cpsz;
1668 				size -= cpsz;
1669 				i++;
1670 			}
1671 		}
1672 
1673 iodone:
1674 		if (sc->nvstore.type == NVME_STOR_RAM) {
1675 			uint16_t code, status;
1676 
1677 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1678 			    NVME_SC_SUCCESS;
1679 			pci_nvme_status_genc(&status, code);
1680 
1681 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1682 			                        status, 1);
1683 
1684 			continue;
1685 		}
1686 
1687 
1688 		if (err)
1689 			goto do_error;
1690 
1691 		req->io_req.br_callback = pci_nvme_io_done;
1692 
1693 		err = 0;
1694 		switch (cmd->opc) {
1695 		case NVME_OPC_READ:
1696 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1697 			break;
1698 		case NVME_OPC_WRITE:
1699 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1700 			break;
1701 		default:
1702 			WPRINTF(("%s unhandled io command 0x%x",
1703 				 __func__, cmd->opc));
1704 			err = 1;
1705 		}
1706 
1707 do_error:
1708 		if (err) {
1709 			uint16_t status;
1710 
1711 			pci_nvme_status_genc(&status,
1712 			    NVME_SC_DATA_TRANSFER_ERROR);
1713 
1714 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1715 			                        status, 1);
1716 			pci_nvme_release_ioreq(sc, req);
1717 		}
1718 	}
1719 
1720 	atomic_store_short(&sq->head, sqhead);
1721 	atomic_store_int(&sq->busy, 0);
1722 }
1723 
1724 static void
1725 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1726 	uint64_t idx, int is_sq, uint64_t value)
1727 {
1728 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1729 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1730 
1731 	if (is_sq) {
1732 		atomic_store_short(&sc->submit_queues[idx].tail,
1733 		                   (uint16_t)value);
1734 
1735 		if (idx == 0) {
1736 			pci_nvme_handle_admin_cmd(sc, value);
1737 		} else {
1738 			/* submission queue; handle new entries in SQ */
1739 			if (idx > sc->num_squeues) {
1740 				WPRINTF(("%s SQ index %lu overflow from "
1741 				         "guest (max %u)",
1742 				         __func__, idx, sc->num_squeues));
1743 				return;
1744 			}
1745 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1746 		}
1747 	} else {
1748 		if (idx > sc->num_cqueues) {
1749 			WPRINTF(("%s queue index %lu overflow from "
1750 			         "guest (max %u)",
1751 			         __func__, idx, sc->num_cqueues));
1752 			return;
1753 		}
1754 
1755 		sc->compl_queues[idx].head = (uint16_t)value;
1756 	}
1757 }
1758 
1759 static void
1760 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1761 {
1762 	const char *s = iswrite ? "WRITE" : "READ";
1763 
1764 	switch (offset) {
1765 	case NVME_CR_CAP_LOW:
1766 		DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1767 		break;
1768 	case NVME_CR_CAP_HI:
1769 		DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1770 		break;
1771 	case NVME_CR_VS:
1772 		DPRINTF(("%s %s NVME_CR_VS", func, s));
1773 		break;
1774 	case NVME_CR_INTMS:
1775 		DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1776 		break;
1777 	case NVME_CR_INTMC:
1778 		DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1779 		break;
1780 	case NVME_CR_CC:
1781 		DPRINTF(("%s %s NVME_CR_CC", func, s));
1782 		break;
1783 	case NVME_CR_CSTS:
1784 		DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1785 		break;
1786 	case NVME_CR_NSSR:
1787 		DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1788 		break;
1789 	case NVME_CR_AQA:
1790 		DPRINTF(("%s %s NVME_CR_AQA", func, s));
1791 		break;
1792 	case NVME_CR_ASQ_LOW:
1793 		DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1794 		break;
1795 	case NVME_CR_ASQ_HI:
1796 		DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1797 		break;
1798 	case NVME_CR_ACQ_LOW:
1799 		DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1800 		break;
1801 	case NVME_CR_ACQ_HI:
1802 		DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1803 		break;
1804 	default:
1805 		DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1806 	}
1807 
1808 }
1809 
1810 static void
1811 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1812 	uint64_t offset, int size, uint64_t value)
1813 {
1814 	uint32_t ccreg;
1815 
1816 	if (offset >= NVME_DOORBELL_OFFSET) {
1817 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1818 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
1819 		int is_sq = (belloffset % 8) < 4;
1820 
1821 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1822 			WPRINTF(("guest attempted an overflow write offset "
1823 			         "0x%lx, val 0x%lx in %s",
1824 			         offset, value, __func__));
1825 			return;
1826 		}
1827 
1828 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1829 		return;
1830 	}
1831 
1832 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1833 	        offset, size, value));
1834 
1835 	if (size != 4) {
1836 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1837 		         "val 0x%lx) to bar0 in %s",
1838 		         size, offset, value, __func__));
1839 		/* TODO: shutdown device */
1840 		return;
1841 	}
1842 
1843 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1844 
1845 	pthread_mutex_lock(&sc->mtx);
1846 
1847 	switch (offset) {
1848 	case NVME_CR_CAP_LOW:
1849 	case NVME_CR_CAP_HI:
1850 		/* readonly */
1851 		break;
1852 	case NVME_CR_VS:
1853 		/* readonly */
1854 		break;
1855 	case NVME_CR_INTMS:
1856 		/* MSI-X, so ignore */
1857 		break;
1858 	case NVME_CR_INTMC:
1859 		/* MSI-X, so ignore */
1860 		break;
1861 	case NVME_CR_CC:
1862 		ccreg = (uint32_t)value;
1863 
1864 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1865 		         "iocqes %u",
1866 		        __func__,
1867 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1868 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1869 			 NVME_CC_GET_IOCQES(ccreg)));
1870 
1871 		if (NVME_CC_GET_SHN(ccreg)) {
1872 			/* perform shutdown - flush out data to backend */
1873 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1874 			    NVME_CSTS_REG_SHST_SHIFT);
1875 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1876 			    NVME_CSTS_REG_SHST_SHIFT;
1877 		}
1878 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1879 			if (NVME_CC_GET_EN(ccreg) == 0)
1880 				/* transition 1-> causes controller reset */
1881 				pci_nvme_reset_locked(sc);
1882 			else
1883 				pci_nvme_init_controller(ctx, sc);
1884 		}
1885 
1886 		/* Insert the iocqes, iosqes and en bits from the write */
1887 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1888 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1889 		if (NVME_CC_GET_EN(ccreg) == 0) {
1890 			/* Insert the ams, mps and css bit fields */
1891 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1892 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1893 			sc->regs.csts &= ~NVME_CSTS_RDY;
1894 		} else if (sc->pending_ios == 0) {
1895 			sc->regs.csts |= NVME_CSTS_RDY;
1896 		}
1897 		break;
1898 	case NVME_CR_CSTS:
1899 		break;
1900 	case NVME_CR_NSSR:
1901 		/* ignore writes; don't support subsystem reset */
1902 		break;
1903 	case NVME_CR_AQA:
1904 		sc->regs.aqa = (uint32_t)value;
1905 		break;
1906 	case NVME_CR_ASQ_LOW:
1907 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1908 		               (0xFFFFF000 & value);
1909 		break;
1910 	case NVME_CR_ASQ_HI:
1911 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1912 		               (value << 32);
1913 		break;
1914 	case NVME_CR_ACQ_LOW:
1915 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1916 		               (0xFFFFF000 & value);
1917 		break;
1918 	case NVME_CR_ACQ_HI:
1919 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1920 		               (value << 32);
1921 		break;
1922 	default:
1923 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1924 		         __func__, offset, value, size));
1925 	}
1926 	pthread_mutex_unlock(&sc->mtx);
1927 }
1928 
1929 static void
1930 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1931                 int baridx, uint64_t offset, int size, uint64_t value)
1932 {
1933 	struct pci_nvme_softc* sc = pi->pi_arg;
1934 
1935 	if (baridx == pci_msix_table_bar(pi) ||
1936 	    baridx == pci_msix_pba_bar(pi)) {
1937 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1938 		         " value 0x%lx", baridx, offset, size, value));
1939 
1940 		pci_emul_msix_twrite(pi, offset, size, value);
1941 		return;
1942 	}
1943 
1944 	switch (baridx) {
1945 	case 0:
1946 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1947 		break;
1948 
1949 	default:
1950 		DPRINTF(("%s unknown baridx %d, val 0x%lx",
1951 		         __func__, baridx, value));
1952 	}
1953 }
1954 
1955 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1956 	uint64_t offset, int size)
1957 {
1958 	uint64_t value;
1959 
1960 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1961 
1962 	if (offset < NVME_DOORBELL_OFFSET) {
1963 		void *p = &(sc->regs);
1964 		pthread_mutex_lock(&sc->mtx);
1965 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1966 		pthread_mutex_unlock(&sc->mtx);
1967 	} else {
1968 		value = 0;
1969                 WPRINTF(("pci_nvme: read invalid offset %ld", offset));
1970 	}
1971 
1972 	switch (size) {
1973 	case 1:
1974 		value &= 0xFF;
1975 		break;
1976 	case 2:
1977 		value &= 0xFFFF;
1978 		break;
1979 	case 4:
1980 		value &= 0xFFFFFFFF;
1981 		break;
1982 	}
1983 
1984 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
1985 	         offset, size, (uint32_t)value));
1986 
1987 	return (value);
1988 }
1989 
1990 
1991 
1992 static uint64_t
1993 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1994     uint64_t offset, int size)
1995 {
1996 	struct pci_nvme_softc* sc = pi->pi_arg;
1997 
1998 	if (baridx == pci_msix_table_bar(pi) ||
1999 	    baridx == pci_msix_pba_bar(pi)) {
2000 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2001 		        baridx, offset, size));
2002 
2003 		return pci_emul_msix_tread(pi, offset, size);
2004 	}
2005 
2006 	switch (baridx) {
2007 	case 0:
2008        		return pci_nvme_read_bar_0(sc, offset, size);
2009 
2010 	default:
2011 		DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
2012 	}
2013 
2014 	return (0);
2015 }
2016 
2017 
2018 static int
2019 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2020 {
2021 	char bident[sizeof("XX:X:X")];
2022 	char	*uopt, *xopts, *config;
2023 	uint32_t sectsz;
2024 	int optidx;
2025 
2026 	sc->max_queues = NVME_QUEUES;
2027 	sc->max_qentries = NVME_MAX_QENTRIES;
2028 	sc->ioslots = NVME_IOSLOTS;
2029 	sc->num_squeues = sc->max_queues;
2030 	sc->num_cqueues = sc->max_queues;
2031 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2032 	sectsz = 0;
2033 
2034 	uopt = strdup(opts);
2035 	optidx = 0;
2036 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2037 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2038 	for (xopts = strtok(uopt, ",");
2039 	     xopts != NULL;
2040 	     xopts = strtok(NULL, ",")) {
2041 
2042 		if ((config = strchr(xopts, '=')) != NULL)
2043 			*config++ = '\0';
2044 
2045 		if (!strcmp("maxq", xopts)) {
2046 			sc->max_queues = atoi(config);
2047 		} else if (!strcmp("qsz", xopts)) {
2048 			sc->max_qentries = atoi(config);
2049 		} else if (!strcmp("ioslots", xopts)) {
2050 			sc->ioslots = atoi(config);
2051 		} else if (!strcmp("sectsz", xopts)) {
2052 			sectsz = atoi(config);
2053 		} else if (!strcmp("ser", xopts)) {
2054 			/*
2055 			 * This field indicates the Product Serial Number in
2056 			 * 7-bit ASCII, unused bytes should be space characters.
2057 			 * Ref: NVMe v1.3c.
2058 			 */
2059 			cpywithpad((char *)sc->ctrldata.sn,
2060 			           sizeof(sc->ctrldata.sn), config, ' ');
2061 		} else if (!strcmp("ram", xopts)) {
2062 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
2063 
2064 			sc->nvstore.type = NVME_STOR_RAM;
2065 			sc->nvstore.size = sz * 1024 * 1024;
2066 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2067 			sc->nvstore.sectsz = 4096;
2068 			sc->nvstore.sectsz_bits = 12;
2069 			if (sc->nvstore.ctx == NULL) {
2070 				perror("Unable to allocate RAM");
2071 				free(uopt);
2072 				return (-1);
2073 			}
2074 		} else if (!strcmp("eui64", xopts)) {
2075 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2076 		} else if (!strcmp("dsm", xopts)) {
2077 			if (!strcmp("auto", config))
2078 				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2079 			else if (!strcmp("enable", config))
2080 				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2081 			else if (!strcmp("disable", config))
2082 				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2083 		} else if (optidx == 0) {
2084 			snprintf(bident, sizeof(bident), "%d:%d",
2085 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2086 			sc->nvstore.ctx = blockif_open(xopts, bident);
2087 			if (sc->nvstore.ctx == NULL) {
2088 				perror("Could not open backing file");
2089 				free(uopt);
2090 				return (-1);
2091 			}
2092 			sc->nvstore.type = NVME_STOR_BLOCKIF;
2093 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2094 		} else {
2095 			EPRINTLN("Invalid option %s", xopts);
2096 			free(uopt);
2097 			return (-1);
2098 		}
2099 
2100 		optidx++;
2101 	}
2102 	free(uopt);
2103 
2104 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2105 		EPRINTLN("backing store not specified");
2106 		return (-1);
2107 	}
2108 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2109 		sc->nvstore.sectsz = sectsz;
2110 	else if (sc->nvstore.type != NVME_STOR_RAM)
2111 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2112 	for (sc->nvstore.sectsz_bits = 9;
2113 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2114 	     sc->nvstore.sectsz_bits++);
2115 
2116 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2117 		sc->max_queues = NVME_QUEUES;
2118 
2119 	if (sc->max_qentries <= 0) {
2120 		EPRINTLN("Invalid qsz option");
2121 		return (-1);
2122 	}
2123 	if (sc->ioslots <= 0) {
2124 		EPRINTLN("Invalid ioslots option");
2125 		return (-1);
2126 	}
2127 
2128 	return (0);
2129 }
2130 
2131 static int
2132 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2133 {
2134 	struct pci_nvme_softc *sc;
2135 	uint32_t pci_membar_sz;
2136 	int	error;
2137 
2138 	error = 0;
2139 
2140 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2141 	pi->pi_arg = sc;
2142 	sc->nsc_pi = pi;
2143 
2144 	error = pci_nvme_parse_opts(sc, opts);
2145 	if (error < 0)
2146 		goto done;
2147 	else
2148 		error = 0;
2149 
2150 	STAILQ_INIT(&sc->ioreqs_free);
2151 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2152 	for (int i = 0; i < sc->ioslots; i++) {
2153 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2154 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2155 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2156 	}
2157 	sc->intr_coales_aggr_thresh = 1;
2158 
2159 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2160 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2161 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2162 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2163 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2164 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2165 
2166 	/*
2167 	 * Allocate size of NVMe registers + doorbell space for all queues.
2168 	 *
2169 	 * The specification requires a minimum memory I/O window size of 16K.
2170 	 * The Windows driver will refuse to start a device with a smaller
2171 	 * window.
2172 	 */
2173 	pci_membar_sz = sizeof(struct nvme_registers) +
2174 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2175 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2176 
2177 	DPRINTF(("nvme membar size: %u", pci_membar_sz));
2178 
2179 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2180 	if (error) {
2181 		WPRINTF(("%s pci alloc mem bar failed", __func__));
2182 		goto done;
2183 	}
2184 
2185 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2186 	if (error) {
2187 		WPRINTF(("%s pci add msixcap failed", __func__));
2188 		goto done;
2189 	}
2190 
2191 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2192 	if (error) {
2193 		WPRINTF(("%s pci add Express capability failed", __func__));
2194 		goto done;
2195 	}
2196 
2197 	pthread_mutex_init(&sc->mtx, NULL);
2198 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2199 
2200 	pci_nvme_reset(sc);
2201 	/*
2202 	 * Controller data depends on Namespace data so initialize Namespace
2203 	 * data first.
2204 	 */
2205 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2206 	pci_nvme_init_ctrldata(sc);
2207 	pci_nvme_init_logpages(sc);
2208 
2209 	pci_lintr_request(pi);
2210 
2211 done:
2212 	return (error);
2213 }
2214 
2215 
2216 struct pci_devemu pci_de_nvme = {
2217 	.pe_emu =	"nvme",
2218 	.pe_init =	pci_nvme_init,
2219 	.pe_barwrite =	pci_nvme_write,
2220 	.pe_barread =	pci_nvme_read
2221 };
2222 PCI_EMUL_SET(pci_de_nvme);
2223