xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 06e20d1babecec1f45ffda513f55a8db5f1c0f56)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
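/*
 * Example invocation (hypothetical slot and values; any of the devpath
 * forms listed above may be used):
 *  -s 4,nvme,ram=512,maxq=4,qsz=256,ioslots=16,ser=BHYVE0001
 */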
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86 
87 
88 static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) \
	do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
90 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91 
92 /* defaults; can be overridden */
93 #define	NVME_MSIX_BAR		4
94 
95 #define	NVME_IOSLOTS		8
96 
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN	(1 << 14)
99 
100 #define	NVME_QUEUES		16
101 #define	NVME_MAX_QENTRIES	2048
102 /* Memory Page size Minimum reported in CAP register */
103 #define	NVME_MPSMIN		0
104 /* MPSMIN converted to bytes */
105 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
106 
107 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
108 #define	NVME_MDTS		9
109 /* Note the + 1 allows for the initial descriptor to not be page aligned */
110 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
111 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
112 
113 /* This is a synthetic status code to indicate there is no status */
114 #define NVME_NO_STATUS		0xffff
115 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
116 
117 /* helpers */
118 
119 /* Convert a zero-based value into a one-based value */
120 #define ONE_BASED(zero)		((zero) + 1)
121 /* Convert a one-based value into a zero-based value */
122 #define ZERO_BASED(one)		((one)  - 1)
123 
124 /* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
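/*
 * Worked example: with 4 IO SQs and 2 IO CQs allocated, the encoded value is
 * 0x00010003 (both counts are reported zero-based, NCQA in the upper 16 bits
 * and NSQA in the lower 16 bits).
 */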
128 
129 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
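/* The doorbell array begins at offset 0x1000 of the controller register space */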
130 
131 enum nvme_controller_register_offsets {
132 	NVME_CR_CAP_LOW = 0x00,
133 	NVME_CR_CAP_HI  = 0x04,
134 	NVME_CR_VS      = 0x08,
135 	NVME_CR_INTMS   = 0x0c,
136 	NVME_CR_INTMC   = 0x10,
137 	NVME_CR_CC      = 0x14,
138 	NVME_CR_CSTS    = 0x1c,
139 	NVME_CR_NSSR    = 0x20,
140 	NVME_CR_AQA     = 0x24,
141 	NVME_CR_ASQ_LOW = 0x28,
142 	NVME_CR_ASQ_HI  = 0x2c,
143 	NVME_CR_ACQ_LOW = 0x30,
144 	NVME_CR_ACQ_HI  = 0x34,
145 };
146 
147 enum nvme_cmd_cdw11 {
148 	NVME_CMD_CDW11_PC  = 0x0001,
149 	NVME_CMD_CDW11_IEN = 0x0002,
150 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
151 };
152 
153 enum nvme_copy_dir {
154 	NVME_COPY_TO_PRP,
155 	NVME_COPY_FROM_PRP,
156 };
157 
158 #define	NVME_CQ_INTEN	0x01
159 #define	NVME_CQ_INTCOAL	0x02
160 
161 struct nvme_completion_queue {
162 	struct nvme_completion *qbase;
163 	pthread_mutex_t	mtx;
164 	uint32_t	size;
165 	uint16_t	tail; /* nvme progress */
166 	uint16_t	head; /* guest progress */
167 	uint16_t	intr_vec;
168 	uint32_t	intr_en;
169 };
170 
171 struct nvme_submission_queue {
172 	struct nvme_command *qbase;
173 	pthread_mutex_t	mtx;
174 	uint32_t	size;
175 	uint16_t	head; /* nvme progress */
176 	uint16_t	tail; /* guest progress */
177 	uint16_t	cqid; /* completion queue id */
178 	int		qpriority;
179 };
180 
181 enum nvme_storage_type {
182 	NVME_STOR_BLOCKIF = 0,
183 	NVME_STOR_RAM = 1,
184 };
185 
186 struct pci_nvme_blockstore {
187 	enum nvme_storage_type type;
188 	void		*ctx;
189 	uint64_t	size;
190 	uint32_t	sectsz;
191 	uint32_t	sectsz_bits;
192 	uint64_t	eui64;
193 	uint32_t	deallocate:1;
194 };
195 
196 /*
197  * Calculate the number of additional page descriptors for guest IO requests
198  * based on the advertised Max Data Transfer (MDTS) and given the number of
199  * default iovec's in a struct blockif_req.
200  *
201  * Note the + 1 allows for the initial descriptor to not be page aligned.
202  */
#define MDTS_PAD_SIZE \
	(NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	 0)
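/*
 * Sizing sketch: with NVME_MDTS = 9, NVME_MAX_IOVEC is (1 << 9) + 1 = 513.
 * If BLOCKIF_IOV_MAX were, e.g., 128, MDTS_PAD_SIZE would reserve 385 extra
 * iovec entries in each pci_nvme_ioreq; the actual value depends on the
 * BLOCKIF_IOV_MAX definition in block_if.h.
 */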
207 
208 struct pci_nvme_ioreq {
209 	struct pci_nvme_softc *sc;
210 	STAILQ_ENTRY(pci_nvme_ioreq) link;
211 	struct nvme_submission_queue *nvme_sq;
212 	uint16_t	sqid;
213 
214 	/* command information */
215 	uint16_t	opc;
216 	uint16_t	cid;
217 	uint32_t	nsid;
218 
219 	uint64_t	prev_gpaddr;
220 	size_t		prev_size;
221 	size_t		bytes;
222 
223 	struct blockif_req io_req;
224 
225 	struct iovec	iovpadding[MDTS_PAD_SIZE];
226 };
227 
228 enum nvme_dsm_type {
229 	/* Dataset Management bit in ONCS reflects backing storage capability */
230 	NVME_DATASET_MANAGEMENT_AUTO,
231 	/* Unconditionally set Dataset Management bit in ONCS */
232 	NVME_DATASET_MANAGEMENT_ENABLE,
233 	/* Unconditionally clear Dataset Management bit in ONCS */
234 	NVME_DATASET_MANAGEMENT_DISABLE,
235 };
236 
237 struct pci_nvme_softc;
238 struct nvme_feature_obj;
239 
240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
241     struct nvme_feature_obj *,
242     struct nvme_command *,
243     struct nvme_completion *);
244 
245 struct nvme_feature_obj {
246 	uint32_t	cdw11;
247 	nvme_feature_cb	set;
248 	nvme_feature_cb	get;
249 	bool namespace_specific;
250 };
251 
252 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253 
254 struct pci_nvme_aer {
255 	STAILQ_ENTRY(pci_nvme_aer) link;
256 	uint16_t	cid;	/* Command ID of the submitted AER */
257 };
258 
259 struct pci_nvme_softc {
260 	struct pci_devinst *nsc_pi;
261 
262 	pthread_mutex_t	mtx;
263 
264 	struct nvme_registers regs;
265 
266 	struct nvme_namespace_data  nsdata;
267 	struct nvme_controller_data ctrldata;
268 	struct nvme_error_information_entry err_log;
269 	struct nvme_health_information_page health_log;
270 	struct nvme_firmware_page fw_log;
271 
272 	struct pci_nvme_blockstore nvstore;
273 
274 	uint16_t	max_qentries;	/* max entries per queue */
275 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
276 	uint32_t	num_cqueues;
277 	uint32_t	num_squeues;
278 	bool		num_q_is_set; /* Has host set Number of Queues */
279 
280 	struct pci_nvme_ioreq *ioreqs;
281 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
282 	uint32_t	pending_ios;
283 	uint32_t	ioslots;
284 	sem_t		iosemlock;
285 
286 	/*
287 	 * Memory mapped Submission and Completion queues
288 	 * Each array includes both Admin and IO queues
289 	 */
290 	struct nvme_completion_queue *compl_queues;
291 	struct nvme_submission_queue *submit_queues;
292 
293 	struct nvme_feature_obj feat[NVME_FID_MAX];
294 
295 	enum nvme_dsm_type dataset_management;
296 
297 	/* Accounting for SMART data */
298 	__uint128_t	read_data_units;
299 	__uint128_t	write_data_units;
300 	__uint128_t	read_commands;
301 	__uint128_t	write_commands;
302 	uint32_t	read_dunits_remainder;
303 	uint32_t	write_dunits_remainder;
304 
305 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
306 	uint32_t	aer_count;
307 };
308 
309 
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313 
314 /* Controller Configuration utils */
315 #define	NVME_CC_GET_EN(cc) \
316 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
317 #define	NVME_CC_GET_CSS(cc) \
318 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
319 #define	NVME_CC_GET_SHN(cc) \
320 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
321 #define	NVME_CC_GET_IOSQES(cc) \
322 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
323 #define	NVME_CC_GET_IOCQES(cc) \
324 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
325 
326 #define	NVME_CC_WRITE_MASK \
327 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
328 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
329 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
330 
331 #define	NVME_CC_NEN_WRITE_MASK \
332 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
333 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
334 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
335 
336 /* Controller Status utils */
337 #define	NVME_CSTS_GET_RDY(sts) \
338 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
339 
340 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
341 
342 /* Completion Queue status word utils */
343 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
344 #define	NVME_STATUS_MASK \
345 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
346 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
347 
348 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
349 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350 
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363 
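/*
 * Copy 'src' into a fixed-size Identify string field, padding the remainder
 * with 'pad' (typically spaces) rather than NUL-terminating, as the NVMe
 * Identify data structure string fields require.
 */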
364 static __inline void
365 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
366 {
367 	size_t len;
368 
369 	len = strnlen(src, dst_size);
370 	memset(dst, pad, dst_size);
371 	memcpy(dst, src, len);
372 }
373 
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377 
378 	*status &= ~NVME_STATUS_MASK;
379 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382 
383 static __inline void
384 pci_nvme_status_genc(uint16_t *status, uint16_t code)
385 {
386 
387 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
388 }
389 
390 /*
 * Initialize the requested number of IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397 	uint32_t i;
398 
399 	/*
400 	 * Allocate and initialize the Submission Queues
401 	 */
402 	if (nsq > NVME_QUEUES) {
403 		WPRINTF("%s: clamping number of SQ from %u to %u",
404 					__func__, nsq, NVME_QUEUES);
405 		nsq = NVME_QUEUES;
406 	}
407 
408 	sc->num_squeues = nsq;
409 
410 	sc->submit_queues = calloc(sc->num_squeues + 1,
411 				sizeof(struct nvme_submission_queue));
412 	if (sc->submit_queues == NULL) {
413 		WPRINTF("%s: SQ allocation failed", __func__);
414 		sc->num_squeues = 0;
415 	} else {
416 		struct nvme_submission_queue *sq = sc->submit_queues;
417 
418 		for (i = 0; i < sc->num_squeues; i++)
419 			pthread_mutex_init(&sq[i].mtx, NULL);
420 	}
421 
422 	/*
423 	 * Allocate and initialize the Completion Queues
424 	 */
425 	if (ncq > NVME_QUEUES) {
426 		WPRINTF("%s: clamping number of CQ from %u to %u",
427 					__func__, ncq, NVME_QUEUES);
428 		ncq = NVME_QUEUES;
429 	}
430 
431 	sc->num_cqueues = ncq;
432 
433 	sc->compl_queues = calloc(sc->num_cqueues + 1,
434 				sizeof(struct nvme_completion_queue));
435 	if (sc->compl_queues == NULL) {
436 		WPRINTF("%s: CQ allocation failed", __func__);
437 		sc->num_cqueues = 0;
438 	} else {
439 		struct nvme_completion_queue *cq = sc->compl_queues;
440 
441 		for (i = 0; i < sc->num_cqueues; i++)
442 			pthread_mutex_init(&cq[i].mtx, NULL);
443 	}
444 }
445 
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449 	struct nvme_controller_data *cd = &sc->ctrldata;
450 
451 	cd->vid = 0xFB5D;
452 	cd->ssvid = 0x0000;
453 
454 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456 
457 	/* Num of submission commands that we can handle at a time (2^rab) */
458 	cd->rab   = 4;
459 
460 	/* FreeBSD OUI */
461 	cd->ieee[0] = 0x58;
462 	cd->ieee[1] = 0x9c;
463 	cd->ieee[2] = 0xfc;
464 
465 	cd->mic = 0;
466 
467 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
468 
469 	cd->ver = 0x00010300;
470 
471 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472 	cd->acl = 2;
473 	cd->aerl = 4;
474 
475 	/* Advertise 1, Read-only firmware slot */
476 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
479 	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */
481 
482 	/* Warning Composite Temperature Threshold */
483 	cd->wctemp = 0x0157;
484 
485 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489 	cd->nn = 1;	/* number of namespaces */
490 
491 	cd->oncs = 0;
492 	switch (sc->dataset_management) {
493 	case NVME_DATASET_MANAGEMENT_AUTO:
494 		if (sc->nvstore.deallocate)
495 			cd->oncs |= NVME_ONCS_DSM;
496 		break;
497 	case NVME_DATASET_MANAGEMENT_ENABLE:
498 		cd->oncs |= NVME_ONCS_DSM;
499 		break;
500 	default:
501 		break;
502 	}
503 
504 	cd->fna = 0x03;
505 
506 	cd->power_state[0].mp = 10;
507 }
508 
509 /*
510  * Calculate the CRC-16 of the given buffer
511  * See copyright attribution at top of file
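 * Used here to hash the VM name and PCI address into a default EUI-64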
512  */
513 static uint16_t
514 crc16(uint16_t crc, const void *buffer, unsigned int len)
515 {
516 	const unsigned char *cp = buffer;
517 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
518 	static uint16_t const crc16_table[256] = {
519 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
520 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
521 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
522 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
523 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
524 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
525 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
526 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
527 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
528 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
529 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
530 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
531 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
532 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
533 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
534 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
535 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
536 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
537 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
538 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
539 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
540 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
541 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
542 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
543 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
544 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
545 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
546 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
547 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
548 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
549 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
550 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
551 	};
552 
553 	while (len--)
554 		crc = (((crc >> 8) & 0xffU) ^
555 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
556 	return crc;
557 }
558 
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564 
565 	/* Get capacity and block size information from backing store */
566 	nd->nsze = nvstore->size / nvstore->sectsz;
567 	nd->ncap = nd->nsze;
568 	nd->nuse = nd->nsze;
569 
570 	if (nvstore->type == NVME_STOR_BLOCKIF)
571 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
572 
573 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
574 	nd->flbas = 0;
575 
576 	/* Create an EUI-64 if user did not provide one */
577 	if (nvstore->eui64 == 0) {
578 		char *data = NULL;
579 		uint64_t eui64 = nvstore->eui64;
580 
581 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583 
584 		if (data != NULL) {
585 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586 			free(data);
587 		}
588 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589 	}
590 	be64enc(nd->eui64, nvstore->eui64);
591 
592 	/* LBA data-sz = 2^lbads */
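	/* e.g. a 512 byte sector size gives sectsz_bits == 9, so LBADS == 9 */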
593 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595 
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599 
600 	memset(&sc->err_log, 0, sizeof(sc->err_log));
601 	memset(&sc->health_log, 0, sizeof(sc->health_log));
602 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603 
604 	/* Set read/write remainder to round up according to spec */
605 	sc->read_dunits_remainder = 999;
606 	sc->write_dunits_remainder = 999;
607 
608 	/* Set nominal Health values checked by implementations */
609 	sc->health_log.temperature = 310;
610 	sc->health_log.available_spare = 100;
611 	sc->health_log.available_spare_threshold = 10;
612 }
613 
614 static void
615 pci_nvme_init_features(struct pci_nvme_softc *sc)
616 {
617 
618 	sc->feat[0].set = nvme_feature_invalid_cb;
619 	sc->feat[0].get = nvme_feature_invalid_cb;
620 
621 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
622 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
623 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
624 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
625 	    nvme_feature_iv_config;
626 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
627 	    nvme_feature_invalid_cb;
628 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
629 	    nvme_feature_invalid_cb;
630 }
631 
632 static void
633 pci_nvme_aer_init(struct pci_nvme_softc *sc)
634 {
635 
636 	STAILQ_INIT(&sc->aer_list);
637 	sc->aer_count = 0;
638 }
639 
640 static void
641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
642 {
643 	struct pci_nvme_aer *aer = NULL;
644 
645 	while (!STAILQ_EMPTY(&sc->aer_list)) {
646 		aer = STAILQ_FIRST(&sc->aer_list);
647 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
648 		free(aer);
649 	}
650 
651 	pci_nvme_aer_init(sc);
652 }
653 
654 static bool
655 pci_nvme_aer_available(struct pci_nvme_softc *sc)
656 {
657 
658 	return (!STAILQ_EMPTY(&sc->aer_list));
659 }
660 
661 static bool
662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
663 {
664 	struct nvme_controller_data *cd = &sc->ctrldata;
665 
	/* AERL is a zero-based value while aer_count is one-based */
667 	return (sc->aer_count == (cd->aerl + 1));
668 }
669 
670 /*
671  * Add an Async Event Request
672  *
673  * Stores an AER to be returned later if the Controller needs to notify the
674  * host of an event.
675  * Note that while the NVMe spec doesn't require Controllers to return AER's
676  * in order, this implementation does preserve the order.
677  */
678 static int
679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
680 {
681 	struct pci_nvme_aer *aer = NULL;
682 
683 	if (pci_nvme_aer_limit_reached(sc))
684 		return (-1);
685 
686 	aer = calloc(1, sizeof(struct pci_nvme_aer));
687 	if (aer == NULL)
688 		return (-1);
689 
690 	sc->aer_count++;
691 
692 	/* Save the Command ID for use in the completion message */
693 	aer->cid = cid;
694 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
695 
696 	return (0);
697 }
698 
699 /*
700  * Get an Async Event Request structure
701  *
702  * Returns a pointer to an AER previously submitted by the host or NULL if
703  * no AER's exist. Caller is responsible for freeing the returned struct.
704  */
705 static struct pci_nvme_aer *
706 pci_nvme_aer_get(struct pci_nvme_softc *sc)
707 {
708 	struct pci_nvme_aer *aer = NULL;
709 
710 	aer = STAILQ_FIRST(&sc->aer_list);
711 	if (aer != NULL) {
712 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
713 		sc->aer_count--;
714 	}
715 
716 	return (aer);
717 }
718 
719 static void
720 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
721 {
722 	uint32_t i;
723 
724 	DPRINTF("%s", __func__);
725 
726 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
727 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
728 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
729 
730 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
731 
732 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
733 
734 	sc->regs.cc = 0;
735 	sc->regs.csts = 0;
736 
737 	assert(sc->submit_queues != NULL);
738 
739 	for (i = 0; i < sc->num_squeues + 1; i++) {
740 		sc->submit_queues[i].qbase = NULL;
741 		sc->submit_queues[i].size = 0;
742 		sc->submit_queues[i].cqid = 0;
743 		sc->submit_queues[i].tail = 0;
744 		sc->submit_queues[i].head = 0;
745 	}
746 
747 	assert(sc->compl_queues != NULL);
748 
749 	for (i = 0; i < sc->num_cqueues + 1; i++) {
750 		sc->compl_queues[i].qbase = NULL;
751 		sc->compl_queues[i].size = 0;
752 		sc->compl_queues[i].tail = 0;
753 		sc->compl_queues[i].head = 0;
754 	}
755 
756 	sc->num_q_is_set = false;
757 
758 	pci_nvme_aer_destroy(sc);
759 }
760 
761 static void
762 pci_nvme_reset(struct pci_nvme_softc *sc)
763 {
764 	pthread_mutex_lock(&sc->mtx);
765 	pci_nvme_reset_locked(sc);
766 	pthread_mutex_unlock(&sc->mtx);
767 }
768 
769 static void
770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
771 {
772 	uint16_t acqs, asqs;
773 
774 	DPRINTF("%s", __func__);
775 
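	/* ASQS and ACQS in the AQA register are zero-based values, hence the + 1 */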
776 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
777 	sc->submit_queues[0].size = asqs;
778 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
779 	            sizeof(struct nvme_command) * asqs);
780 
781 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
782 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
783 
784 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
785 	    NVME_AQA_REG_ACQS_MASK) + 1;
786 	sc->compl_queues[0].size = acqs;
787 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
788 	         sizeof(struct nvme_completion) * acqs);
789 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
790 
791 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
792 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
793 }
794 
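/*
 * Copy data to or from a guest buffer described by PRP1 and PRP2.
 * Only the simple two-entry PRP case is handled (no PRP lists), so transfers
 * are capped at 8KiB; callers use this for small admin payloads such as
 * Identify data and log pages.
 */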
795 static int
796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
797 	size_t len, enum nvme_copy_dir dir)
798 {
799 	uint8_t *p;
800 	size_t bytes;
801 
802 	if (len > (8 * 1024)) {
803 		return (-1);
804 	}
805 
806 	/* Copy from the start of prp1 to the end of the physical page */
807 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
808 	bytes = MIN(bytes, len);
809 
810 	p = vm_map_gpa(ctx, prp1, bytes);
811 	if (p == NULL) {
812 		return (-1);
813 	}
814 
815 	if (dir == NVME_COPY_TO_PRP)
816 		memcpy(p, b, bytes);
817 	else
818 		memcpy(b, p, bytes);
819 
820 	b += bytes;
821 
822 	len -= bytes;
823 	if (len == 0) {
824 		return (0);
825 	}
826 
827 	len = MIN(len, PAGE_SIZE);
828 
829 	p = vm_map_gpa(ctx, prp2, len);
830 	if (p == NULL) {
831 		return (-1);
832 	}
833 
834 	if (dir == NVME_COPY_TO_PRP)
835 		memcpy(p, b, len);
836 	else
837 		memcpy(b, p, len);
838 
839 	return (0);
840 }
841 
842 /*
 * Post a Completion Queue Entry
 *
 * Write the completion into the CQ and advance the controller's tail index
846  */
847 static void
848 pci_nvme_cq_update(struct pci_nvme_softc *sc,
849 		struct nvme_completion_queue *cq,
850 		uint32_t cdw0,
851 		uint16_t cid,
852 		uint16_t sqid,
853 		uint16_t status)
854 {
855 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
856 	struct nvme_completion *cqe;
857 
858 	assert(cq->qbase != NULL);
859 
860 	pthread_mutex_lock(&cq->mtx);
861 
862 	cqe = &cq->qbase[cq->tail];
863 
864 	/* Flip the phase bit */
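	/*
	 * The host detects new completions by comparing P against the phase
	 * it expects; the expected phase inverts on each wrap of the queue.
	 */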
865 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
866 
867 	cqe->cdw0 = cdw0;
868 	cqe->sqhd = sq->head;
869 	cqe->sqid = sqid;
870 	cqe->cid = cid;
871 	cqe->status = status;
872 
873 	cq->tail++;
874 	if (cq->tail >= cq->size) {
875 		cq->tail = 0;
876 	}
877 
878 	pthread_mutex_unlock(&cq->mtx);
879 }
880 
881 static int
882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
883 	struct nvme_completion* compl)
884 {
885 	uint16_t qid = command->cdw10 & 0xffff;
886 
887 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
888 	if (qid == 0 || qid > sc->num_squeues ||
889 	    (sc->submit_queues[qid].qbase == NULL)) {
890 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
891 		        __func__, qid, sc->num_squeues);
892 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
893 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
894 		return (1);
895 	}
896 
897 	sc->submit_queues[qid].qbase = NULL;
898 	sc->submit_queues[qid].cqid = 0;
899 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
900 	return (1);
901 }
902 
903 static int
904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
905 	struct nvme_completion* compl)
906 {
907 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
908 		uint16_t qid = command->cdw10 & 0xffff;
909 		struct nvme_submission_queue *nsq;
910 
911 		if ((qid == 0) || (qid > sc->num_squeues) ||
912 		    (sc->submit_queues[qid].qbase != NULL)) {
913 			WPRINTF("%s queue index %u > num_squeues %u",
914 			        __func__, qid, sc->num_squeues);
915 			pci_nvme_status_tc(&compl->status,
916 			    NVME_SCT_COMMAND_SPECIFIC,
917 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
918 			return (1);
919 		}
920 
921 		nsq = &sc->submit_queues[qid];
922 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
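		/*
		 * e.g. a zero-based QSIZE of 0x00ff in CDW10[31:16] becomes a
		 * 256 entry queue after the ONE_BASED() conversion.
		 */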
923 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
924 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
925 			/*
926 			 * Queues must specify at least two entries
927 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
928 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
929 			 */
930 			pci_nvme_status_tc(&compl->status,
931 			    NVME_SCT_COMMAND_SPECIFIC,
932 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
933 			return (1);
934 		}
935 		nsq->head = nsq->tail = 0;
936 
937 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
938 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
939 			pci_nvme_status_tc(&compl->status,
940 			    NVME_SCT_COMMAND_SPECIFIC,
941 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
942 			return (1);
943 		}
944 
945 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
946 			pci_nvme_status_tc(&compl->status,
947 			    NVME_SCT_COMMAND_SPECIFIC,
948 			    NVME_SC_COMPLETION_QUEUE_INVALID);
949 			return (1);
950 		}
951 
952 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
953 
954 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
955 		              sizeof(struct nvme_command) * (size_t)nsq->size);
956 
957 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
958 		        qid, nsq->size, nsq->qbase, nsq->cqid);
959 
960 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
961 
962 		DPRINTF("%s completed creating IOSQ qid %u",
963 		         __func__, qid);
964 	} else {
965 		/*
		 * Guest sent a non-contiguous submission queue request.
967 		 * This setting is unsupported by this emulation.
968 		 */
969 		WPRINTF("%s unsupported non-contig (list-based) "
970 		         "create i/o submission queue", __func__);
971 
972 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
973 	}
974 	return (1);
975 }
976 
977 static int
978 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
979 	struct nvme_completion* compl)
980 {
981 	uint16_t qid = command->cdw10 & 0xffff;
982 	uint16_t sqid;
983 
984 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
985 	if (qid == 0 || qid > sc->num_cqueues ||
986 	    (sc->compl_queues[qid].qbase == NULL)) {
987 		WPRINTF("%s queue index %u / num_cqueues %u",
988 		        __func__, qid, sc->num_cqueues);
989 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
990 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
991 		return (1);
992 	}
993 
994 	/* Deleting an Active CQ is an error */
995 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
996 		if (sc->submit_queues[sqid].cqid == qid) {
997 			pci_nvme_status_tc(&compl->status,
998 			    NVME_SCT_COMMAND_SPECIFIC,
999 			    NVME_SC_INVALID_QUEUE_DELETION);
1000 			return (1);
1001 		}
1002 
1003 	sc->compl_queues[qid].qbase = NULL;
1004 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1005 	return (1);
1006 }
1007 
1008 static int
1009 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1010 	struct nvme_completion* compl)
1011 {
1012 	struct nvme_completion_queue *ncq;
1013 	uint16_t qid = command->cdw10 & 0xffff;
1014 
1015 	/* Only support Physically Contiguous queues */
1016 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1017 		WPRINTF("%s unsupported non-contig (list-based) "
1018 		         "create i/o completion queue",
1019 		         __func__);
1020 
1021 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1022 		return (1);
1023 	}
1024 
1025 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1026 	    (sc->compl_queues[qid].qbase != NULL)) {
1027 		WPRINTF("%s queue index %u > num_cqueues %u",
1028 			__func__, qid, sc->num_cqueues);
1029 		pci_nvme_status_tc(&compl->status,
1030 		    NVME_SCT_COMMAND_SPECIFIC,
1031 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1032 		return (1);
	}
1034 
1035 	ncq = &sc->compl_queues[qid];
1036 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1037 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1038 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1039 		pci_nvme_status_tc(&compl->status,
1040 		    NVME_SCT_COMMAND_SPECIFIC,
1041 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1042 		return (1);
1043 	}
1044 
1045 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1046 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1047 		/*
1048 		 * Queues must specify at least two entries
1049 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1050 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1051 		 */
1052 		pci_nvme_status_tc(&compl->status,
1053 		    NVME_SCT_COMMAND_SPECIFIC,
1054 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1055 		return (1);
1056 	}
1057 	ncq->head = ncq->tail = 0;
1058 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1059 		     command->prp1,
1060 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1061 
1062 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1063 
1064 
1065 	return (1);
1066 }
1067 
1068 static int
1069 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1070 	struct nvme_completion* compl)
1071 {
1072 	uint32_t logsize;
1073 	uint8_t logpage = command->cdw10 & 0xFF;
1074 
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
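	/*
	 * e.g. NUMDL == 0x003f with NUMDU == 0 requests 64 dwords, i.e. a
	 * 256 byte transfer.
	 */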
1085 
1086 	switch (logpage) {
1087 	case NVME_LOG_ERROR:
1088 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1089 		    command->prp2, (uint8_t *)&sc->err_log,
1090 		    MIN(logsize, sizeof(sc->err_log)),
1091 		    NVME_COPY_TO_PRP);
1092 		break;
1093 	case NVME_LOG_HEALTH_INFORMATION:
1094 		pthread_mutex_lock(&sc->mtx);
1095 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1096 		    sizeof(sc->health_log.data_units_read));
1097 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1098 		    sizeof(sc->health_log.data_units_written));
1099 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1100 		    sizeof(sc->health_log.host_read_commands));
1101 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1102 		    sizeof(sc->health_log.host_write_commands));
1103 		pthread_mutex_unlock(&sc->mtx);
1104 
1105 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1106 		    command->prp2, (uint8_t *)&sc->health_log,
1107 		    MIN(logsize, sizeof(sc->health_log)),
1108 		    NVME_COPY_TO_PRP);
1109 		break;
1110 	case NVME_LOG_FIRMWARE_SLOT:
1111 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1112 		    command->prp2, (uint8_t *)&sc->fw_log,
1113 		    MIN(logsize, sizeof(sc->fw_log)),
1114 		    NVME_COPY_TO_PRP);
1115 		break;
1116 	default:
1117 		DPRINTF("%s get log page %x command not supported",
1118 		        __func__, logpage);
1119 
1120 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1121 		    NVME_SC_INVALID_LOG_PAGE);
1122 	}
1123 
1124 	return (1);
1125 }
1126 
1127 static int
1128 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1129 	struct nvme_completion* compl)
1130 {
1131 	void *dest;
1132 	uint16_t status;
1133 
1134 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1135 	        command->cdw10 & 0xFF, command->nsid);
1136 
1137 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1138 
1139 	switch (command->cdw10 & 0xFF) {
1140 	case 0x00: /* return Identify Namespace data structure */
1141 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1142 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1143 		    NVME_COPY_TO_PRP);
1144 		break;
1145 	case 0x01: /* return Identify Controller data structure */
1146 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1147 		    command->prp2, (uint8_t *)&sc->ctrldata,
1148 		    sizeof(sc->ctrldata),
1149 		    NVME_COPY_TO_PRP);
1150 		break;
1151 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1152 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1153 		                  sizeof(uint32_t) * 1024);
1154 		/* All unused entries shall be zero */
1155 		bzero(dest, sizeof(uint32_t) * 1024);
1156 		((uint32_t *)dest)[0] = 1;
1157 		break;
1158 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1159 		if (command->nsid != 1) {
1160 			pci_nvme_status_genc(&status,
1161 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1162 			break;
1163 		}
1164 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1165 		                  sizeof(uint32_t) * 1024);
1166 		/* All bytes after the descriptor shall be zero */
1167 		bzero(dest, sizeof(uint32_t) * 1024);
1168 
1169 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1170 		((uint8_t *)dest)[0] = 1;
1171 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1172 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1173 		break;
1174 	default:
1175 		DPRINTF("%s unsupported identify command requested 0x%x",
1176 		         __func__, command->cdw10 & 0xFF);
1177 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1178 		break;
1179 	}
1180 
1181 	compl->status = status;
1182 	return (1);
1183 }
1184 
1185 static const char *
1186 nvme_fid_to_name(uint8_t fid)
1187 {
1188 	const char *name;
1189 
1190 	switch (fid) {
1191 	case NVME_FEAT_ARBITRATION:
1192 		name = "Arbitration";
1193 		break;
1194 	case NVME_FEAT_POWER_MANAGEMENT:
1195 		name = "Power Management";
1196 		break;
1197 	case NVME_FEAT_LBA_RANGE_TYPE:
1198 		name = "LBA Range Type";
1199 		break;
1200 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1201 		name = "Temperature Threshold";
1202 		break;
1203 	case NVME_FEAT_ERROR_RECOVERY:
1204 		name = "Error Recovery";
1205 		break;
1206 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1207 		name = "Volatile Write Cache";
1208 		break;
1209 	case NVME_FEAT_NUMBER_OF_QUEUES:
1210 		name = "Number of Queues";
1211 		break;
1212 	case NVME_FEAT_INTERRUPT_COALESCING:
1213 		name = "Interrupt Coalescing";
1214 		break;
1215 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1216 		name = "Interrupt Vector Configuration";
1217 		break;
1218 	case NVME_FEAT_WRITE_ATOMICITY:
1219 		name = "Write Atomicity Normal";
1220 		break;
1221 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1222 		name = "Asynchronous Event Configuration";
1223 		break;
1224 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1225 		name = "Autonomous Power State Transition";
1226 		break;
1227 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1228 		name = "Host Memory Buffer";
1229 		break;
1230 	case NVME_FEAT_TIMESTAMP:
1231 		name = "Timestamp";
1232 		break;
1233 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1234 		name = "Keep Alive Timer";
1235 		break;
1236 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1237 		name = "Host Controlled Thermal Management";
1238 		break;
1239 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
1241 		break;
1242 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1243 		name = "Read Recovery Level Config";
1244 		break;
1245 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1246 		name = "Predictable Latency Mode Config";
1247 		break;
1248 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1249 		name = "Predictable Latency Mode Window";
1250 		break;
1251 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1252 		name = "LBA Status Information Report Interval";
1253 		break;
1254 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1255 		name = "Host Behavior Support";
1256 		break;
1257 	case NVME_FEAT_SANITIZE_CONFIG:
1258 		name = "Sanitize Config";
1259 		break;
1260 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1261 		name = "Endurance Group Event Configuration";
1262 		break;
1263 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1264 		name = "Software Progress Marker";
1265 		break;
1266 	case NVME_FEAT_HOST_IDENTIFIER:
1267 		name = "Host Identifier";
1268 		break;
1269 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1270 		name = "Reservation Notification Mask";
1271 		break;
1272 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1273 		name = "Reservation Persistence";
1274 		break;
1275 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1276 		name = "Namespace Write Protection Config";
1277 		break;
1278 	default:
1279 		name = "Unknown";
1280 		break;
1281 	}
1282 
1283 	return (name);
1284 }
1285 
1286 static void
1287 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1288     struct nvme_feature_obj *feat,
1289     struct nvme_command *command,
1290     struct nvme_completion *compl)
1291 {
1292 
1293 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1294 }
1295 
1296 static void
1297 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1298     struct nvme_feature_obj *feat,
1299     struct nvme_command *command,
1300     struct nvme_completion *compl)
1301 {
1302 	uint32_t i;
1303 	uint32_t cdw11 = command->cdw11;
1304 	uint16_t iv;
1305 	bool cd;
1306 
1307 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1308 
1309 	iv = cdw11 & 0xffff;
1310 	cd = cdw11 & (1 << 16);
1311 
1312 	if (iv > (sc->max_queues + 1)) {
1313 		return;
1314 	}
1315 
1316 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1317 	if ((iv == 0) && !cd)
1318 		return;
1319 
1320 	/* Requested Interrupt Vector must be used by a CQ */
1321 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1322 		if (sc->compl_queues[i].intr_vec == iv) {
1323 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1324 		}
1325 	}
1326 
1327 }
1328 
1329 static void
1330 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1331     struct nvme_feature_obj *feat,
1332     struct nvme_command *command,
1333     struct nvme_completion *compl)
1334 {
1335 	uint16_t nqr;	/* Number of Queues Requested */
1336 
1337 	if (sc->num_q_is_set) {
1338 		WPRINTF("%s: Number of Queues already set", __func__);
1339 		pci_nvme_status_genc(&compl->status,
1340 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1341 		return;
1342 	}
1343 
1344 	nqr = command->cdw11 & 0xFFFF;
1345 	if (nqr == 0xffff) {
1346 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1347 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1348 		return;
1349 	}
1350 
1351 	sc->num_squeues = ONE_BASED(nqr);
1352 	if (sc->num_squeues > sc->max_queues) {
1353 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1354 					sc->max_queues);
1355 		sc->num_squeues = sc->max_queues;
1356 	}
1357 
1358 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1359 	if (nqr == 0xffff) {
1360 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1361 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1362 		return;
1363 	}
1364 
1365 	sc->num_cqueues = ONE_BASED(nqr);
1366 	if (sc->num_cqueues > sc->max_queues) {
1367 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1368 					sc->max_queues);
1369 		sc->num_cqueues = sc->max_queues;
1370 	}
1371 
1372 	/* Patch the command value which will be saved on callback's return */
1373 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1374 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1375 
1376 	sc->num_q_is_set = true;
1377 }
1378 
1379 static int
1380 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1381 	struct nvme_completion *compl)
1382 {
1383 	struct nvme_feature_obj *feat;
1384 	uint32_t nsid = command->nsid;
1385 	uint8_t fid = command->cdw10 & 0xFF;
1386 
1387 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1388 
1389 	if (fid >= NVME_FID_MAX) {
1390 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1391 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1392 		return (1);
1393 	}
1394 	feat = &sc->feat[fid];
1395 
1396 	if (!feat->namespace_specific &&
1397 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1398 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1399 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1400 		return (1);
1401 	}
1402 
1403 	compl->cdw0 = 0;
1404 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1405 
1406 	if (feat->set)
1407 		feat->set(sc, feat, command, compl);
1408 
1409 	if (compl->status == NVME_SC_SUCCESS)
1410 		feat->cdw11 = command->cdw11;
1411 
1412 	return (0);
1413 }
1414 
1415 static int
1416 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1417 	struct nvme_completion* compl)
1418 {
1419 	struct nvme_feature_obj *feat;
1420 	uint8_t fid = command->cdw10 & 0xFF;
1421 
1422 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1423 
1424 	if (fid >= NVME_FID_MAX) {
1425 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1426 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1427 		return (1);
1428 	}
1429 
1430 	compl->cdw0 = 0;
1431 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1432 
1433 	feat = &sc->feat[fid];
1434 	if (feat->get) {
1435 		feat->get(sc, feat, command, compl);
1436 	}
1437 
1438 	if (compl->status == NVME_SC_SUCCESS) {
1439 		compl->cdw0 = feat->cdw11;
1440 	}
1441 
1442 	return (0);
1443 }
1444 
1445 static int
1446 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1447 	struct nvme_completion* compl)
1448 {
1449 	uint8_t	ses, lbaf, pi;
1450 
1451 	/* Only supports Secure Erase Setting - User Data Erase */
1452 	ses = (command->cdw10 >> 9) & 0x7;
1453 	if (ses > 0x1) {
1454 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1455 		return (1);
1456 	}
1457 
1458 	/* Only supports a single LBA Format */
1459 	lbaf = command->cdw10 & 0xf;
1460 	if (lbaf != 0) {
1461 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1462 		    NVME_SC_INVALID_FORMAT);
1463 		return (1);
1464 	}
1465 
	/* Doesn't support Protection Information */
1467 	pi = (command->cdw10 >> 5) & 0x7;
1468 	if (pi != 0) {
1469 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1470 		return (1);
1471 	}
1472 
1473 	if (sc->nvstore.type == NVME_STOR_RAM) {
1474 		if (sc->nvstore.ctx)
1475 			free(sc->nvstore.ctx);
1476 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1477 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1478 	} else {
1479 		struct pci_nvme_ioreq *req;
1480 		int err;
1481 
1482 		req = pci_nvme_get_ioreq(sc);
1483 		if (req == NULL) {
1484 			pci_nvme_status_genc(&compl->status,
1485 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1486 			WPRINTF("%s: unable to allocate IO req", __func__);
1487 			return (1);
1488 		}
1489 		req->nvme_sq = &sc->submit_queues[0];
1490 		req->sqid = 0;
1491 		req->opc = command->opc;
1492 		req->cid = command->cid;
1493 		req->nsid = command->nsid;
1494 
1495 		req->io_req.br_offset = 0;
1496 		req->io_req.br_resid = sc->nvstore.size;
1497 		req->io_req.br_callback = pci_nvme_io_done;
1498 
1499 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1500 		if (err) {
1501 			pci_nvme_status_genc(&compl->status,
1502 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1503 			pci_nvme_release_ioreq(sc, req);
1504 		}
1505 	}
1506 
1507 	return (1);
1508 }
1509 
1510 static int
1511 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1512 	struct nvme_completion* compl)
1513 {
1514 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1515 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1516 
1517 	/* TODO: search for the command ID and abort it */
1518 
1519 	compl->cdw0 = 1;
1520 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1521 	return (1);
1522 }
1523 
1524 static int
1525 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1526 	struct nvme_command* command, struct nvme_completion* compl)
1527 {
1528 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1529 
1530 	/* Don't exceed the Async Event Request Limit (AERL). */
1531 	if (pci_nvme_aer_limit_reached(sc)) {
1532 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1533 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1534 		return (1);
1535 	}
1536 
1537 	if (pci_nvme_aer_add(sc, command->cid)) {
1538 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1539 				NVME_SC_INTERNAL_DEVICE_ERROR);
1540 		return (1);
1541 	}
1542 
1543 	/*
1544 	 * Raise events when they happen based on the Set Features cmd.
1545 	 * These events happen async, so only set completion successful if
1546 	 * there is an event reflective of the request to get event.
1547 	 */
1548 	compl->status = NVME_NO_STATUS;
1549 
1550 	return (0);
1551 }
1552 
1553 static void
1554 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1555 {
1556 	struct nvme_completion compl;
1557 	struct nvme_command *cmd;
1558 	struct nvme_submission_queue *sq;
1559 	struct nvme_completion_queue *cq;
1560 	uint16_t sqhead;
1561 
1562 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1563 
1564 	sq = &sc->submit_queues[0];
1565 	cq = &sc->compl_queues[0];
1566 
1567 	pthread_mutex_lock(&sq->mtx);
1568 
1569 	sqhead = sq->head;
1570 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1571 
1572 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1573 		cmd = &(sq->qbase)[sqhead];
1574 		compl.cdw0 = 0;
1575 		compl.status = 0;
1576 
1577 		switch (cmd->opc) {
1578 		case NVME_OPC_DELETE_IO_SQ:
1579 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1580 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1581 			break;
1582 		case NVME_OPC_CREATE_IO_SQ:
1583 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1584 			nvme_opc_create_io_sq(sc, cmd, &compl);
1585 			break;
1586 		case NVME_OPC_DELETE_IO_CQ:
1587 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1588 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1589 			break;
1590 		case NVME_OPC_CREATE_IO_CQ:
1591 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1592 			nvme_opc_create_io_cq(sc, cmd, &compl);
1593 			break;
1594 		case NVME_OPC_GET_LOG_PAGE:
1595 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1596 			nvme_opc_get_log_page(sc, cmd, &compl);
1597 			break;
1598 		case NVME_OPC_IDENTIFY:
1599 			DPRINTF("%s command IDENTIFY", __func__);
1600 			nvme_opc_identify(sc, cmd, &compl);
1601 			break;
1602 		case NVME_OPC_ABORT:
1603 			DPRINTF("%s command ABORT", __func__);
1604 			nvme_opc_abort(sc, cmd, &compl);
1605 			break;
1606 		case NVME_OPC_SET_FEATURES:
1607 			DPRINTF("%s command SET_FEATURES", __func__);
1608 			nvme_opc_set_features(sc, cmd, &compl);
1609 			break;
1610 		case NVME_OPC_GET_FEATURES:
1611 			DPRINTF("%s command GET_FEATURES", __func__);
1612 			nvme_opc_get_features(sc, cmd, &compl);
1613 			break;
1614 		case NVME_OPC_FIRMWARE_ACTIVATE:
1615 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1616 			pci_nvme_status_tc(&compl.status,
1617 			    NVME_SCT_COMMAND_SPECIFIC,
1618 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1619 			break;
1620 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1621 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1622 			nvme_opc_async_event_req(sc, cmd, &compl);
1623 			break;
1624 		case NVME_OPC_FORMAT_NVM:
1625 			DPRINTF("%s command FORMAT_NVM", __func__);
1626 			if ((sc->ctrldata.oacs &
1627 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
1631 			nvme_opc_format_nvm(sc, cmd, &compl);
1632 			break;
1633 		default:
1634 			DPRINTF("0x%x command is not implemented",
1635 			    cmd->opc);
1636 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1637 		}
1638 		sqhead = (sqhead + 1) % sq->size;
1639 
1640 		if (NVME_COMPLETION_VALID(compl)) {
1641 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1642 			    compl.cdw0,
1643 			    cmd->cid,
1644 			    0,		/* SQID */
1645 			    compl.status);
1646 		}
1647 	}
1648 
1649 	DPRINTF("setting sqhead %u", sqhead);
1650 	sq->head = sqhead;
1651 
1652 	if (cq->head != cq->tail)
1653 		pci_generate_msix(sc->nsc_pi, 0);
1654 
1655 	pthread_mutex_unlock(&sq->mtx);
1656 }
1657 
1658 /*
1659  * Update the Write and Read statistics reported in SMART data
1660  *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1664  */
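/*
 * e.g. a single 4 KiB write adds 8 to write_dunits_remainder; starting from
 * 999 that crosses 1,000 and is counted as one full data unit.
 */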
1665 static void
1666 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1667     size_t bytes, uint16_t status)
1668 {
1669 
1670 	pthread_mutex_lock(&sc->mtx);
1671 	switch (opc) {
1672 	case NVME_OPC_WRITE:
1673 		sc->write_commands++;
1674 		if (status != NVME_SC_SUCCESS)
1675 			break;
1676 		sc->write_dunits_remainder += (bytes / 512);
1677 		while (sc->write_dunits_remainder >= 1000) {
1678 			sc->write_data_units++;
1679 			sc->write_dunits_remainder -= 1000;
1680 		}
1681 		break;
1682 	case NVME_OPC_READ:
1683 		sc->read_commands++;
1684 		if (status != NVME_SC_SUCCESS)
1685 			break;
1686 		sc->read_dunits_remainder += (bytes / 512);
1687 		while (sc->read_dunits_remainder >= 1000) {
1688 			sc->read_data_units++;
1689 			sc->read_dunits_remainder -= 1000;
1690 		}
1691 		break;
1692 	default:
1693 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1694 		break;
1695 	}
1696 	pthread_mutex_unlock(&sc->mtx);
1697 }
1698 
1699 /*
1700  * Check if the combination of Starting LBA (slba) and Number of Logical
1701  * Blocks (nlb) exceeds the range of the underlying storage.
1702  *
1703  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1704  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1705  * overflow.
1706  */
1707 static bool
1708 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1709     uint32_t nlb)
1710 {
1711 	size_t	offset, bytes;
1712 
1713 	/* Overflow check of multiplying Starting LBA by the sector size */
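	/* e.g. with 512 byte sectors (sectsz_bits == 9), any slba >= (1ULL << 55) would overflow */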
1714 	if (slba >> (64 - nvstore->sectsz_bits))
1715 		return (true);
1716 
1717 	offset = slba << nvstore->sectsz_bits;
1718 	bytes = nlb << nvstore->sectsz_bits;
1719 
1720 	/* Overflow check of Number of Logical Blocks */
1721 	if ((nvstore->size - offset) < bytes)
1722 		return (true);
1723 
1724 	return (false);
1725 }
1726 
1727 static int
1728 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1729 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1730 {
1731 	int iovidx;
1732 
1733 	if (req == NULL)
1734 		return (-1);
1735 
1736 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1737 		return (-1);
1738 	}
1739 
1740 	/* concatenate contig block-iovs to minimize number of iovs */
1741 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1742 		iovidx = req->io_req.br_iovcnt - 1;
1743 
1744 		req->io_req.br_iov[iovidx].iov_base =
1745 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1746 				     req->prev_gpaddr, size);
1747 
1748 		req->prev_size += size;
1749 		req->io_req.br_resid += size;
1750 
1751 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1752 	} else {
1753 		iovidx = req->io_req.br_iovcnt;
1754 		if (iovidx == 0) {
1755 			req->io_req.br_offset = lba;
1756 			req->io_req.br_resid = 0;
1757 			req->io_req.br_param = req;
1758 		}
1759 
1760 		req->io_req.br_iov[iovidx].iov_base =
1761 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1762 				     gpaddr, size);
1763 
1764 		req->io_req.br_iov[iovidx].iov_len = size;
1765 
1766 		req->prev_gpaddr = gpaddr;
1767 		req->prev_size = size;
1768 		req->io_req.br_resid += size;
1769 
1770 		req->io_req.br_iovcnt++;
1771 	}
1772 
1773 	return (0);
1774 }
1775 
1776 static void
1777 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1778 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1779 	uint32_t cdw0, uint16_t status)
1780 {
1781 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1782 
1783 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1784 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1785 		 NVME_STATUS_GET_SC(status));
1786 
1787 	pci_nvme_cq_update(sc, cq,
1788 	    cdw0,	/* CDW0 */
1789 	    cid,
1790 	    sqid,
1791 	    status);
1792 
1793 	if (cq->head != cq->tail) {
1794 		if (cq->intr_en & NVME_CQ_INTEN) {
1795 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1796 		} else {
1797 			DPRINTF("%s: CQ%u interrupt disabled",
1798 						__func__, sq->cqid);
1799 		}
1800 	}
1801 }
1802 
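/*
 * Return an I/O request structure to the free list and release one slot of
 * the I/O semaphore.  The last completing request also marks the controller
 * Ready if the guest re-enabled it while I/O was still outstanding.
 */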
1803 static void
1804 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1805 {
1806 	req->sc = NULL;
1807 	req->nvme_sq = NULL;
1808 	req->sqid = 0;
1809 
1810 	pthread_mutex_lock(&sc->mtx);
1811 
1812 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1813 	sc->pending_ios--;
1814 
1815 	/* with no I/O pending, set Ready if the guest has enabled the device */
1816 	if (sc->pending_ios == 0 &&
1817 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1818 		sc->regs.csts |= NVME_CSTS_RDY;
1819 
1820 	pthread_mutex_unlock(&sc->mtx);
1821 
1822 	sem_post(&sc->iosemlock);
1823 }
1824 
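/*
 * Allocate an I/O request structure from the free list.  sem_wait() blocks
 * the caller until one of the 'ioslots' slots is available, so submission
 * queue processing throttles when all requests are in flight.
 */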
1825 static struct pci_nvme_ioreq *
1826 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1827 {
1828 	struct pci_nvme_ioreq *req = NULL;
1829 
1830 	sem_wait(&sc->iosemlock);
1831 	pthread_mutex_lock(&sc->mtx);
1832 
1833 	req = STAILQ_FIRST(&sc->ioreqs_free);
1834 	assert(req != NULL);
1835 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1836 
1837 	req->sc = sc;
1838 
1839 	sc->pending_ios++;
1840 
1841 	pthread_mutex_unlock(&sc->mtx);
1842 
1843 	req->io_req.br_iovcnt = 0;
1844 	req->io_req.br_offset = 0;
1845 	req->io_req.br_resid = 0;
1846 	req->io_req.br_param = req;
1847 	req->prev_gpaddr = 0;
1848 	req->prev_size = 0;
1849 
1850 	return req;
1851 }
1852 
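/*
 * blockif completion callback: convert the blockif result to an NVMe status,
 * post the completion, update the SMART counters, and recycle the request.
 */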
1853 static void
1854 pci_nvme_io_done(struct blockif_req *br, int err)
1855 {
1856 	struct pci_nvme_ioreq *req = br->br_param;
1857 	struct nvme_submission_queue *sq = req->nvme_sq;
1858 	uint16_t code, status;
1859 
1860 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1861 
1862 	/* TODO return correct error */
1863 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1864 	pci_nvme_status_genc(&status, code);
1865 
1866 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1867 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1868 	    req->bytes, status);
1869 	pci_nvme_release_ioreq(req->sc, req);
1870 }
1871 
1872 /*
1873  * Implements the Flush command. The specification states:
1874  *    If a volatile write cache is not present, Flush commands complete
1875  *    successfully and have no effect
1876  * in the description of the Volatile Write Cache (VWC) field of the Identify
1877  * Controller data. Therefore, set status to Success if the command is
1878  * not supported (i.e. RAM storage, or the blockif reports it unsupported).
1879  */
1880 static bool
1881 nvme_opc_flush(struct pci_nvme_softc *sc,
1882     struct nvme_command *cmd,
1883     struct pci_nvme_blockstore *nvstore,
1884     struct pci_nvme_ioreq *req,
1885     uint16_t *status)
1886 {
1887 	bool pending = false;
1888 
1889 	if (nvstore->type == NVME_STOR_RAM) {
1890 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1891 	} else {
1892 		int err;
1893 
1894 		req->io_req.br_callback = pci_nvme_io_done;
1895 
1896 		err = blockif_flush(nvstore->ctx, &req->io_req);
1897 		switch (err) {
1898 		case 0:
1899 			pending = true;
1900 			break;
1901 		case EOPNOTSUPP:
1902 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1903 			break;
1904 		default:
1905 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1906 		}
1907 	}
1908 
1909 	return (pending);
1910 }
1911 
1912 static uint16_t
1913 nvme_write_read_ram(struct pci_nvme_softc *sc,
1914     struct pci_nvme_blockstore *nvstore,
1915     uint64_t prp1, uint64_t prp2,
1916     size_t offset, uint64_t bytes,
1917     bool is_write)
1918 {
1919 	uint8_t *buf = nvstore->ctx;
1920 	enum nvme_copy_dir dir;
1921 	uint16_t status;
1922 
1923 	if (is_write)
1924 		dir = NVME_COPY_TO_PRP;
1925 	else
1926 		dir = NVME_COPY_FROM_PRP;
1927 
1928 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1929 	    buf + offset, bytes, dir))
1930 		pci_nvme_status_genc(&status,
1931 		    NVME_SC_DATA_TRANSFER_ERROR);
1932 	else
1933 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1934 
1935 	return (status);
1936 }
1937 
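/*
 * Build the blockif iovec from the command's PRP entries and issue the read
 * or write.  Per the NVMe PRP rules: PRP1 may carry a page offset and covers
 * data up to the end of its page; PRP2 is either the second (and last) data
 * pointer, or, for transfers spanning more than two pages, the address of a
 * PRP list whose final entry chains to the next list page when more entries
 * are needed.  For example, a 3-page transfer with a page-aligned PRP1 uses
 * PRP1 for the first page and two PRP list entries (via PRP2) for the rest.
 * Returns NVME_NO_STATUS when the request was queued to blockif and will be
 * completed asynchronously by pci_nvme_io_done().
 */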
1938 static uint16_t
1939 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1940     struct pci_nvme_blockstore *nvstore,
1941     struct pci_nvme_ioreq *req,
1942     uint64_t prp1, uint64_t prp2,
1943     size_t offset, uint64_t bytes,
1944     bool is_write)
1945 {
1946 	uint64_t size;
1947 	int err;
1948 	uint16_t status = NVME_NO_STATUS;
1949 
1950 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1951 	if (pci_nvme_append_iov_req(sc, req, prp1,
1952 	    size, is_write, offset)) {
1953 		pci_nvme_status_genc(&status,
1954 		    NVME_SC_DATA_TRANSFER_ERROR);
1955 		goto out;
1956 	}
1957 
1958 	offset += size;
1959 	bytes  -= size;
1960 
1961 	if (bytes == 0) {
1962 		;
1963 	} else if (bytes <= PAGE_SIZE) {
1964 		size = bytes;
1965 		if (pci_nvme_append_iov_req(sc, req, prp2,
1966 		    size, is_write, offset)) {
1967 			pci_nvme_status_genc(&status,
1968 			    NVME_SC_DATA_TRANSFER_ERROR);
1969 			goto out;
1970 		}
1971 	} else {
1972 		void *vmctx = sc->nsc_pi->pi_vmctx;
1973 		uint64_t *prp_list = &prp2;
1974 		uint64_t *last = prp_list;
1975 
1976 		/* PRP2 is pointer to a physical region page list */
1977 		while (bytes) {
1978 			/* Last entry in list points to the next list */
1979 			if (prp_list == last) {
1980 				uint64_t prp = *prp_list;
1981 
1982 				prp_list = paddr_guest2host(vmctx, prp,
1983 				    PAGE_SIZE - (prp % PAGE_SIZE));
1984 				last = prp_list + (NVME_PRP2_ITEMS - 1);
1985 			}
1986 
1987 			size = MIN(bytes, PAGE_SIZE);
1988 
1989 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
1990 			    size, is_write, offset)) {
1991 				pci_nvme_status_genc(&status,
1992 				    NVME_SC_DATA_TRANSFER_ERROR);
1993 				goto out;
1994 			}
1995 
1996 			offset += size;
1997 			bytes  -= size;
1998 
1999 			prp_list++;
2000 		}
2001 	}
2002 	req->io_req.br_callback = pci_nvme_io_done;
2003 	if (is_write)
2004 		err = blockif_write(nvstore->ctx, &req->io_req);
2005 	else
2006 		err = blockif_read(nvstore->ctx, &req->io_req);
2007 
2008 	if (err)
2009 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2010 out:
2011 	return (status);
2012 }
2013 
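/*
 * Handle NVM Read/Write commands.  The Starting LBA is CDW11:CDW10 and the
 * Number of Logical Blocks is the zero-based value in CDW12[15:0], hence the
 * "+ 1" below.  Transfers larger than NVME_MAX_DATA_SIZE (the emulated MDTS)
 * are rejected with Invalid Field.
 */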
2014 static bool
2015 nvme_opc_write_read(struct pci_nvme_softc *sc,
2016     struct nvme_command *cmd,
2017     struct pci_nvme_blockstore *nvstore,
2018     struct pci_nvme_ioreq *req,
2019     uint16_t *status)
2020 {
2021 	uint64_t lba, nblocks, bytes;
2022 	size_t offset;
2023 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2024 	bool pending = false;
2025 
2026 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2027 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2028 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2029 		WPRINTF("%s command would exceed LBA range", __func__);
2030 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2031 		goto out;
2032 	}
2033 
2034 	bytes  = nblocks << nvstore->sectsz_bits;
2035 	if (bytes > NVME_MAX_DATA_SIZE) {
2036 		WPRINTF("%s command would exceed MDTS", __func__);
2037 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2038 		goto out;
2039 	}
2040 
2041 	offset = lba << nvstore->sectsz_bits;
2042 
2043 	req->bytes = bytes;
2044 	req->io_req.br_offset = lba;
2045 
2046 	/* PRP bits 1:0 must be zero */
2047 	cmd->prp1 &= ~0x3UL;
2048 	cmd->prp2 &= ~0x3UL;
2049 
2050 	if (nvstore->type == NVME_STOR_RAM) {
2051 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2052 		    cmd->prp2, offset, bytes, is_write);
2053 	} else {
2054 		*status = nvme_write_read_blockif(sc, nvstore, req,
2055 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2056 
2057 		if (*status == NVME_NO_STATUS)
2058 			pending = true;
2059 	}
2060 out:
2061 	if (!pending)
2062 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2063 
2064 	return (pending);
2065 }
2066 
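/*
 * Completion callback ("state machine") for multi-range Dataset Management
 * Deallocate requests.  Each blockif_delete() completion kicks off the next
 * range from the br_iov array; prev_gpaddr is reused as the current range
 * index and prev_size as the total number of ranges (see
 * nvme_opc_dataset_mgmt()).  The NVMe completion is posted only after the
 * last range finishes or an error occurs.
 */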
2067 static void
2068 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2069 {
2070 	struct pci_nvme_ioreq *req = br->br_param;
2071 	struct pci_nvme_softc *sc = req->sc;
2072 	bool done = true;
2073 	uint16_t status;
2074 
2075 	if (err) {
2076 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2077 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2078 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2079 	} else {
2080 		struct iovec *iov = req->io_req.br_iov;
2081 
2082 		req->prev_gpaddr++;
2083 		iov += req->prev_gpaddr;
2084 
2085 		/* The iov_* values already include the sector size */
2086 		req->io_req.br_offset = (off_t)iov->iov_base;
2087 		req->io_req.br_resid = iov->iov_len;
2088 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2089 			pci_nvme_status_genc(&status,
2090 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2091 		} else
2092 			done = false;
2093 	}
2094 
2095 	if (done) {
2096 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2097 		    req->cid, 0, status);
2098 		pci_nvme_release_ioreq(sc, req);
2099 	}
2100 }
2101 
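/*
 * Handle the Dataset Management command.  The Number of Ranges in CDW10[7:0]
 * is zero based (up to 256 ranges), and only the Deallocate attribute (CDW11)
 * is acted on; ranges are copied out of guest memory first because a range
 * entry may straddle a PRP boundary.
 */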
2102 static bool
2103 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2104     struct nvme_command *cmd,
2105     struct pci_nvme_blockstore *nvstore,
2106     struct pci_nvme_ioreq *req,
2107     uint16_t *status)
2108 {
2109 	struct nvme_dsm_range *range;
2110 	uint32_t nr, r, non_zero, dr;
2111 	int err;
2112 	bool pending = false;
2113 
2114 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2115 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2116 		goto out;
2117 	}
2118 
2119 	nr = cmd->cdw10 & 0xff;
2120 
2121 	/* copy locally because a range entry could straddle PRPs */
2122 	range = calloc(1, NVME_MAX_DSM_TRIM);
2123 	if (range == NULL) {
2124 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2125 		goto out;
2126 	}
2127 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2128 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2129 
2130 	/* Check for invalid ranges and the number of non-zero lengths */
2131 	non_zero = 0;
2132 	for (r = 0; r <= nr; r++) {
2133 		if (pci_nvme_out_of_range(nvstore,
2134 		    range[r].starting_lba, range[r].length)) {
2135 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2136 			goto out;
2137 		}
2138 		if (range[r].length != 0)
2139 			non_zero++;
2140 	}
2141 
2142 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2143 		size_t offset, bytes;
2144 		int sectsz_bits = sc->nvstore.sectsz_bits;
2145 
2146 		/*
2147 		 * DSM calls are advisory only, and compliant controllers
2148 		 * may choose to take no actions (i.e. return Success).
2149 		 */
2150 		if (!nvstore->deallocate) {
2151 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2152 			goto out;
2153 		}
2154 
2155 		/* If all ranges have a zero length, return Success */
2156 		if (non_zero == 0) {
2157 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2158 			goto out;
2159 		}
2160 
2161 		if (req == NULL) {
2162 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2163 			goto out;
2164 		}
2165 
2166 		offset = range[0].starting_lba << sectsz_bits;
2167 		bytes = range[0].length << sectsz_bits;
2168 
2169 		/*
2170 		 * If the request is for more than a single range, store
2171 		 * the ranges in the br_iov. Optimize for the common case
2172 		 * of a single range.
2173 		 *
2174 		 * Note that NVMe Number of Ranges is a zero based value
2175 		 */
2176 		req->io_req.br_iovcnt = 0;
2177 		req->io_req.br_offset = offset;
2178 		req->io_req.br_resid = bytes;
2179 
2180 		if (nr == 0) {
2181 			req->io_req.br_callback = pci_nvme_io_done;
2182 		} else {
2183 			struct iovec *iov = req->io_req.br_iov;
2184 
2185 			for (r = 0, dr = 0; r <= nr; r++) {
2186 				offset = range[r].starting_lba << sectsz_bits;
2187 				bytes = range[r].length << sectsz_bits;
2188 				if (bytes == 0)
2189 					continue;
2190 
2191 				if ((nvstore->size - offset) < bytes) {
2192 					pci_nvme_status_genc(status,
2193 					    NVME_SC_LBA_OUT_OF_RANGE);
2194 					goto out;
2195 				}
2196 				iov[dr].iov_base = (void *)offset;
2197 				iov[dr].iov_len = bytes;
2198 				dr++;
2199 			}
2200 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2201 
2202 			/*
2203 			 * Use prev_gpaddr to track the current entry and
2204 			 * prev_size to track the number of entries
2205 			 */
2206 			req->prev_gpaddr = 0;
2207 			req->prev_size = dr;
2208 		}
2209 
2210 		err = blockif_delete(nvstore->ctx, &req->io_req);
2211 		if (err)
2212 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2213 		else
2214 			pending = true;
2215 	}
2216 out:
2217 	free(range);
2218 	return (pending);
2219 }
2220 
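/*
 * Process new entries on an I/O Submission Queue, from the current head up
 * to the tail most recently written via the doorbell.  Commands whose
 * handlers return 'pending' are completed later from the blockif callback;
 * everything else is completed inline.
 */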
2221 static void
2222 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2223 {
2224 	struct nvme_submission_queue *sq;
2225 	uint16_t status;
2226 	uint16_t sqhead;
2227 
2228 	/* handle all submissions up to sq->tail index */
2229 	sq = &sc->submit_queues[idx];
2230 
2231 	pthread_mutex_lock(&sq->mtx);
2232 
2233 	sqhead = sq->head;
2234 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2235 	         idx, sqhead, sq->tail, sq->qbase);
2236 
2237 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2238 		struct nvme_command *cmd;
2239 		struct pci_nvme_ioreq *req;
2240 		uint32_t nsid;
2241 		bool pending;
2242 
2243 		pending = false;
2244 		req = NULL;
2245 		status = 0;
2246 
2247 		cmd = &sq->qbase[sqhead];
2248 		sqhead = (sqhead + 1) % sq->size;
2249 
2250 		nsid = le32toh(cmd->nsid);
2251 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2252 			pci_nvme_status_genc(&status,
2253 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2254 			status |=
2255 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2256 			goto complete;
2257 		}
2258 
2259 		req = pci_nvme_get_ioreq(sc);
2260 		if (req == NULL) {
2261 			pci_nvme_status_genc(&status,
2262 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2263 			WPRINTF("%s: unable to allocate IO req", __func__);
2264 			goto complete;
2265 		}
2266 		req->nvme_sq = sq;
2267 		req->sqid = idx;
2268 		req->opc = cmd->opc;
2269 		req->cid = cmd->cid;
2270 		req->nsid = cmd->nsid;
2271 
2272 		switch (cmd->opc) {
2273 		case NVME_OPC_FLUSH:
2274 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2275 			    req, &status);
2276 			break;
2277 		case NVME_OPC_WRITE:
2278 		case NVME_OPC_READ:
2279 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2280 			    req, &status);
2281 			break;
2282 		case NVME_OPC_WRITE_ZEROES:
2283 			/* TODO: write zeroes
2284 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2285 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2286 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2287 			break;
2288 		case NVME_OPC_DATASET_MANAGEMENT:
2289 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2290 			    req, &status);
2291 			break;
2292 		default:
2293 			WPRINTF("%s unhandled io command 0x%x",
2294 			    __func__, cmd->opc);
2295 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2296 		}
2297 complete:
2298 		if (!pending) {
2299 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2300 			    status);
2301 			if (req != NULL)
2302 				pci_nvme_release_ioreq(sc, req);
2303 		}
2304 	}
2305 
2306 	sq->head = sqhead;
2307 
2308 	pthread_mutex_unlock(&sq->mtx);
2309 }
2310 
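/*
 * Dispatch a doorbell write.  The layout here assumes the minimum doorbell
 * stride (CAP.DSTRD = 0): each queue pair owns 8 bytes past
 * NVME_DOORBELL_OFFSET, with the SQ y tail doorbell at byte 8*y and the
 * CQ y head doorbell at byte 8*y + 4.  Index 0 is the admin queue pair
 * (pci_nvme_handle_admin_cmd()); other SQ indices are I/O queues
 * (pci_nvme_handle_io_cmd()).
 */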
2311 static void
2312 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2313 	uint64_t idx, int is_sq, uint64_t value)
2314 {
2315 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2316 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2317 
2318 	if (is_sq) {
2319 		if (idx > sc->num_squeues) {
2320 			WPRINTF("%s queue index %lu overflow from "
2321 			         "guest (max %u)",
2322 			         __func__, idx, sc->num_squeues);
2323 			return;
2324 		}
2325 
2326 		atomic_store_short(&sc->submit_queues[idx].tail,
2327 		                   (uint16_t)value);
2328 
2329 		if (idx == 0) {
2330 			pci_nvme_handle_admin_cmd(sc, value);
2331 		} else {
2332 			/* submission queue; handle new entries in SQ */
2333 			if (idx > sc->num_squeues) {
2334 				WPRINTF("%s SQ index %lu overflow from "
2335 				         "guest (max %u)",
2336 				         __func__, idx, sc->num_squeues);
2337 				return;
2338 			}
2339 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2340 		}
2341 	} else {
2342 		if (idx > sc->num_cqueues) {
2343 			WPRINTF("%s queue index %lu overflow from "
2344 			         "guest (max %u)",
2345 			         __func__, idx, sc->num_cqueues);
2346 			return;
2347 		}
2348 
2349 		atomic_store_short(&sc->compl_queues[idx].head,
2350 				(uint16_t)value);
2351 	}
2352 }
2353 
2354 static void
2355 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2356 {
2357 	const char *s = iswrite ? "WRITE" : "READ";
2358 
2359 	switch (offset) {
2360 	case NVME_CR_CAP_LOW:
2361 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2362 		break;
2363 	case NVME_CR_CAP_HI:
2364 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2365 		break;
2366 	case NVME_CR_VS:
2367 		DPRINTF("%s %s NVME_CR_VS", func, s);
2368 		break;
2369 	case NVME_CR_INTMS:
2370 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2371 		break;
2372 	case NVME_CR_INTMC:
2373 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2374 		break;
2375 	case NVME_CR_CC:
2376 		DPRINTF("%s %s NVME_CR_CC", func, s);
2377 		break;
2378 	case NVME_CR_CSTS:
2379 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2380 		break;
2381 	case NVME_CR_NSSR:
2382 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2383 		break;
2384 	case NVME_CR_AQA:
2385 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2386 		break;
2387 	case NVME_CR_ASQ_LOW:
2388 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2389 		break;
2390 	case NVME_CR_ASQ_HI:
2391 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2392 		break;
2393 	case NVME_CR_ACQ_LOW:
2394 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2395 		break;
2396 	case NVME_CR_ACQ_HI:
2397 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2398 		break;
2399 	default:
2400 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2401 	}
2402 
2403 }
2404 
2405 static void
2406 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2407 	uint64_t offset, int size, uint64_t value)
2408 {
2409 	uint32_t ccreg;
2410 
2411 	if (offset >= NVME_DOORBELL_OFFSET) {
2412 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2413 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2414 		int is_sq = (belloffset % 8) < 4;
2415 
2416 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2417 			WPRINTF("guest attempted an overflow write offset "
2418 			         "0x%lx, val 0x%lx in %s",
2419 			         offset, value, __func__);
2420 			return;
2421 		}
2422 
2423 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2424 		return;
2425 	}
2426 
2427 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2428 	        offset, size, value);
2429 
2430 	if (size != 4) {
2431 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2432 		         "val 0x%lx) to bar0 in %s",
2433 		         size, offset, value, __func__);
2434 		/* TODO: shutdown device */
2435 		return;
2436 	}
2437 
2438 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2439 
2440 	pthread_mutex_lock(&sc->mtx);
2441 
2442 	switch (offset) {
2443 	case NVME_CR_CAP_LOW:
2444 	case NVME_CR_CAP_HI:
2445 		/* readonly */
2446 		break;
2447 	case NVME_CR_VS:
2448 		/* readonly */
2449 		break;
2450 	case NVME_CR_INTMS:
2451 		/* MSI-X, so ignore */
2452 		break;
2453 	case NVME_CR_INTMC:
2454 		/* MSI-X, so ignore */
2455 		break;
2456 	case NVME_CR_CC:
2457 		ccreg = (uint32_t)value;
2458 
2459 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2460 		         "iocqes %u",
2461 		        __func__,
2462 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2463 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2464 			 NVME_CC_GET_IOCQES(ccreg));
2465 
2466 		if (NVME_CC_GET_SHN(ccreg)) {
2467 			/* perform shutdown - flush out data to backend */
2468 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2469 			    NVME_CSTS_REG_SHST_SHIFT);
2470 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2471 			    NVME_CSTS_REG_SHST_SHIFT;
2472 		}
2473 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2474 			if (NVME_CC_GET_EN(ccreg) == 0)
2475 				/* transition 1->0 causes controller reset */
2476 				pci_nvme_reset_locked(sc);
2477 			else
2478 				pci_nvme_init_controller(ctx, sc);
2479 		}
2480 
2481 		/* Insert the iocqes, iosqes and en bits from the write */
2482 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2483 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2484 		if (NVME_CC_GET_EN(ccreg) == 0) {
2485 			/* Insert the ams, mps and css bit fields */
2486 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2487 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2488 			sc->regs.csts &= ~NVME_CSTS_RDY;
2489 		} else if (sc->pending_ios == 0) {
2490 			sc->regs.csts |= NVME_CSTS_RDY;
2491 		}
2492 		break;
2493 	case NVME_CR_CSTS:
2494 		break;
2495 	case NVME_CR_NSSR:
2496 		/* ignore writes; don't support subsystem reset */
2497 		break;
2498 	case NVME_CR_AQA:
2499 		sc->regs.aqa = (uint32_t)value;
2500 		break;
2501 	case NVME_CR_ASQ_LOW:
2502 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2503 		               (0xFFFFF000 & value);
2504 		break;
2505 	case NVME_CR_ASQ_HI:
2506 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2507 		               (value << 32);
2508 		break;
2509 	case NVME_CR_ACQ_LOW:
2510 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2511 		               (0xFFFFF000 & value);
2512 		break;
2513 	case NVME_CR_ACQ_HI:
2514 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2515 		               (value << 32);
2516 		break;
2517 	default:
2518 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2519 		         __func__, offset, value, size);
2520 	}
2521 	pthread_mutex_unlock(&sc->mtx);
2522 }
2523 
2524 static void
2525 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2526                 int baridx, uint64_t offset, int size, uint64_t value)
2527 {
2528 	struct pci_nvme_softc* sc = pi->pi_arg;
2529 
2530 	if (baridx == pci_msix_table_bar(pi) ||
2531 	    baridx == pci_msix_pba_bar(pi)) {
2532 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2533 		         " value 0x%lx", baridx, offset, size, value);
2534 
2535 		pci_emul_msix_twrite(pi, offset, size, value);
2536 		return;
2537 	}
2538 
2539 	switch (baridx) {
2540 	case 0:
2541 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2542 		break;
2543 
2544 	default:
2545 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2546 		         __func__, baridx, value);
2547 	}
2548 }
2549 
2550 static uint64_t
2551 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
2552 {
2553 	uint64_t value;
2554 
2555 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2556 
2557 	if (offset < NVME_DOORBELL_OFFSET) {
2558 		void *p = &(sc->regs);
2559 		pthread_mutex_lock(&sc->mtx);
2560 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2561 		pthread_mutex_unlock(&sc->mtx);
2562 	} else {
2563 		value = 0;
2564 		WPRINTF("pci_nvme: read invalid offset %lu", offset);
2565 	}
2566 
2567 	switch (size) {
2568 	case 1:
2569 		value &= 0xFF;
2570 		break;
2571 	case 2:
2572 		value &= 0xFFFF;
2573 		break;
2574 	case 4:
2575 		value &= 0xFFFFFFFF;
2576 		break;
2577 	}
2578 
2579 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2580 	         offset, size, (uint32_t)value);
2581 
2582 	return (value);
2583 }
2584 
2587 static uint64_t
2588 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2589     uint64_t offset, int size)
2590 {
2591 	struct pci_nvme_softc* sc = pi->pi_arg;
2592 
2593 	if (baridx == pci_msix_table_bar(pi) ||
2594 	    baridx == pci_msix_pba_bar(pi)) {
2595 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2596 		        baridx, offset, size);
2597 
2598 		return pci_emul_msix_tread(pi, offset, size);
2599 	}
2600 
2601 	switch (baridx) {
2602 	case 0:
2603 		return pci_nvme_read_bar_0(sc, offset, size);
2604 
2605 	default:
2606 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2607 	}
2608 
2609 	return (0);
2610 }
2611 
2612 
2613 static int
2614 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2615 {
2616 	char bident[sizeof("XX:X:X")];
2617 	char	*uopt, *xopts, *config;
2618 	uint32_t sectsz;
2619 	int optidx;
2620 
2621 	sc->max_queues = NVME_QUEUES;
2622 	sc->max_qentries = NVME_MAX_QENTRIES;
2623 	sc->ioslots = NVME_IOSLOTS;
2624 	sc->num_squeues = sc->max_queues;
2625 	sc->num_cqueues = sc->max_queues;
2626 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2627 	sectsz = 0;
2628 
2629 	uopt = strdup(opts);
2630 	optidx = 0;
2631 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2632 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2633 	for (xopts = strtok(uopt, ",");
2634 	     xopts != NULL;
2635 	     xopts = strtok(NULL, ",")) {
2636 
2637 		if ((config = strchr(xopts, '=')) != NULL)
2638 			*config++ = '\0';
2639 
2640 		if (!strcmp("maxq", xopts)) {
2641 			sc->max_queues = atoi(config);
2642 		} else if (!strcmp("qsz", xopts)) {
2643 			sc->max_qentries = atoi(config);
2644 		} else if (!strcmp("ioslots", xopts)) {
2645 			sc->ioslots = atoi(config);
2646 		} else if (!strcmp("sectsz", xopts)) {
2647 			sectsz = atoi(config);
2648 		} else if (!strcmp("ser", xopts)) {
2649 			/*
2650 			 * This field indicates the Product Serial Number in
2651 			 * 7-bit ASCII; unused bytes should be space characters.
2652 			 * Ref: NVMe v1.3c.
2653 			 */
2654 			cpywithpad((char *)sc->ctrldata.sn,
2655 			           sizeof(sc->ctrldata.sn), config, ' ');
2656 		} else if (!strcmp("ram", xopts)) {
2657 			uint64_t sz = strtoull(config, NULL, 10);
2658 
2659 			sc->nvstore.type = NVME_STOR_RAM;
2660 			sc->nvstore.size = sz * 1024 * 1024;
2661 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2662 			sc->nvstore.sectsz = 4096;
2663 			sc->nvstore.sectsz_bits = 12;
2664 			if (sc->nvstore.ctx == NULL) {
2665 				perror("Unable to allocate RAM");
2666 				free(uopt);
2667 				return (-1);
2668 			}
2669 		} else if (!strcmp("eui64", xopts)) {
2670 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2671 		} else if (!strcmp("dsm", xopts)) {
2672 			if (!strcmp("auto", config))
2673 				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2674 			else if (!strcmp("enable", config))
2675 				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2676 			else if (!strcmp("disable", config))
2677 				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2678 		} else if (optidx == 0) {
2679 			snprintf(bident, sizeof(bident), "%d:%d",
2680 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2681 			sc->nvstore.ctx = blockif_open(xopts, bident);
2682 			if (sc->nvstore.ctx == NULL) {
2683 				perror("Could not open backing file");
2684 				free(uopt);
2685 				return (-1);
2686 			}
2687 			sc->nvstore.type = NVME_STOR_BLOCKIF;
2688 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2689 		} else {
2690 			EPRINTLN("Invalid option %s", xopts);
2691 			free(uopt);
2692 			return (-1);
2693 		}
2694 
2695 		optidx++;
2696 	}
2697 	free(uopt);
2698 
2699 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2700 		EPRINTLN("backing store not specified");
2701 		return (-1);
2702 	}
2703 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2704 		sc->nvstore.sectsz = sectsz;
2705 	else if (sc->nvstore.type != NVME_STOR_RAM)
2706 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2707 	for (sc->nvstore.sectsz_bits = 9;
2708 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2709 	     sc->nvstore.sectsz_bits++);
2710 
2711 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2712 		sc->max_queues = NVME_QUEUES;
2713 
2714 	if (sc->max_qentries <= 0) {
2715 		EPRINTLN("Invalid qsz option");
2716 		return (-1);
2717 	}
2718 	if (sc->ioslots <= 0) {
2719 		EPRINTLN("Invalid ioslots option");
2720 		return (-1);
2721 	}
2722 
2723 	return (0);
2724 }
2725 
2726 static int
2727 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2728 {
2729 	struct pci_nvme_softc *sc;
2730 	uint32_t pci_membar_sz;
2731 	int	error;
2732 
2733 	error = 0;
2734 
2735 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2736 	pi->pi_arg = sc;
2737 	sc->nsc_pi = pi;
2738 
2739 	error = pci_nvme_parse_opts(sc, opts);
2740 	if (error < 0)
2741 		goto done;
2742 	else
2743 		error = 0;
2744 
2745 	STAILQ_INIT(&sc->ioreqs_free);
2746 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2747 	for (int i = 0; i < sc->ioslots; i++) {
2748 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2749 	}
2750 
2751 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2752 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2753 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2754 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2755 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2756 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2757 
2758 	/*
2759 	 * Allocate size of NVMe registers + doorbell space for all queues.
2760 	 *
2761 	 * The specification requires a minimum memory I/O window size of 16K.
2762 	 * The Windows driver will refuse to start a device with a smaller
2763 	 * window.
2764 	 */
2765 	pci_membar_sz = sizeof(struct nvme_registers) +
2766 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2767 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2768 
2769 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2770 
2771 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2772 	if (error) {
2773 		WPRINTF("%s pci alloc mem bar failed", __func__);
2774 		goto done;
2775 	}
2776 
2777 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2778 	if (error) {
2779 		WPRINTF("%s pci add msixcap failed", __func__);
2780 		goto done;
2781 	}
2782 
2783 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2784 	if (error) {
2785 		WPRINTF("%s pci add Express capability failed", __func__);
2786 		goto done;
2787 	}
2788 
2789 	pthread_mutex_init(&sc->mtx, NULL);
2790 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2791 
2792 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2793 	/*
2794 	 * Controller data depends on Namespace data so initialize Namespace
2795 	 * data first.
2796 	 */
2797 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2798 	pci_nvme_init_ctrldata(sc);
2799 	pci_nvme_init_logpages(sc);
2800 	pci_nvme_init_features(sc);
2801 
2802 	pci_nvme_aer_init(sc);
2803 
2804 	pci_nvme_reset(sc);
2805 
2806 	pci_lintr_request(pi);
2807 
2808 done:
2809 	return (error);
2810 }
2811 
2813 struct pci_devemu pci_de_nvme = {
2814 	.pe_emu =	"nvme",
2815 	.pe_init =	pci_nvme_init,
2816 	.pe_barwrite =	pci_nvme_write,
2817 	.pe_barread =	pci_nvme_read
2818 };
2819 PCI_EMUL_SET(pci_de_nvme);
2820