xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision f9693bef8dc83284e7ac905adc346f7d866b5245)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *           (see the example invocation below)
52  */
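
/*
 * Example invocation (illustrative only; the PCI slot, zvol path, serial
 * number, and option values below are placeholders, not defaults):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/tank/vm0-disk,maxq=4,qsz=256,ioslots=16,ser=NVME0001,dsm=auto ...
 *
 * A 1024 MiB RAM-backed namespace could instead be configured with:
 *
 *   bhyve ... -s 4,nvme,ram=1024 ...
 */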
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "config.h"
85 #include "debug.h"
86 #include "pci_emul.h"
87 
88 
89 static int nvme_debug = 0;
90 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
91 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
92 
93 /* defaults; can be overridden */
94 #define	NVME_MSIX_BAR		4
95 
96 #define	NVME_IOSLOTS		8
97 
98 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
99 #define NVME_MMIO_SPACE_MIN	(1 << 14)
100 
101 #define	NVME_QUEUES		16
102 #define	NVME_MAX_QENTRIES	2048
103 /* Memory Page size Minimum reported in CAP register */
104 #define	NVME_MPSMIN		0
105 /* MPSMIN converted to bytes */
106 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
107 
108 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
109 #define	NVME_MDTS		9
110 /* Note the + 1 allows for the initial descriptor to not be page aligned */
111 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
112 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
113 
114 /* This is a synthetic status code to indicate there is no status */
115 #define NVME_NO_STATUS		0xffff
116 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
117 
118 /* helpers */
119 
120 /* Convert a zero-based value into a one-based value */
121 #define ONE_BASED(zero)		((zero) + 1)
122 /* Convert a one-based value into a zero-based value */
123 #define ZERO_BASED(one)		((one)  - 1)
124 
125 /* Encode number of SQ's and CQ's for Set/Get Features */
126 #define NVME_FEATURE_NUM_QUEUES(sc) \
127 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
128 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
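/* E.g. with num_squeues == 4 and num_cqueues == 4 this encodes to 0x00030003 */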
129 
130 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
131 
132 enum nvme_controller_register_offsets {
133 	NVME_CR_CAP_LOW = 0x00,
134 	NVME_CR_CAP_HI  = 0x04,
135 	NVME_CR_VS      = 0x08,
136 	NVME_CR_INTMS   = 0x0c,
137 	NVME_CR_INTMC   = 0x10,
138 	NVME_CR_CC      = 0x14,
139 	NVME_CR_CSTS    = 0x1c,
140 	NVME_CR_NSSR    = 0x20,
141 	NVME_CR_AQA     = 0x24,
142 	NVME_CR_ASQ_LOW = 0x28,
143 	NVME_CR_ASQ_HI  = 0x2c,
144 	NVME_CR_ACQ_LOW = 0x30,
145 	NVME_CR_ACQ_HI  = 0x34,
146 };
147 
148 enum nvme_cmd_cdw11 {
149 	NVME_CMD_CDW11_PC  = 0x0001,
150 	NVME_CMD_CDW11_IEN = 0x0002,
151 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
152 };
153 
154 enum nvme_copy_dir {
155 	NVME_COPY_TO_PRP,
156 	NVME_COPY_FROM_PRP,
157 };
158 
159 #define	NVME_CQ_INTEN	0x01
160 #define	NVME_CQ_INTCOAL	0x02
161 
162 struct nvme_completion_queue {
163 	struct nvme_completion *qbase;
164 	pthread_mutex_t	mtx;
165 	uint32_t	size;
166 	uint16_t	tail; /* nvme progress */
167 	uint16_t	head; /* guest progress */
168 	uint16_t	intr_vec;
169 	uint32_t	intr_en;
170 };
171 
172 struct nvme_submission_queue {
173 	struct nvme_command *qbase;
174 	pthread_mutex_t	mtx;
175 	uint32_t	size;
176 	uint16_t	head; /* nvme progress */
177 	uint16_t	tail; /* guest progress */
178 	uint16_t	cqid; /* completion queue id */
179 	int		qpriority;
180 };
181 
182 enum nvme_storage_type {
183 	NVME_STOR_BLOCKIF = 0,
184 	NVME_STOR_RAM = 1,
185 };
186 
187 struct pci_nvme_blockstore {
188 	enum nvme_storage_type type;
189 	void		*ctx;
190 	uint64_t	size;
191 	uint32_t	sectsz;
192 	uint32_t	sectsz_bits;
193 	uint64_t	eui64;
194 	uint32_t	deallocate:1;
195 };
196 
197 /*
198  * Calculate the number of additional page descriptors for guest IO requests
199  * based on the advertised Max Data Transfer Size (MDTS) and given the number of
200  * default iovec's in a struct blockif_req.
201  *
202  * Note the + 1 allows for the initial descriptor to not be page aligned.
203  */
204 #define MDTS_PAD_SIZE \
205 	NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
206 	NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
207 	0
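
/*
 * For example, with NVME_MDTS == 9, NVME_MAX_IOVEC is 513; if BLOCKIF_IOV_MAX
 * were 128 (see block_if.h for its actual value), MDTS_PAD_SIZE would be 385
 * extra iovec entries. These numbers are illustrative only.
 */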
208 
209 struct pci_nvme_ioreq {
210 	struct pci_nvme_softc *sc;
211 	STAILQ_ENTRY(pci_nvme_ioreq) link;
212 	struct nvme_submission_queue *nvme_sq;
213 	uint16_t	sqid;
214 
215 	/* command information */
216 	uint16_t	opc;
217 	uint16_t	cid;
218 	uint32_t	nsid;
219 
220 	uint64_t	prev_gpaddr;
221 	size_t		prev_size;
222 	size_t		bytes;
223 
224 	struct blockif_req io_req;
225 
226 	struct iovec	iovpadding[MDTS_PAD_SIZE];
227 };
228 
229 enum nvme_dsm_type {
230 	/* Dataset Management bit in ONCS reflects backing storage capability */
231 	NVME_DATASET_MANAGEMENT_AUTO,
232 	/* Unconditionally set Dataset Management bit in ONCS */
233 	NVME_DATASET_MANAGEMENT_ENABLE,
234 	/* Unconditionally clear Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_DISABLE,
236 };
237 
238 struct pci_nvme_softc;
239 struct nvme_feature_obj;
240 
241 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
242     struct nvme_feature_obj *,
243     struct nvme_command *,
244     struct nvme_completion *);
245 
246 struct nvme_feature_obj {
247 	uint32_t	cdw11;
248 	nvme_feature_cb	set;
249 	nvme_feature_cb	get;
250 	bool namespace_specific;
251 };
252 
253 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
254 
255 struct pci_nvme_aer {
256 	STAILQ_ENTRY(pci_nvme_aer) link;
257 	uint16_t	cid;	/* Command ID of the submitted AER */
258 };
259 
260 struct pci_nvme_softc {
261 	struct pci_devinst *nsc_pi;
262 
263 	pthread_mutex_t	mtx;
264 
265 	struct nvme_registers regs;
266 
267 	struct nvme_namespace_data  nsdata;
268 	struct nvme_controller_data ctrldata;
269 	struct nvme_error_information_entry err_log;
270 	struct nvme_health_information_page health_log;
271 	struct nvme_firmware_page fw_log;
272 
273 	struct pci_nvme_blockstore nvstore;
274 
275 	uint16_t	max_qentries;	/* max entries per queue */
276 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
277 	uint32_t	num_cqueues;
278 	uint32_t	num_squeues;
279 	bool		num_q_is_set; /* Has host set Number of Queues */
280 
281 	struct pci_nvme_ioreq *ioreqs;
282 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
283 	uint32_t	pending_ios;
284 	uint32_t	ioslots;
285 	sem_t		iosemlock;
286 
287 	/*
288 	 * Memory mapped Submission and Completion queues
289 	 * Each array includes both Admin and IO queues
290 	 */
291 	struct nvme_completion_queue *compl_queues;
292 	struct nvme_submission_queue *submit_queues;
293 
294 	struct nvme_feature_obj feat[NVME_FID_MAX];
295 
296 	enum nvme_dsm_type dataset_management;
297 
298 	/* Accounting for SMART data */
299 	__uint128_t	read_data_units;
300 	__uint128_t	write_data_units;
301 	__uint128_t	read_commands;
302 	__uint128_t	write_commands;
303 	uint32_t	read_dunits_remainder;
304 	uint32_t	write_dunits_remainder;
305 
306 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
307 	uint32_t	aer_count;
308 };
309 
310 
311 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
312 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
313 static void pci_nvme_io_done(struct blockif_req *, int);
314 
315 /* Controller Configuration utils */
316 #define	NVME_CC_GET_EN(cc) \
317 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
318 #define	NVME_CC_GET_CSS(cc) \
319 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
320 #define	NVME_CC_GET_SHN(cc) \
321 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
322 #define	NVME_CC_GET_IOSQES(cc) \
323 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
324 #define	NVME_CC_GET_IOCQES(cc) \
325 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
326 
327 #define	NVME_CC_WRITE_MASK \
328 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
329 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
330 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
331 
332 #define	NVME_CC_NEN_WRITE_MASK \
333 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
334 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
335 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
336 
337 /* Controller Status utils */
338 #define	NVME_CSTS_GET_RDY(sts) \
339 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
340 
341 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
342 
343 /* Completion Queue status word utils */
344 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
345 #define	NVME_STATUS_MASK \
346 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
347 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
348 
349 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
350 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
351 
352 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
353     struct nvme_feature_obj *,
354     struct nvme_command *,
355     struct nvme_completion *);
356 static void nvme_feature_num_queues(struct pci_nvme_softc *,
357     struct nvme_feature_obj *,
358     struct nvme_command *,
359     struct nvme_completion *);
360 static void nvme_feature_iv_config(struct pci_nvme_softc *,
361     struct nvme_feature_obj *,
362     struct nvme_command *,
363     struct nvme_completion *);
364 
365 static __inline void
366 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
367 {
368 	size_t len;
369 
370 	len = strnlen(src, dst_size);
371 	memset(dst, pad, dst_size);
372 	memcpy(dst, src, len);
373 }
374 
375 static __inline void
376 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
377 {
378 
379 	*status &= ~NVME_STATUS_MASK;
380 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
381 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
382 }
383 
384 static __inline void
385 pci_nvme_status_genc(uint16_t *status, uint16_t code)
386 {
387 
388 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
389 }
390 
391 /*
392  * Initialize the requested number of IO Submission and Completion Queues.
393  * Admin queues are allocated implicitly.
394  */
395 static void
396 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
397 {
398 	uint32_t i;
399 
400 	/*
401 	 * Allocate and initialize the Submission Queues
402 	 */
403 	if (nsq > NVME_QUEUES) {
404 		WPRINTF("%s: clamping number of SQ from %u to %u",
405 					__func__, nsq, NVME_QUEUES);
406 		nsq = NVME_QUEUES;
407 	}
408 
409 	sc->num_squeues = nsq;
410 
411 	sc->submit_queues = calloc(sc->num_squeues + 1,
412 				sizeof(struct nvme_submission_queue));
413 	if (sc->submit_queues == NULL) {
414 		WPRINTF("%s: SQ allocation failed", __func__);
415 		sc->num_squeues = 0;
416 	} else {
417 		struct nvme_submission_queue *sq = sc->submit_queues;
418 
419 		for (i = 0; i < sc->num_squeues; i++)
420 			pthread_mutex_init(&sq[i].mtx, NULL);
421 	}
422 
423 	/*
424 	 * Allocate and initialize the Completion Queues
425 	 */
426 	if (ncq > NVME_QUEUES) {
427 		WPRINTF("%s: clamping number of CQ from %u to %u",
428 					__func__, ncq, NVME_QUEUES);
429 		ncq = NVME_QUEUES;
430 	}
431 
432 	sc->num_cqueues = ncq;
433 
434 	sc->compl_queues = calloc(sc->num_cqueues + 1,
435 				sizeof(struct nvme_completion_queue));
436 	if (sc->compl_queues == NULL) {
437 		WPRINTF("%s: CQ allocation failed", __func__);
438 		sc->num_cqueues = 0;
439 	} else {
440 		struct nvme_completion_queue *cq = sc->compl_queues;
441 
442 		for (i = 0; i < sc->num_cqueues; i++)
443 			pthread_mutex_init(&cq[i].mtx, NULL);
444 	}
445 }
446 
447 static void
448 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
449 {
450 	struct nvme_controller_data *cd = &sc->ctrldata;
451 
452 	cd->vid = 0xFB5D;
453 	cd->ssvid = 0x0000;
454 
455 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
456 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
457 
458 	/* Num of submission commands that we can handle at a time (2^rab) */
459 	cd->rab   = 4;
460 
461 	/* FreeBSD OUI */
462 	cd->ieee[0] = 0x58;
463 	cd->ieee[1] = 0x9c;
464 	cd->ieee[2] = 0xfc;
465 
466 	cd->mic = 0;
467 
468 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
469 
470 	cd->ver = 0x00010300;
471 
472 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
473 	cd->acl = 2;
474 	cd->aerl = 4;
475 
476 	/* Advertise 1, Read-only firmware slot */
477 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
478 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
479 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
480 	cd->elpe = 0;	/* max error log page entries */
481 	cd->npss = 1;	/* number of power states support */
482 
483 	/* Warning Composite Temperature Threshold */
484 	cd->wctemp = 0x0157;
485 
486 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
487 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
488 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
489 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
490 	cd->nn = 1;	/* number of namespaces */
491 
492 	cd->oncs = 0;
493 	switch (sc->dataset_management) {
494 	case NVME_DATASET_MANAGEMENT_AUTO:
495 		if (sc->nvstore.deallocate)
496 			cd->oncs |= NVME_ONCS_DSM;
497 		break;
498 	case NVME_DATASET_MANAGEMENT_ENABLE:
499 		cd->oncs |= NVME_ONCS_DSM;
500 		break;
501 	default:
502 		break;
503 	}
504 
505 	cd->fna = 0x03;
506 
507 	cd->power_state[0].mp = 10;
508 }
509 
510 /*
511  * Calculate the CRC-16 of the given buffer
512  * See copyright attribution at top of file
513  */
514 static uint16_t
515 crc16(uint16_t crc, const void *buffer, unsigned int len)
516 {
517 	const unsigned char *cp = buffer;
518 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
519 	static uint16_t const crc16_table[256] = {
520 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
521 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
522 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
523 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
524 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
525 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
526 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
527 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
528 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
529 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
530 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
531 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
532 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
533 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
534 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
535 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
536 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
537 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
538 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
539 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
540 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
541 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
542 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
543 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
544 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
545 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
546 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
547 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
548 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
549 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
550 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
551 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
552 	};
553 
554 	while (len--)
555 		crc = (((crc >> 8) & 0xffU) ^
556 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
557 	return crc;
558 }
559 
560 static void
561 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
562     struct nvme_namespace_data *nd, uint32_t nsid,
563     struct pci_nvme_blockstore *nvstore)
564 {
565 
566 	/* Get capacity and block size information from backing store */
567 	nd->nsze = nvstore->size / nvstore->sectsz;
568 	nd->ncap = nd->nsze;
569 	nd->nuse = nd->nsze;
570 
571 	if (nvstore->type == NVME_STOR_BLOCKIF)
572 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
573 
574 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
575 	nd->flbas = 0;
576 
577 	/* Create an EUI-64 if user did not provide one */
578 	if (nvstore->eui64 == 0) {
579 		char *data = NULL;
580 		uint64_t eui64 = nvstore->eui64;
581 
582 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
583 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
584 		    sc->nsc_pi->pi_func);
585 
586 		if (data != NULL) {
587 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
588 			free(data);
589 		}
590 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
591 	}
592 	be64enc(nd->eui64, nvstore->eui64);
593 
594 	/* LBA data-sz = 2^lbads */
595 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
596 }
597 
598 static void
599 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
600 {
601 
602 	memset(&sc->err_log, 0, sizeof(sc->err_log));
603 	memset(&sc->health_log, 0, sizeof(sc->health_log));
604 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
605 
606 	/* Set read/write remainder to round up according to spec */
607 	sc->read_dunits_remainder = 999;
608 	sc->write_dunits_remainder = 999;
609 
610 	/* Set nominal Health values checked by implementations */
611 	sc->health_log.temperature = 310;
612 	sc->health_log.available_spare = 100;
613 	sc->health_log.available_spare_threshold = 10;
614 }
615 
616 static void
617 pci_nvme_init_features(struct pci_nvme_softc *sc)
618 {
619 
620 	sc->feat[0].set = nvme_feature_invalid_cb;
621 	sc->feat[0].get = nvme_feature_invalid_cb;
622 
623 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
624 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
625 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
626 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
627 	    nvme_feature_iv_config;
628 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
629 	    nvme_feature_invalid_cb;
630 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
631 	    nvme_feature_invalid_cb;
632 }
633 
634 static void
635 pci_nvme_aer_init(struct pci_nvme_softc *sc)
636 {
637 
638 	STAILQ_INIT(&sc->aer_list);
639 	sc->aer_count = 0;
640 }
641 
642 static void
643 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
644 {
645 	struct pci_nvme_aer *aer = NULL;
646 
647 	while (!STAILQ_EMPTY(&sc->aer_list)) {
648 		aer = STAILQ_FIRST(&sc->aer_list);
649 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
650 		free(aer);
651 	}
652 
653 	pci_nvme_aer_init(sc);
654 }
655 
656 static bool
657 pci_nvme_aer_available(struct pci_nvme_softc *sc)
658 {
659 
660 	return (!STAILQ_EMPTY(&sc->aer_list));
661 }
662 
663 static bool
664 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
665 {
666 	struct nvme_controller_data *cd = &sc->ctrldata;
667 
668 	/* AERL is a zero-based value while aer_count is one-based */
669 	return (sc->aer_count == (cd->aerl + 1));
670 }
671 
672 /*
673  * Add an Async Event Request
674  *
675  * Stores an AER to be returned later if the Controller needs to notify the
676  * host of an event.
677  * Note that while the NVMe spec doesn't require Controllers to return AER's
678  * in order, this implementation does preserve the order.
679  */
680 static int
681 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
682 {
683 	struct pci_nvme_aer *aer = NULL;
684 
685 	if (pci_nvme_aer_limit_reached(sc))
686 		return (-1);
687 
688 	aer = calloc(1, sizeof(struct pci_nvme_aer));
689 	if (aer == NULL)
690 		return (-1);
691 
692 	sc->aer_count++;
693 
694 	/* Save the Command ID for use in the completion message */
695 	aer->cid = cid;
696 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
697 
698 	return (0);
699 }
700 
701 /*
702  * Get an Async Event Request structure
703  *
704  * Returns a pointer to an AER previously submitted by the host or NULL if
705  * no AER's exist. Caller is responsible for freeing the returned struct.
706  */
707 static struct pci_nvme_aer *
708 pci_nvme_aer_get(struct pci_nvme_softc *sc)
709 {
710 	struct pci_nvme_aer *aer = NULL;
711 
712 	aer = STAILQ_FIRST(&sc->aer_list);
713 	if (aer != NULL) {
714 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
715 		sc->aer_count--;
716 	}
717 
718 	return (aer);
719 }
720 
721 static void
722 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
723 {
724 	uint32_t i;
725 
726 	DPRINTF("%s", __func__);
727 
728 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
729 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
730 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
731 
732 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
733 
734 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
735 
736 	sc->regs.cc = 0;
737 	sc->regs.csts = 0;
738 
739 	assert(sc->submit_queues != NULL);
740 
741 	for (i = 0; i < sc->num_squeues + 1; i++) {
742 		sc->submit_queues[i].qbase = NULL;
743 		sc->submit_queues[i].size = 0;
744 		sc->submit_queues[i].cqid = 0;
745 		sc->submit_queues[i].tail = 0;
746 		sc->submit_queues[i].head = 0;
747 	}
748 
749 	assert(sc->compl_queues != NULL);
750 
751 	for (i = 0; i < sc->num_cqueues + 1; i++) {
752 		sc->compl_queues[i].qbase = NULL;
753 		sc->compl_queues[i].size = 0;
754 		sc->compl_queues[i].tail = 0;
755 		sc->compl_queues[i].head = 0;
756 	}
757 
758 	sc->num_q_is_set = false;
759 
760 	pci_nvme_aer_destroy(sc);
761 }
762 
763 static void
764 pci_nvme_reset(struct pci_nvme_softc *sc)
765 {
766 	pthread_mutex_lock(&sc->mtx);
767 	pci_nvme_reset_locked(sc);
768 	pthread_mutex_unlock(&sc->mtx);
769 }
770 
771 static void
772 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
773 {
774 	uint16_t acqs, asqs;
775 
776 	DPRINTF("%s", __func__);
777 
778 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
779 	sc->submit_queues[0].size = asqs;
780 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
781 	            sizeof(struct nvme_command) * asqs);
782 
783 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
784 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
785 
786 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
787 	    NVME_AQA_REG_ACQS_MASK) + 1;
788 	sc->compl_queues[0].size = acqs;
789 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
790 	         sizeof(struct nvme_completion) * acqs);
791 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
792 
793 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
794 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
795 }
796 
797 static int
798 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
799 	size_t len, enum nvme_copy_dir dir)
800 {
801 	uint8_t *p;
802 	size_t bytes;
803 
804 	if (len > (8 * 1024)) {
805 		return (-1);
806 	}
807 
808 	/* Copy from the start of prp1 to the end of the physical page */
809 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
810 	bytes = MIN(bytes, len);
811 
812 	p = vm_map_gpa(ctx, prp1, bytes);
813 	if (p == NULL) {
814 		return (-1);
815 	}
816 
817 	if (dir == NVME_COPY_TO_PRP)
818 		memcpy(p, b, bytes);
819 	else
820 		memcpy(b, p, bytes);
821 
822 	b += bytes;
823 
824 	len -= bytes;
825 	if (len == 0) {
826 		return (0);
827 	}
828 
829 	len = MIN(len, PAGE_SIZE);
830 
831 	p = vm_map_gpa(ctx, prp2, len);
832 	if (p == NULL) {
833 		return (-1);
834 	}
835 
836 	if (dir == NVME_COPY_TO_PRP)
837 		memcpy(p, b, len);
838 	else
839 		memcpy(b, p, len);
840 
841 	return (0);
842 }
843 
844 /*
845  * Write a Completion Queue Entry update
846  *
847  * Write the completion and update the doorbell value
848  */
849 static void
850 pci_nvme_cq_update(struct pci_nvme_softc *sc,
851 		struct nvme_completion_queue *cq,
852 		uint32_t cdw0,
853 		uint16_t cid,
854 		uint16_t sqid,
855 		uint16_t status)
856 {
857 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
858 	struct nvme_completion *cqe;
859 
860 	assert(cq->qbase != NULL);
861 
862 	pthread_mutex_lock(&cq->mtx);
863 
864 	cqe = &cq->qbase[cq->tail];
865 
866 	/* Flip the phase bit */
867 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
868 
869 	cqe->cdw0 = cdw0;
870 	cqe->sqhd = sq->head;
871 	cqe->sqid = sqid;
872 	cqe->cid = cid;
873 	cqe->status = status;
874 
875 	cq->tail++;
876 	if (cq->tail >= cq->size) {
877 		cq->tail = 0;
878 	}
879 
880 	pthread_mutex_unlock(&cq->mtx);
881 }
882 
883 static int
884 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
885 	struct nvme_completion* compl)
886 {
887 	uint16_t qid = command->cdw10 & 0xffff;
888 
889 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
890 	if (qid == 0 || qid > sc->num_squeues ||
891 	    (sc->submit_queues[qid].qbase == NULL)) {
892 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
893 		        __func__, qid, sc->num_squeues);
894 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
895 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
896 		return (1);
897 	}
898 
899 	sc->submit_queues[qid].qbase = NULL;
900 	sc->submit_queues[qid].cqid = 0;
901 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
902 	return (1);
903 }
904 
905 static int
906 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
907 	struct nvme_completion* compl)
908 {
909 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
910 		uint16_t qid = command->cdw10 & 0xffff;
911 		struct nvme_submission_queue *nsq;
912 
913 		if ((qid == 0) || (qid > sc->num_squeues) ||
914 		    (sc->submit_queues[qid].qbase != NULL)) {
915 			WPRINTF("%s queue index %u > num_squeues %u",
916 			        __func__, qid, sc->num_squeues);
917 			pci_nvme_status_tc(&compl->status,
918 			    NVME_SCT_COMMAND_SPECIFIC,
919 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
920 			return (1);
921 		}
922 
923 		nsq = &sc->submit_queues[qid];
924 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
925 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
926 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
927 			/*
928 			 * Queues must specify at least two entries
929 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
930 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
931 			 */
932 			pci_nvme_status_tc(&compl->status,
933 			    NVME_SCT_COMMAND_SPECIFIC,
934 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
935 			return (1);
936 		}
937 		nsq->head = nsq->tail = 0;
938 
939 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
940 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
941 			pci_nvme_status_tc(&compl->status,
942 			    NVME_SCT_COMMAND_SPECIFIC,
943 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
944 			return (1);
945 		}
946 
947 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
948 			pci_nvme_status_tc(&compl->status,
949 			    NVME_SCT_COMMAND_SPECIFIC,
950 			    NVME_SC_COMPLETION_QUEUE_INVALID);
951 			return (1);
952 		}
953 
954 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
955 
956 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
957 		              sizeof(struct nvme_command) * (size_t)nsq->size);
958 
959 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
960 		        qid, nsq->size, nsq->qbase, nsq->cqid);
961 
962 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
963 
964 		DPRINTF("%s completed creating IOSQ qid %u",
965 		         __func__, qid);
966 	} else {
967 		/*
968 		 * Guest sent a non-contiguous submission queue request.
969 		 * This setting is unsupported by this emulation.
970 		 */
971 		WPRINTF("%s unsupported non-contig (list-based) "
972 		         "create i/o submission queue", __func__);
973 
974 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
975 	}
976 	return (1);
977 }
978 
979 static int
980 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
981 	struct nvme_completion* compl)
982 {
983 	uint16_t qid = command->cdw10 & 0xffff;
984 	uint16_t sqid;
985 
986 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
987 	if (qid == 0 || qid > sc->num_cqueues ||
988 	    (sc->compl_queues[qid].qbase == NULL)) {
989 		WPRINTF("%s queue index %u / num_cqueues %u",
990 		        __func__, qid, sc->num_cqueues);
991 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
992 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
993 		return (1);
994 	}
995 
996 	/* Deleting an Active CQ is an error */
997 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
998 		if (sc->submit_queues[sqid].cqid == qid) {
999 			pci_nvme_status_tc(&compl->status,
1000 			    NVME_SCT_COMMAND_SPECIFIC,
1001 			    NVME_SC_INVALID_QUEUE_DELETION);
1002 			return (1);
1003 		}
1004 
1005 	sc->compl_queues[qid].qbase = NULL;
1006 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1007 	return (1);
1008 }
1009 
1010 static int
1011 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1012 	struct nvme_completion* compl)
1013 {
1014 	struct nvme_completion_queue *ncq;
1015 	uint16_t qid = command->cdw10 & 0xffff;
1016 
1017 	/* Only support Physically Contiguous queues */
1018 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1019 		WPRINTF("%s unsupported non-contig (list-based) "
1020 		         "create i/o completion queue",
1021 		         __func__);
1022 
1023 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1024 		return (1);
1025 	}
1026 
1027 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1028 	    (sc->compl_queues[qid].qbase != NULL)) {
1029 		WPRINTF("%s queue index %u > num_cqueues %u",
1030 			__func__, qid, sc->num_cqueues);
1031 		pci_nvme_status_tc(&compl->status,
1032 		    NVME_SCT_COMMAND_SPECIFIC,
1033 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1034 		return (1);
1035 	}
1036 
1037 	ncq = &sc->compl_queues[qid];
1038 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1039 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1040 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1041 		pci_nvme_status_tc(&compl->status,
1042 		    NVME_SCT_COMMAND_SPECIFIC,
1043 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1044 		return (1);
1045 	}
1046 
1047 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1048 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1049 		/*
1050 		 * Queues must specify at least two entries
1051 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1052 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1053 		 */
1054 		pci_nvme_status_tc(&compl->status,
1055 		    NVME_SCT_COMMAND_SPECIFIC,
1056 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1057 		return (1);
1058 	}
1059 	ncq->head = ncq->tail = 0;
1060 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1061 		     command->prp1,
1062 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1063 
1064 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1065 
1066 
1067 	return (1);
1068 }
1069 
1070 static int
1071 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1072 	struct nvme_completion* compl)
1073 {
1074 	uint32_t logsize;
1075 	uint8_t logpage = command->cdw10 & 0xFF;
1076 
1077 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1078 
1079 	/*
1080 	 * Command specifies the number of dwords to return in fields NUMDU
1081 	 * and NUMDL. This is a zero-based value.
1082 	 */
1083 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1084 	logsize *= sizeof(uint32_t);
1085 
1086 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1087 
1088 	switch (logpage) {
1089 	case NVME_LOG_ERROR:
1090 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1091 		    command->prp2, (uint8_t *)&sc->err_log,
1092 		    MIN(logsize, sizeof(sc->err_log)),
1093 		    NVME_COPY_TO_PRP);
1094 		break;
1095 	case NVME_LOG_HEALTH_INFORMATION:
1096 		pthread_mutex_lock(&sc->mtx);
1097 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1098 		    sizeof(sc->health_log.data_units_read));
1099 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1100 		    sizeof(sc->health_log.data_units_written));
1101 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1102 		    sizeof(sc->health_log.host_read_commands));
1103 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1104 		    sizeof(sc->health_log.host_write_commands));
1105 		pthread_mutex_unlock(&sc->mtx);
1106 
1107 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1108 		    command->prp2, (uint8_t *)&sc->health_log,
1109 		    MIN(logsize, sizeof(sc->health_log)),
1110 		    NVME_COPY_TO_PRP);
1111 		break;
1112 	case NVME_LOG_FIRMWARE_SLOT:
1113 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1114 		    command->prp2, (uint8_t *)&sc->fw_log,
1115 		    MIN(logsize, sizeof(sc->fw_log)),
1116 		    NVME_COPY_TO_PRP);
1117 		break;
1118 	default:
1119 		DPRINTF("%s get log page %x command not supported",
1120 		        __func__, logpage);
1121 
1122 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1123 		    NVME_SC_INVALID_LOG_PAGE);
1124 	}
1125 
1126 	return (1);
1127 }
1128 
1129 static int
1130 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1131 	struct nvme_completion* compl)
1132 {
1133 	void *dest;
1134 	uint16_t status;
1135 
1136 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1137 	        command->cdw10 & 0xFF, command->nsid);
1138 
1139 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1140 
1141 	switch (command->cdw10 & 0xFF) {
1142 	case 0x00: /* return Identify Namespace data structure */
1143 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1144 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1145 		    NVME_COPY_TO_PRP);
1146 		break;
1147 	case 0x01: /* return Identify Controller data structure */
1148 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1149 		    command->prp2, (uint8_t *)&sc->ctrldata,
1150 		    sizeof(sc->ctrldata),
1151 		    NVME_COPY_TO_PRP);
1152 		break;
1153 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1154 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1155 		                  sizeof(uint32_t) * 1024);
1156 		/* All unused entries shall be zero */
1157 		bzero(dest, sizeof(uint32_t) * 1024);
1158 		((uint32_t *)dest)[0] = 1;
1159 		break;
1160 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1161 		if (command->nsid != 1) {
1162 			pci_nvme_status_genc(&status,
1163 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1164 			break;
1165 		}
1166 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1167 		                  sizeof(uint32_t) * 1024);
1168 		/* All bytes after the descriptor shall be zero */
1169 		bzero(dest, sizeof(uint32_t) * 1024);
1170 
1171 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1172 		((uint8_t *)dest)[0] = 1;
1173 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1174 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1175 		break;
1176 	default:
1177 		DPRINTF("%s unsupported identify command requested 0x%x",
1178 		         __func__, command->cdw10 & 0xFF);
1179 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1180 		break;
1181 	}
1182 
1183 	compl->status = status;
1184 	return (1);
1185 }
1186 
1187 static const char *
1188 nvme_fid_to_name(uint8_t fid)
1189 {
1190 	const char *name;
1191 
1192 	switch (fid) {
1193 	case NVME_FEAT_ARBITRATION:
1194 		name = "Arbitration";
1195 		break;
1196 	case NVME_FEAT_POWER_MANAGEMENT:
1197 		name = "Power Management";
1198 		break;
1199 	case NVME_FEAT_LBA_RANGE_TYPE:
1200 		name = "LBA Range Type";
1201 		break;
1202 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1203 		name = "Temperature Threshold";
1204 		break;
1205 	case NVME_FEAT_ERROR_RECOVERY:
1206 		name = "Error Recovery";
1207 		break;
1208 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1209 		name = "Volatile Write Cache";
1210 		break;
1211 	case NVME_FEAT_NUMBER_OF_QUEUES:
1212 		name = "Number of Queues";
1213 		break;
1214 	case NVME_FEAT_INTERRUPT_COALESCING:
1215 		name = "Interrupt Coalescing";
1216 		break;
1217 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1218 		name = "Interrupt Vector Configuration";
1219 		break;
1220 	case NVME_FEAT_WRITE_ATOMICITY:
1221 		name = "Write Atomicity Normal";
1222 		break;
1223 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1224 		name = "Asynchronous Event Configuration";
1225 		break;
1226 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1227 		name = "Autonomous Power State Transition";
1228 		break;
1229 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1230 		name = "Host Memory Buffer";
1231 		break;
1232 	case NVME_FEAT_TIMESTAMP:
1233 		name = "Timestamp";
1234 		break;
1235 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1236 		name = "Keep Alive Timer";
1237 		break;
1238 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1239 		name = "Host Controlled Thermal Management";
1240 		break;
1241 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1242 		name = "Non-Operation Power State Config";
1243 		break;
1244 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1245 		name = "Read Recovery Level Config";
1246 		break;
1247 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1248 		name = "Predictable Latency Mode Config";
1249 		break;
1250 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1251 		name = "Predictable Latency Mode Window";
1252 		break;
1253 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1254 		name = "LBA Status Information Report Interval";
1255 		break;
1256 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1257 		name = "Host Behavior Support";
1258 		break;
1259 	case NVME_FEAT_SANITIZE_CONFIG:
1260 		name = "Sanitize Config";
1261 		break;
1262 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1263 		name = "Endurance Group Event Configuration";
1264 		break;
1265 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1266 		name = "Software Progress Marker";
1267 		break;
1268 	case NVME_FEAT_HOST_IDENTIFIER:
1269 		name = "Host Identifier";
1270 		break;
1271 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1272 		name = "Reservation Notification Mask";
1273 		break;
1274 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1275 		name = "Reservation Persistence";
1276 		break;
1277 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1278 		name = "Namespace Write Protection Config";
1279 		break;
1280 	default:
1281 		name = "Unknown";
1282 		break;
1283 	}
1284 
1285 	return (name);
1286 }
1287 
1288 static void
1289 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1290     struct nvme_feature_obj *feat,
1291     struct nvme_command *command,
1292     struct nvme_completion *compl)
1293 {
1294 
1295 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1296 }
1297 
1298 static void
1299 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1300     struct nvme_feature_obj *feat,
1301     struct nvme_command *command,
1302     struct nvme_completion *compl)
1303 {
1304 	uint32_t i;
1305 	uint32_t cdw11 = command->cdw11;
1306 	uint16_t iv;
1307 	bool cd;
1308 
1309 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1310 
1311 	iv = cdw11 & 0xffff;
1312 	cd = cdw11 & (1 << 16);
1313 
1314 	if (iv > (sc->max_queues + 1)) {
1315 		return;
1316 	}
1317 
1318 	/* Interrupt Coalescing (i.e. CD clear) may not be enabled for the Admin Q */
1319 	if ((iv == 0) && !cd)
1320 		return;
1321 
1322 	/* Requested Interrupt Vector must be used by a CQ */
1323 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1324 		if (sc->compl_queues[i].intr_vec == iv) {
1325 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1326 		}
1327 	}
1328 
1329 }
1330 
1331 static void
1332 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1333     struct nvme_feature_obj *feat,
1334     struct nvme_command *command,
1335     struct nvme_completion *compl)
1336 {
1337 	uint16_t nqr;	/* Number of Queues Requested */
1338 
1339 	if (sc->num_q_is_set) {
1340 		WPRINTF("%s: Number of Queues already set", __func__);
1341 		pci_nvme_status_genc(&compl->status,
1342 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1343 		return;
1344 	}
1345 
1346 	nqr = command->cdw11 & 0xFFFF;
1347 	if (nqr == 0xffff) {
1348 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1349 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1350 		return;
1351 	}
1352 
1353 	sc->num_squeues = ONE_BASED(nqr);
1354 	if (sc->num_squeues > sc->max_queues) {
1355 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1356 					sc->max_queues);
1357 		sc->num_squeues = sc->max_queues;
1358 	}
1359 
1360 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1361 	if (nqr == 0xffff) {
1362 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1363 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1364 		return;
1365 	}
1366 
1367 	sc->num_cqueues = ONE_BASED(nqr);
1368 	if (sc->num_cqueues > sc->max_queues) {
1369 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1370 					sc->max_queues);
1371 		sc->num_cqueues = sc->max_queues;
1372 	}
1373 
1374 	/* Patch the command value which will be saved on callback's return */
1375 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1376 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1377 
1378 	sc->num_q_is_set = true;
1379 }
1380 
1381 static int
1382 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1383 	struct nvme_completion *compl)
1384 {
1385 	struct nvme_feature_obj *feat;
1386 	uint32_t nsid = command->nsid;
1387 	uint8_t fid = command->cdw10 & 0xFF;
1388 
1389 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1390 
1391 	if (fid >= NVME_FID_MAX) {
1392 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1393 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1394 		return (1);
1395 	}
1396 	feat = &sc->feat[fid];
1397 
1398 	if (!feat->namespace_specific &&
1399 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1400 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1401 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1402 		return (1);
1403 	}
1404 
1405 	compl->cdw0 = 0;
1406 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1407 
1408 	if (feat->set)
1409 		feat->set(sc, feat, command, compl);
1410 
1411 	if (compl->status == NVME_SC_SUCCESS)
1412 		feat->cdw11 = command->cdw11;
1413 
1414 	return (0);
1415 }
1416 
1417 static int
1418 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1419 	struct nvme_completion* compl)
1420 {
1421 	struct nvme_feature_obj *feat;
1422 	uint8_t fid = command->cdw10 & 0xFF;
1423 
1424 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1425 
1426 	if (fid >= NVME_FID_MAX) {
1427 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1428 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1429 		return (1);
1430 	}
1431 
1432 	compl->cdw0 = 0;
1433 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1434 
1435 	feat = &sc->feat[fid];
1436 	if (feat->get) {
1437 		feat->get(sc, feat, command, compl);
1438 	}
1439 
1440 	if (compl->status == NVME_SC_SUCCESS) {
1441 		compl->cdw0 = feat->cdw11;
1442 	}
1443 
1444 	return (0);
1445 }
1446 
1447 static int
1448 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1449 	struct nvme_completion* compl)
1450 {
1451 	uint8_t	ses, lbaf, pi;
1452 
1453 	/* Only supports Secure Erase Setting - User Data Erase */
1454 	ses = (command->cdw10 >> 9) & 0x7;
1455 	if (ses > 0x1) {
1456 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1457 		return (1);
1458 	}
1459 
1460 	/* Only supports a single LBA Format */
1461 	lbaf = command->cdw10 & 0xf;
1462 	if (lbaf != 0) {
1463 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1464 		    NVME_SC_INVALID_FORMAT);
1465 		return (1);
1466 	}
1467 
1468 	/* Doesn't support Protection Information */
1469 	pi = (command->cdw10 >> 5) & 0x7;
1470 	if (pi != 0) {
1471 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1472 		return (1);
1473 	}
1474 
1475 	if (sc->nvstore.type == NVME_STOR_RAM) {
1476 		if (sc->nvstore.ctx)
1477 			free(sc->nvstore.ctx);
1478 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1479 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1480 	} else {
1481 		struct pci_nvme_ioreq *req;
1482 		int err;
1483 
1484 		req = pci_nvme_get_ioreq(sc);
1485 		if (req == NULL) {
1486 			pci_nvme_status_genc(&compl->status,
1487 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1488 			WPRINTF("%s: unable to allocate IO req", __func__);
1489 			return (1);
1490 		}
1491 		req->nvme_sq = &sc->submit_queues[0];
1492 		req->sqid = 0;
1493 		req->opc = command->opc;
1494 		req->cid = command->cid;
1495 		req->nsid = command->nsid;
1496 
1497 		req->io_req.br_offset = 0;
1498 		req->io_req.br_resid = sc->nvstore.size;
1499 		req->io_req.br_callback = pci_nvme_io_done;
1500 
1501 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1502 		if (err) {
1503 			pci_nvme_status_genc(&compl->status,
1504 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1505 			pci_nvme_release_ioreq(sc, req);
1506 		}
1507 	}
1508 
1509 	return (1);
1510 }
1511 
1512 static int
1513 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1514 	struct nvme_completion* compl)
1515 {
1516 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1517 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1518 
1519 	/* TODO: search for the command ID and abort it */
1520 
1521 	compl->cdw0 = 1;
1522 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1523 	return (1);
1524 }
1525 
1526 static int
1527 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1528 	struct nvme_command* command, struct nvme_completion* compl)
1529 {
1530 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1531 
1532 	/* Don't exceed the Async Event Request Limit (AERL). */
1533 	if (pci_nvme_aer_limit_reached(sc)) {
1534 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1535 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1536 		return (1);
1537 	}
1538 
1539 	if (pci_nvme_aer_add(sc, command->cid)) {
1540 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1541 				NVME_SC_INTERNAL_DEVICE_ERROR);
1542 		return (1);
1543 	}
1544 
1545 	/*
1546 	 * Events are raised asynchronously, as configured by the Set Features
1547 	 * command. Leave the completion pending (NVME_NO_STATUS); it is posted
1548 	 * later when an event matching the request occurs.
1549 	 */
1550 	compl->status = NVME_NO_STATUS;
1551 
1552 	return (0);
1553 }
1554 
1555 static void
1556 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1557 {
1558 	struct nvme_completion compl;
1559 	struct nvme_command *cmd;
1560 	struct nvme_submission_queue *sq;
1561 	struct nvme_completion_queue *cq;
1562 	uint16_t sqhead;
1563 
1564 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1565 
1566 	sq = &sc->submit_queues[0];
1567 	cq = &sc->compl_queues[0];
1568 
1569 	pthread_mutex_lock(&sq->mtx);
1570 
1571 	sqhead = sq->head;
1572 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1573 
1574 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1575 		cmd = &(sq->qbase)[sqhead];
1576 		compl.cdw0 = 0;
1577 		compl.status = 0;
1578 
1579 		switch (cmd->opc) {
1580 		case NVME_OPC_DELETE_IO_SQ:
1581 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1582 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1583 			break;
1584 		case NVME_OPC_CREATE_IO_SQ:
1585 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1586 			nvme_opc_create_io_sq(sc, cmd, &compl);
1587 			break;
1588 		case NVME_OPC_DELETE_IO_CQ:
1589 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1590 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1591 			break;
1592 		case NVME_OPC_CREATE_IO_CQ:
1593 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1594 			nvme_opc_create_io_cq(sc, cmd, &compl);
1595 			break;
1596 		case NVME_OPC_GET_LOG_PAGE:
1597 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1598 			nvme_opc_get_log_page(sc, cmd, &compl);
1599 			break;
1600 		case NVME_OPC_IDENTIFY:
1601 			DPRINTF("%s command IDENTIFY", __func__);
1602 			nvme_opc_identify(sc, cmd, &compl);
1603 			break;
1604 		case NVME_OPC_ABORT:
1605 			DPRINTF("%s command ABORT", __func__);
1606 			nvme_opc_abort(sc, cmd, &compl);
1607 			break;
1608 		case NVME_OPC_SET_FEATURES:
1609 			DPRINTF("%s command SET_FEATURES", __func__);
1610 			nvme_opc_set_features(sc, cmd, &compl);
1611 			break;
1612 		case NVME_OPC_GET_FEATURES:
1613 			DPRINTF("%s command GET_FEATURES", __func__);
1614 			nvme_opc_get_features(sc, cmd, &compl);
1615 			break;
1616 		case NVME_OPC_FIRMWARE_ACTIVATE:
1617 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1618 			pci_nvme_status_tc(&compl.status,
1619 			    NVME_SCT_COMMAND_SPECIFIC,
1620 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1621 			break;
1622 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1623 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1624 			nvme_opc_async_event_req(sc, cmd, &compl);
1625 			break;
1626 		case NVME_OPC_FORMAT_NVM:
1627 			DPRINTF("%s command FORMAT_NVM", __func__);
1628 			if ((sc->ctrldata.oacs &
1629 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1630 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1631 				break;
1632 			}
1633 			compl.status = NVME_NO_STATUS;
1634 			nvme_opc_format_nvm(sc, cmd, &compl);
1634 			break;
1635 		default:
1636 			DPRINTF("0x%x command is not implemented",
1637 			    cmd->opc);
1638 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1639 		}
1640 		sqhead = (sqhead + 1) % sq->size;
1641 
1642 		if (NVME_COMPLETION_VALID(compl)) {
1643 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1644 			    compl.cdw0,
1645 			    cmd->cid,
1646 			    0,		/* SQID */
1647 			    compl.status);
1648 		}
1649 	}
1650 
1651 	DPRINTF("setting sqhead %u", sqhead);
1652 	sq->head = sqhead;
1653 
1654 	if (cq->head != cq->tail)
1655 		pci_generate_msix(sc->nsc_pi, 0);
1656 
1657 	pthread_mutex_unlock(&sq->mtx);
1658 }
1659 
1660 /*
1661  * Update the Write and Read statistics reported in SMART data
1662  *
1663  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1664  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1665  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1666  */
1667 static void
1668 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1669     size_t bytes, uint16_t status)
1670 {
1671 
1672 	pthread_mutex_lock(&sc->mtx);
1673 	switch (opc) {
1674 	case NVME_OPC_WRITE:
1675 		sc->write_commands++;
1676 		if (status != NVME_SC_SUCCESS)
1677 			break;
1678 		sc->write_dunits_remainder += (bytes / 512);
1679 		while (sc->write_dunits_remainder >= 1000) {
1680 			sc->write_data_units++;
1681 			sc->write_dunits_remainder -= 1000;
1682 		}
1683 		break;
1684 	case NVME_OPC_READ:
1685 		sc->read_commands++;
1686 		if (status != NVME_SC_SUCCESS)
1687 			break;
1688 		sc->read_dunits_remainder += (bytes / 512);
1689 		while (sc->read_dunits_remainder >= 1000) {
1690 			sc->read_data_units++;
1691 			sc->read_dunits_remainder -= 1000;
1692 		}
1693 		break;
1694 	default:
1695 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1696 		break;
1697 	}
1698 	pthread_mutex_unlock(&sc->mtx);
1699 }
1700 
1701 /*
1702  * Check if the combination of Starting LBA (slba) and Number of Logical
1703  * Blocks (nlb) exceeds the range of the underlying storage.
1704  *
1705  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1706  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1707  * overflow.
1708  */
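/*
 * Illustrative example, assuming 512 byte sectors (sectsz_bits == 9):
 * slba << 9 overflows a uint64_t exactly when slba >= 2^55, which is what
 * the (slba >> (64 - 9)) != 0 check below detects.
 */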
1709 static bool
1710 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1711     uint32_t nlb)
1712 {
1713 	size_t	offset, bytes;
1714 
1715 	/* Overflow check of multiplying Starting LBA by the sector size */
1716 	if (slba >> (64 - nvstore->sectsz_bits))
1717 		return (true);
1718 
1719 	offset = slba << nvstore->sectsz_bits;
1720 	bytes = nlb << nvstore->sectsz_bits;
1721 
1722 	/* Overflow check of Number of Logical Blocks */
1723 	if ((nvstore->size - offset) < bytes)
1724 		return (true);
1725 
1726 	return (false);
1727 }
1728 
1729 static int
1730 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1731 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1732 {
1733 	int iovidx;
1734 
1735 	if (req == NULL)
1736 		return (-1);
1737 
1738 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1739 		return (-1);
1740 	}
1741 
1742 	/* concatenate contig block-iovs to minimize number of iovs */
1743 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1744 		iovidx = req->io_req.br_iovcnt - 1;
1745 
1746 		req->io_req.br_iov[iovidx].iov_base =
1747 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1748 				     req->prev_gpaddr, size);
1749 
1750 		req->prev_size += size;
1751 		req->io_req.br_resid += size;
1752 
1753 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1754 	} else {
1755 		iovidx = req->io_req.br_iovcnt;
1756 		if (iovidx == 0) {
1757 			req->io_req.br_offset = lba;
1758 			req->io_req.br_resid = 0;
1759 			req->io_req.br_param = req;
1760 		}
1761 
1762 		req->io_req.br_iov[iovidx].iov_base =
1763 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1764 				     gpaddr, size);
1765 
1766 		req->io_req.br_iov[iovidx].iov_len = size;
1767 
1768 		req->prev_gpaddr = gpaddr;
1769 		req->prev_size = size;
1770 		req->io_req.br_resid += size;
1771 
1772 		req->io_req.br_iovcnt++;
1773 	}
1774 
1775 	return (0);
1776 }
1777 
1778 static void
1779 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1780 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1781 	uint32_t cdw0, uint16_t status)
1782 {
1783 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1784 
1785 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1786 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1787 		 NVME_STATUS_GET_SC(status));
1788 
1789 	pci_nvme_cq_update(sc, cq,
1790 	    cdw0,
1791 	    cid,
1792 	    sqid,
1793 	    status);
1794 
1795 	if (cq->head != cq->tail) {
1796 		if (cq->intr_en & NVME_CQ_INTEN) {
1797 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1798 		} else {
1799 			DPRINTF("%s: CQ%u interrupt disabled",
1800 						__func__, sq->cqid);
1801 		}
1802 	}
1803 }
1804 
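/*
 * Return an ioreq to the free list and post the I/O semaphore so a waiter
 * in pci_nvme_get_ioreq() can proceed.
 */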
1805 static void
1806 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1807 {
1808 	req->sc = NULL;
1809 	req->nvme_sq = NULL;
1810 	req->sqid = 0;
1811 
1812 	pthread_mutex_lock(&sc->mtx);
1813 
1814 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1815 	sc->pending_ios--;
1816 
1817 	/* no more I/O pending: set ready if device was enabled during reset */
1818 	if (sc->pending_ios == 0 &&
1819 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1820 		sc->regs.csts |= NVME_CSTS_RDY;
1821 
1822 	pthread_mutex_unlock(&sc->mtx);
1823 
1824 	sem_post(&sc->iosemlock);
1825 }
1826 
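/*
 * Allocate an ioreq from the free list. The iosemlock semaphore is
 * initialized to the number of ioslots, so this blocks until a slot
 * becomes available rather than failing.
 */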
1827 static struct pci_nvme_ioreq *
1828 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1829 {
1830 	struct pci_nvme_ioreq *req = NULL;
1831 
1832 	sem_wait(&sc->iosemlock);
1833 	pthread_mutex_lock(&sc->mtx);
1834 
1835 	req = STAILQ_FIRST(&sc->ioreqs_free);
1836 	assert(req != NULL);
1837 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1838 
1839 	req->sc = sc;
1840 
1841 	sc->pending_ios++;
1842 
1843 	pthread_mutex_unlock(&sc->mtx);
1844 
1845 	req->io_req.br_iovcnt = 0;
1846 	req->io_req.br_offset = 0;
1847 	req->io_req.br_resid = 0;
1848 	req->io_req.br_param = req;
1849 	req->prev_gpaddr = 0;
1850 	req->prev_size = 0;
1851 
1852 	return req;
1853 }
1854 
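/*
 * blockif completion callback for Read, Write, and Flush requests:
 * translate the errno into an NVMe status, post the completion, update the
 * SMART counters, and release the ioreq.
 */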
1855 static void
1856 pci_nvme_io_done(struct blockif_req *br, int err)
1857 {
1858 	struct pci_nvme_ioreq *req = br->br_param;
1859 	struct nvme_submission_queue *sq = req->nvme_sq;
1860 	uint16_t code, status;
1861 
1862 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1863 
1864 	/* TODO return correct error */
1865 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1866 	pci_nvme_status_genc(&status, code);
1867 
1868 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1869 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1870 	    req->bytes, status);
1871 	pci_nvme_release_ioreq(req->sc, req);
1872 }
1873 
1874 /*
1875  * Implements the Flush command. The specification states:
1876  *    If a volatile write cache is not present, Flush commands complete
1877  *    successfully and have no effect
1878  * in the description of the Volatile Write Cache (VWC) field of the Identify
1879  * Controller data. Therefore, set status to Success if the command is
1880  * not supported (i.e. RAM-backed storage, or blockif reports EOPNOTSUPP).
1881  */
1882 static bool
1883 nvme_opc_flush(struct pci_nvme_softc *sc,
1884     struct nvme_command *cmd,
1885     struct pci_nvme_blockstore *nvstore,
1886     struct pci_nvme_ioreq *req,
1887     uint16_t *status)
1888 {
1889 	bool pending = false;
1890 
1891 	if (nvstore->type == NVME_STOR_RAM) {
1892 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1893 	} else {
1894 		int err;
1895 
1896 		req->io_req.br_callback = pci_nvme_io_done;
1897 
1898 		err = blockif_flush(nvstore->ctx, &req->io_req);
1899 		switch (err) {
1900 		case 0:
1901 			pending = true;
1902 			break;
1903 		case EOPNOTSUPP:
1904 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1905 			break;
1906 		default:
1907 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1908 		}
1909 	}
1910 
1911 	return (pending);
1912 }
1913 
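/*
 * Service a Read or Write against RAM-backed storage by copying directly
 * between the guest's PRP entries and the backing buffer.
 */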
1914 static uint16_t
1915 nvme_write_read_ram(struct pci_nvme_softc *sc,
1916     struct pci_nvme_blockstore *nvstore,
1917     uint64_t prp1, uint64_t prp2,
1918     size_t offset, uint64_t bytes,
1919     bool is_write)
1920 {
1921 	uint8_t *buf = nvstore->ctx;
1922 	enum nvme_copy_dir dir;
1923 	uint16_t status;
1924 
1925 	if (is_write)
1926 		dir = NVME_COPY_TO_PRP;
1927 	else
1928 		dir = NVME_COPY_FROM_PRP;
1929 
1930 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1931 	    buf + offset, bytes, dir))
1932 		pci_nvme_status_genc(&status,
1933 		    NVME_SC_DATA_TRANSFER_ERROR);
1934 	else
1935 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1936 
1937 	return (status);
1938 }
1939 
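/*
 * Build a blockif request from the command's PRP entries. Per the NVMe
 * specification, PRP1 points to the first (possibly page-offset) region of
 * the transfer, and PRP2 is either the second page or, when the data spans
 * more than two pages, the address of a PRP list.
 */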
1940 static uint16_t
1941 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1942     struct pci_nvme_blockstore *nvstore,
1943     struct pci_nvme_ioreq *req,
1944     uint64_t prp1, uint64_t prp2,
1945     size_t offset, uint64_t bytes,
1946     bool is_write)
1947 {
1948 	uint64_t size;
1949 	int err;
1950 	uint16_t status = NVME_NO_STATUS;
1951 
1952 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1953 	if (pci_nvme_append_iov_req(sc, req, prp1,
1954 	    size, is_write, offset)) {
1955 		pci_nvme_status_genc(&status,
1956 		    NVME_SC_DATA_TRANSFER_ERROR);
1957 		goto out;
1958 	}
1959 
1960 	offset += size;
1961 	bytes  -= size;
1962 
1963 	if (bytes == 0) {
1964 		;
1965 	} else if (bytes <= PAGE_SIZE) {
1966 		size = bytes;
1967 		if (pci_nvme_append_iov_req(sc, req, prp2,
1968 		    size, is_write, offset)) {
1969 			pci_nvme_status_genc(&status,
1970 			    NVME_SC_DATA_TRANSFER_ERROR);
1971 			goto out;
1972 		}
1973 	} else {
1974 		void *vmctx = sc->nsc_pi->pi_vmctx;
1975 		uint64_t *prp_list = &prp2;
1976 		uint64_t *last = prp_list;
1977 
1978 		/* PRP2 is pointer to a physical region page list */
1979 		while (bytes) {
1980 			/* Last entry in list points to the next list */
1981 			if (prp_list == last) {
1982 				uint64_t prp = *prp_list;
1983 
1984 				prp_list = paddr_guest2host(vmctx, prp,
1985 				    PAGE_SIZE - (prp % PAGE_SIZE));
1986 				last = prp_list + (NVME_PRP2_ITEMS - 1);
1987 			}
1988 
1989 			size = MIN(bytes, PAGE_SIZE);
1990 
1991 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
1992 			    size, is_write, offset)) {
1993 				pci_nvme_status_genc(&status,
1994 				    NVME_SC_DATA_TRANSFER_ERROR);
1995 				goto out;
1996 			}
1997 
1998 			offset += size;
1999 			bytes  -= size;
2000 
2001 			prp_list++;
2002 		}
2003 	}
2004 	req->io_req.br_callback = pci_nvme_io_done;
2005 	if (is_write)
2006 		err = blockif_write(nvstore->ctx, &req->io_req);
2007 	else
2008 		err = blockif_read(nvstore->ctx, &req->io_req);
2009 
2010 	if (err)
2011 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2012 out:
2013 	return (status);
2014 }
2015 
2016 static bool
2017 nvme_opc_write_read(struct pci_nvme_softc *sc,
2018     struct nvme_command *cmd,
2019     struct pci_nvme_blockstore *nvstore,
2020     struct pci_nvme_ioreq *req,
2021     uint16_t *status)
2022 {
2023 	uint64_t lba, nblocks, bytes;
2024 	size_t offset;
2025 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2026 	bool pending = false;
2027 
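	/*
	 * For Read/Write, CDW11:CDW10 hold the Starting LBA and CDW12 bits
	 * 15:0 hold the 0's based Number of Logical Blocks.
	 */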
2028 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2029 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2030 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2031 		WPRINTF("%s command would exceed LBA range", __func__);
2032 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2033 		goto out;
2034 	}
2035 
2036 	bytes  = nblocks << nvstore->sectsz_bits;
2037 	if (bytes > NVME_MAX_DATA_SIZE) {
2038 		WPRINTF("%s command would exceed MDTS", __func__);
2039 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2040 		goto out;
2041 	}
2042 
2043 	offset = lba << nvstore->sectsz_bits;
2044 
2045 	req->bytes = bytes;
2046 	req->io_req.br_offset = lba;
2047 
2048 	/* PRP bits 1:0 must be zero */
2049 	cmd->prp1 &= ~0x3UL;
2050 	cmd->prp2 &= ~0x3UL;
2051 
2052 	if (nvstore->type == NVME_STOR_RAM) {
2053 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2054 		    cmd->prp2, offset, bytes, is_write);
2055 	} else {
2056 		*status = nvme_write_read_blockif(sc, nvstore, req,
2057 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2058 
2059 		if (*status == NVME_NO_STATUS)
2060 			pending = true;
2061 	}
2062 out:
2063 	if (!pending)
2064 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2065 
2066 	return (pending);
2067 }
2068 
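/*
 * Completion state machine for Dataset Management deallocation requests
 * covering multiple ranges. The ranges were stashed in br_iov as byte
 * offset/length pairs; prev_gpaddr tracks the current entry and prev_size
 * the total number of entries. Issue the next blockif_delete() until every
 * range has been processed.
 */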
2069 static void
2070 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2071 {
2072 	struct pci_nvme_ioreq *req = br->br_param;
2073 	struct pci_nvme_softc *sc = req->sc;
2074 	bool done = true;
2075 	uint16_t status;
2076 
2077 	if (err) {
2078 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2079 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2080 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2081 	} else {
2082 		struct iovec *iov = req->io_req.br_iov;
2083 
2084 		req->prev_gpaddr++;
2085 		iov += req->prev_gpaddr;
2086 
2087 		/* The iov_* values are already in bytes (sector size applied) */
2088 		req->io_req.br_offset = (off_t)iov->iov_base;
2089 		req->io_req.br_resid = iov->iov_len;
2090 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2091 			pci_nvme_status_genc(&status,
2092 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2093 		} else
2094 			done = false;
2095 	}
2096 
2097 	if (done) {
2098 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2099 		    req->cid, 0, status);
2100 		pci_nvme_release_ioreq(sc, req);
2101 	}
2102 }
2103 
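/*
 * Dataset Management (DSM): copy the range list from guest memory and, when
 * the Deallocate attribute is set and the backing store supports it, issue
 * delete (TRIM) requests via blockif.
 */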
2104 static bool
2105 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2106     struct nvme_command *cmd,
2107     struct pci_nvme_blockstore *nvstore,
2108     struct pci_nvme_ioreq *req,
2109     uint16_t *status)
2110 {
2111 	struct nvme_dsm_range *range;
2112 	uint32_t nr, r, non_zero, dr;
2113 	int err;
2114 	bool pending = false;
2115 
2116 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2117 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2118 		goto out;
2119 	}
2120 
2121 	nr = cmd->cdw10 & 0xff;
2122 
2123 	/* copy locally because a range entry could straddle PRPs */
2124 	range = calloc(1, NVME_MAX_DSM_TRIM);
2125 	if (range == NULL) {
2126 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2127 		goto out;
2128 	}
2129 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2130 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2131 
2132 	/* Check for invalid ranges and the number of non-zero lengths */
2133 	non_zero = 0;
2134 	for (r = 0; r <= nr; r++) {
2135 		if (pci_nvme_out_of_range(nvstore,
2136 		    range[r].starting_lba, range[r].length)) {
2137 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2138 			goto out;
2139 		}
2140 		if (range[r].length != 0)
2141 			non_zero++;
2142 	}
2143 
2144 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2145 		size_t offset, bytes;
2146 		int sectsz_bits = sc->nvstore.sectsz_bits;
2147 
2148 		/*
2149 		 * DSM calls are advisory only, and compliant controllers
2150 		 * may choose to take no actions (i.e. return Success).
2151 		 */
2152 		if (!nvstore->deallocate) {
2153 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2154 			goto out;
2155 		}
2156 
2157 		/* If all ranges have a zero length, return Success */
2158 		if (non_zero == 0) {
2159 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2160 			goto out;
2161 		}
2162 
2163 		if (req == NULL) {
2164 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2165 			goto out;
2166 		}
2167 
2168 		offset = range[0].starting_lba << sectsz_bits;
2169 		bytes = range[0].length << sectsz_bits;
2170 
2171 		/*
2172 		 * If the request is for more than a single range, store
2173 		 * the ranges in the br_iov. Optimize for the common case
2174 		 * of a single range.
2175 		 *
2176 		 * Note that the NVMe Number of Ranges is a zero-based value.
2177 		 */
2178 		req->io_req.br_iovcnt = 0;
2179 		req->io_req.br_offset = offset;
2180 		req->io_req.br_resid = bytes;
2181 
2182 		if (nr == 0) {
2183 			req->io_req.br_callback = pci_nvme_io_done;
2184 		} else {
2185 			struct iovec *iov = req->io_req.br_iov;
2186 
2187 			for (r = 0, dr = 0; r <= nr; r++) {
2188 				offset = range[r].starting_lba << sectsz_bits;
2189 				bytes = range[r].length << sectsz_bits;
2190 				if (bytes == 0)
2191 					continue;
2192 
2193 				if ((nvstore->size - offset) < bytes) {
2194 					pci_nvme_status_genc(status,
2195 					    NVME_SC_LBA_OUT_OF_RANGE);
2196 					goto out;
2197 				}
2198 				iov[dr].iov_base = (void *)offset;
2199 				iov[dr].iov_len = bytes;
2200 				dr++;
2201 			}
2202 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2203 
2204 			/*
2205 			 * Use prev_gpaddr to track the current entry and
2206 			 * prev_size to track the number of entries
2207 			 */
2208 			req->prev_gpaddr = 0;
2209 			req->prev_size = dr;
2210 		}
2211 
2212 		err = blockif_delete(nvstore->ctx, &req->io_req);
2213 		if (err)
2214 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2215 		else
2216 			pending = true;
2217 	}
2218 out:
2219 	free(range);
2220 	return (pending);
2221 }
2222 
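/*
 * Process new entries on an I/O Submission Queue, from the current head up
 * to the tail value written via the doorbell. Commands handed off to
 * blockif remain pending and are completed from their callbacks; everything
 * else is completed here.
 */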
2223 static void
2224 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2225 {
2226 	struct nvme_submission_queue *sq;
2227 	uint16_t status;
2228 	uint16_t sqhead;
2229 
2230 	/* handle all submissions up to sq->tail index */
2231 	sq = &sc->submit_queues[idx];
2232 
2233 	pthread_mutex_lock(&sq->mtx);
2234 
2235 	sqhead = sq->head;
2236 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2237 	         idx, sqhead, sq->tail, sq->qbase);
2238 
2239 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2240 		struct nvme_command *cmd;
2241 		struct pci_nvme_ioreq *req;
2242 		uint32_t nsid;
2243 		bool pending;
2244 
2245 		pending = false;
2246 		req = NULL;
2247 		status = 0;
2248 
2249 		cmd = &sq->qbase[sqhead];
2250 		sqhead = (sqhead + 1) % sq->size;
2251 
2252 		nsid = le32toh(cmd->nsid);
2253 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2254 			pci_nvme_status_genc(&status,
2255 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2256 			status |=
2257 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2258 			goto complete;
2259  		}
2260 
2261 		req = pci_nvme_get_ioreq(sc);
2262 		if (req == NULL) {
2263 			pci_nvme_status_genc(&status,
2264 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2265 			WPRINTF("%s: unable to allocate IO req", __func__);
2266 			goto complete;
2267 		}
2268 		req->nvme_sq = sq;
2269 		req->sqid = idx;
2270 		req->opc = cmd->opc;
2271 		req->cid = cmd->cid;
2272 		req->nsid = cmd->nsid;
2273 
2274 		switch (cmd->opc) {
2275 		case NVME_OPC_FLUSH:
2276 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2277 			    req, &status);
2278  			break;
2279 		case NVME_OPC_WRITE:
2280 		case NVME_OPC_READ:
2281 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2282 			    req, &status);
2283 			break;
2284 		case NVME_OPC_WRITE_ZEROES:
2285 			/* TODO: write zeroes
2286 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2287 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2288 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2289 			break;
2290 		case NVME_OPC_DATASET_MANAGEMENT:
2291  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2292 			    req, &status);
2293 			break;
2294  		default:
2295  			WPRINTF("%s unhandled io command 0x%x",
2296 			    __func__, cmd->opc);
2297 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2298 		}
2299 complete:
2300 		if (!pending) {
2301 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2302 			    status);
2303 			if (req != NULL)
2304 				pci_nvme_release_ioreq(sc, req);
2305 		}
2306 	}
2307 
2308 	sq->head = sqhead;
2309 
2310 	pthread_mutex_unlock(&sq->mtx);
2311 }
2312 
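/*
 * Doorbell writes: for a submission queue, record the new tail and kick the
 * appropriate command handler; for a completion queue, record the new head.
 */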
2313 static void
2314 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2315 	uint64_t idx, int is_sq, uint64_t value)
2316 {
2317 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2318 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2319 
2320 	if (is_sq) {
2321 		if (idx > sc->num_squeues) {
2322 			WPRINTF("%s queue index %lu overflow from "
2323 			         "guest (max %u)",
2324 			         __func__, idx, sc->num_squeues);
2325 			return;
2326 		}
2327 
2328 		atomic_store_short(&sc->submit_queues[idx].tail,
2329 		                   (uint16_t)value);
2330 
2331 		if (idx == 0) {
2332 			pci_nvme_handle_admin_cmd(sc, value);
2333 		} else {
2334 			/* submission queue; handle new entries in SQ */
2341 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2342 		}
2343 	} else {
2344 		if (idx > sc->num_cqueues) {
2345 			WPRINTF("%s queue index %lu overflow from "
2346 			         "guest (max %u)",
2347 			         __func__, idx, sc->num_cqueues);
2348 			return;
2349 		}
2350 
2351 		atomic_store_short(&sc->compl_queues[idx].head,
2352 				(uint16_t)value);
2353 	}
2354 }
2355 
2356 static void
2357 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2358 {
2359 	const char *s = iswrite ? "WRITE" : "READ";
2360 
2361 	switch (offset) {
2362 	case NVME_CR_CAP_LOW:
2363 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2364 		break;
2365 	case NVME_CR_CAP_HI:
2366 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2367 		break;
2368 	case NVME_CR_VS:
2369 		DPRINTF("%s %s NVME_CR_VS", func, s);
2370 		break;
2371 	case NVME_CR_INTMS:
2372 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2373 		break;
2374 	case NVME_CR_INTMC:
2375 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2376 		break;
2377 	case NVME_CR_CC:
2378 		DPRINTF("%s %s NVME_CR_CC", func, s);
2379 		break;
2380 	case NVME_CR_CSTS:
2381 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2382 		break;
2383 	case NVME_CR_NSSR:
2384 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2385 		break;
2386 	case NVME_CR_AQA:
2387 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2388 		break;
2389 	case NVME_CR_ASQ_LOW:
2390 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2391 		break;
2392 	case NVME_CR_ASQ_HI:
2393 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2394 		break;
2395 	case NVME_CR_ACQ_LOW:
2396 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2397 		break;
2398 	case NVME_CR_ACQ_HI:
2399 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2400 		break;
2401 	default:
2402 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2403 	}
2404 
2405 }
2406 
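/*
 * Handle a guest write to BAR0. Offsets at or above NVME_DOORBELL_OFFSET
 * are doorbell writes and are dispatched to the queue handlers; lower
 * offsets are controller register writes, which must be 4 bytes wide.
 */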
2407 static void
2408 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2409 	uint64_t offset, int size, uint64_t value)
2410 {
2411 	uint32_t ccreg;
2412 
2413 	if (offset >= NVME_DOORBELL_OFFSET) {
2414 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2415 		uint64_t idx = belloffset / 8; /* 8-byte SQ/CQ doorbell pair */
2416 		int is_sq = (belloffset % 8) < 4;
2417 
2418 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2419 			WPRINTF("guest attempted an overflow write offset "
2420 			         "0x%lx, val 0x%lx in %s",
2421 			         offset, value, __func__);
2422 			return;
2423 		}
2424 
2425 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2426 		return;
2427 	}
2428 
2429 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2430 	        offset, size, value);
2431 
2432 	if (size != 4) {
2433 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2434 		         "val 0x%lx) to bar0 in %s",
2435 		         size, offset, value, __func__);
2436 		/* TODO: shutdown device */
2437 		return;
2438 	}
2439 
2440 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2441 
2442 	pthread_mutex_lock(&sc->mtx);
2443 
2444 	switch (offset) {
2445 	case NVME_CR_CAP_LOW:
2446 	case NVME_CR_CAP_HI:
2447 		/* readonly */
2448 		break;
2449 	case NVME_CR_VS:
2450 		/* readonly */
2451 		break;
2452 	case NVME_CR_INTMS:
2453 		/* MSI-X, so ignore */
2454 		break;
2455 	case NVME_CR_INTMC:
2456 		/* MSI-X, so ignore */
2457 		break;
2458 	case NVME_CR_CC:
2459 		ccreg = (uint32_t)value;
2460 
2461 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2462 		         "iocqes %u",
2463 		        __func__,
2464 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2465 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2466 			 NVME_CC_GET_IOCQES(ccreg));
2467 
2468 		if (NVME_CC_GET_SHN(ccreg)) {
2469 			/* perform shutdown - flush out data to backend */
2470 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2471 			    NVME_CSTS_REG_SHST_SHIFT);
2472 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2473 			    NVME_CSTS_REG_SHST_SHIFT;
2474 		}
2475 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2476 			if (NVME_CC_GET_EN(ccreg) == 0)
2477 				/* transition 1->0 causes controller reset */
2478 				pci_nvme_reset_locked(sc);
2479 			else
2480 				pci_nvme_init_controller(ctx, sc);
2481 		}
2482 
2483 		/* Insert the iocqes, iosqes and en bits from the write */
2484 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2485 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2486 		if (NVME_CC_GET_EN(ccreg) == 0) {
2487 			/* Insert the ams, mps and css bit fields */
2488 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2489 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2490 			sc->regs.csts &= ~NVME_CSTS_RDY;
2491 		} else if (sc->pending_ios == 0) {
2492 			sc->regs.csts |= NVME_CSTS_RDY;
2493 		}
2494 		break;
2495 	case NVME_CR_CSTS:
2496 		break;
2497 	case NVME_CR_NSSR:
2498 		/* ignore writes; don't support subsystem reset */
2499 		break;
2500 	case NVME_CR_AQA:
2501 		sc->regs.aqa = (uint32_t)value;
2502 		break;
2503 	case NVME_CR_ASQ_LOW:
2504 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2505 		               (0xFFFFF000 & value);
2506 		break;
2507 	case NVME_CR_ASQ_HI:
2508 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2509 		               (value << 32);
2510 		break;
2511 	case NVME_CR_ACQ_LOW:
2512 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2513 		               (0xFFFFF000 & value);
2514 		break;
2515 	case NVME_CR_ACQ_HI:
2516 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2517 		               (value << 32);
2518 		break;
2519 	default:
2520 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2521 		         __func__, offset, value, size);
2522 	}
2523 	pthread_mutex_unlock(&sc->mtx);
2524 }
2525 
2526 static void
2527 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2528                 int baridx, uint64_t offset, int size, uint64_t value)
2529 {
2530 	struct pci_nvme_softc* sc = pi->pi_arg;
2531 
2532 	if (baridx == pci_msix_table_bar(pi) ||
2533 	    baridx == pci_msix_pba_bar(pi)) {
2534 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2535 		         " value 0x%lx", baridx, offset, size, value);
2536 
2537 		pci_emul_msix_twrite(pi, offset, size, value);
2538 		return;
2539 	}
2540 
2541 	switch (baridx) {
2542 	case 0:
2543 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2544 		break;
2545 
2546 	default:
2547 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2548 		         __func__, baridx, value);
2549 	}
2550 }
2551 
2552 static uint64_t
2553 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
2554 {
2555 	uint64_t value;
2556 
2557 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2558 
2559 	if (offset < NVME_DOORBELL_OFFSET) {
2560 		void *p = &(sc->regs);
2561 		pthread_mutex_lock(&sc->mtx);
2562 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2563 		pthread_mutex_unlock(&sc->mtx);
2564 	} else {
2565 		value = 0;
2566 		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2567 	}
2568 
2569 	switch (size) {
2570 	case 1:
2571 		value &= 0xFF;
2572 		break;
2573 	case 2:
2574 		value &= 0xFFFF;
2575 		break;
2576 	case 4:
2577 		value &= 0xFFFFFFFF;
2578 		break;
2579 	}
2580 
2581 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2582 	         offset, size, (uint32_t)value);
2583 
2584 	return (value);
2585 }
2586 
2587 
2588 
2589 static uint64_t
2590 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2591     uint64_t offset, int size)
2592 {
2593 	struct pci_nvme_softc* sc = pi->pi_arg;
2594 
2595 	if (baridx == pci_msix_table_bar(pi) ||
2596 	    baridx == pci_msix_pba_bar(pi)) {
2597 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2598 		        baridx, offset, size);
2599 
2600 		return pci_emul_msix_tread(pi, offset, size);
2601 	}
2602 
2603 	switch (baridx) {
2604 	case 0:
2605 		return pci_nvme_read_bar_0(sc, offset, size);
2606 
2607 	default:
2608 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2609 	}
2610 
2611 	return (0);
2612 }
2613 
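/*
 * Parse the device options from the config nvlist and set up the backing
 * store (RAM-backed or blockif).
 */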
2614 static int
2615 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2616 {
2617 	char bident[sizeof("XX:X:X")];
2618 	const char *value;
2619 	uint32_t sectsz;
2620 
2621 	sc->max_queues = NVME_QUEUES;
2622 	sc->max_qentries = NVME_MAX_QENTRIES;
2623 	sc->ioslots = NVME_IOSLOTS;
2624 	sc->num_squeues = sc->max_queues;
2625 	sc->num_cqueues = sc->max_queues;
2626 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2627 	sectsz = 0;
2628 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2629 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2630 
2631 	value = get_config_value_node(nvl, "maxq");
2632 	if (value != NULL)
2633 		sc->max_queues = atoi(value);
2634 	value = get_config_value_node(nvl, "qsz");
2635 	if (value != NULL) {
2636 		sc->max_qentries = atoi(value);
2637 		if (sc->max_qentries <= 0) {
2638 			EPRINTLN("nvme: Invalid qsz option %d",
2639 			    sc->max_qentries);
2640 			return (-1);
2641 		}
2642 	}
2643 	value = get_config_value_node(nvl, "ioslots");
2644 	if (value != NULL) {
2645 		sc->ioslots = atoi(value);
2646 		if (sc->ioslots <= 0) {
2647 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2648 			return (-1);
2649 		}
2650 	}
2651 	value = get_config_value_node(nvl, "sectsz");
2652 	if (value != NULL)
2653 		sectsz = atoi(value);
2654 	value = get_config_value_node(nvl, "ser");
2655 	if (value != NULL) {
2656 		/*
2657 		 * This field indicates the Product Serial Number in
2658 		 * 7-bit ASCII, unused bytes should be space characters.
2659 		 * Ref: NVMe v1.3c.
2660 		 */
2661 		cpywithpad((char *)sc->ctrldata.sn,
2662 		    sizeof(sc->ctrldata.sn), value, ' ');
2663 	}
2664 	value = get_config_value_node(nvl, "eui64");
2665 	if (value != NULL)
2666 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2667 	value = get_config_value_node(nvl, "dsm");
2668 	if (value != NULL) {
2669 		if (strcmp(value, "auto") == 0)
2670 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2671 		else if (strcmp(value, "enable") == 0)
2672 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2673 		else if (strcmp(value, "disable") == 0)
2674 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2675 	}
2676 
2677 	value = get_config_value_node(nvl, "ram");
2678 	if (value != NULL) {
2679 		uint64_t sz = strtoull(value, NULL, 10);
2680 
2681 		sc->nvstore.type = NVME_STOR_RAM;
2682 		sc->nvstore.size = sz * 1024 * 1024;
2683 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2684 		sc->nvstore.sectsz = 4096;
2685 		sc->nvstore.sectsz_bits = 12;
2686 		if (sc->nvstore.ctx == NULL) {
2687 			EPRINTLN("nvme: Unable to allocate RAM");
2688 			return (-1);
2689 		}
2690 	} else {
2691 		snprintf(bident, sizeof(bident), "%d:%d",
2692 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2693 		sc->nvstore.ctx = blockif_open(nvl, bident);
2694 		if (sc->nvstore.ctx == NULL) {
2695 			EPRINTLN("nvme: Could not open backing file: %s",
2696 			    strerror(errno));
2697 			return (-1);
2698 		}
2699 		sc->nvstore.type = NVME_STOR_BLOCKIF;
2700 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2701 	}
2702 
2703 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2704 		sc->nvstore.sectsz = sectsz;
2705 	else if (sc->nvstore.type != NVME_STOR_RAM)
2706 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
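	/* Compute sectsz_bits = log2(sectsz); sector sizes start at 512 (2^9) */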
2707 	for (sc->nvstore.sectsz_bits = 9;
2708 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2709 	     sc->nvstore.sectsz_bits++);
2710 
2711 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2712 		sc->max_queues = NVME_QUEUES;
2713 
2714 	return (0);
2715 }
2716 
2717 static int
2718 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
2719 {
2720 	struct pci_nvme_softc *sc;
2721 	uint32_t pci_membar_sz;
2722 	int	error;
2723 
2724 	error = 0;
2725 
2726 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2727 	pi->pi_arg = sc;
2728 	sc->nsc_pi = pi;
2729 
2730 	error = pci_nvme_parse_config(sc, nvl);
2731 	if (error < 0)
2732 		goto done;
2733 	else
2734 		error = 0;
2735 
2736 	STAILQ_INIT(&sc->ioreqs_free);
2737 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2738 	for (int i = 0; i < sc->ioslots; i++) {
2739 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2740 	}
2741 
2742 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2743 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2744 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2745 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2746 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2747 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2748 
2749 	/*
2750 	 * Allocate size of NVMe registers + doorbell space for all queues.
2751 	 *
2752 	 * The specification requires a minimum memory I/O window size of 16K.
2753 	 * The Windows driver will refuse to start a device with a smaller
2754 	 * window.
2755 	 */
2756 	pci_membar_sz = sizeof(struct nvme_registers) +
2757 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2758 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2759 
2760 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2761 
2762 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2763 	if (error) {
2764 		WPRINTF("%s pci alloc mem bar failed", __func__);
2765 		goto done;
2766 	}
2767 
2768 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2769 	if (error) {
2770 		WPRINTF("%s pci add msixcap failed", __func__);
2771 		goto done;
2772 	}
2773 
2774 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2775 	if (error) {
2776 		WPRINTF("%s pci add Express capability failed", __func__);
2777 		goto done;
2778 	}
2779 
2780 	pthread_mutex_init(&sc->mtx, NULL);
2781 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2782 
2783 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2784 	/*
2785 	 * Controller data depends on Namespace data so initialize Namespace
2786 	 * data first.
2787 	 */
2788 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2789 	pci_nvme_init_ctrldata(sc);
2790 	pci_nvme_init_logpages(sc);
2791 	pci_nvme_init_features(sc);
2792 
2793 	pci_nvme_aer_init(sc);
2794 
2795 	pci_nvme_reset(sc);
2796 
2797 	pci_lintr_request(pi);
2798 
2799 done:
2800 	return (error);
2801 }
2802 
2803 
2804 struct pci_devemu pci_de_nvme = {
2805 	.pe_emu =	"nvme",
2806 	.pe_init =	pci_nvme_init,
2807 	.pe_legacy_config = blockif_legacy_config,
2808 	.pe_barwrite =	pci_nvme_write,
2809 	.pe_barread =	pci_nvme_read
2810 };
2811 PCI_EMUL_SET(pci_de_nvme);
2812