xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 5f24ef21bee1ae1335796004829d750252582ca5)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86 
87 
/* Non-zero enables the DPRINTF() debug output below */
static int nvme_debug = 0;
/*
 * Debug print; emits via PRINTLN only when nvme_debug is non-zero.
 * NOTE(review): this expands to a bare `if` statement, so using it as the
 * body of an unbraced if/else would mis-associate the else.
 */
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
/* Warning print; always emitted */
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
91 
/* defaults; can be overridden */
/* PCI BAR carrying the MSI-X table (presumably; the BAR setup is outside this view) */
#define	NVME_MSIX_BAR		4

/* Default number of IO slots (see the "ioslots" option in the file header) */
#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

/* Upper bound on IO SQ's / CQ's; pci_nvme_init_queues() clamps to this */
#define	NVME_QUEUES		16
/* Presumably the default/maximum for the "qsz" option -- confirm at the option parser */
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

/* Number of 64-bit PRP entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Max Data Transfer Size exponent advertised in the controller data (cd->mdts) */
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
/* Largest transfer in bytes: 2^MDTS units of NVME_MPSMIN_BYTES */
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
/* True when completion (c) carries a real status, i.e. not NVME_NO_STATUS */
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
116 
117 /* helpers */
118 
/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features
 *
 * The zero-based SQ count occupies bits 15:0 and the zero-based CQ count
 * occupies bits 31:16. The expansion is fully parenthesized and carries no
 * trailing semicolon so that it behaves as an ordinary expression.
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
128 
/* Byte offset of the doorbell array within struct nvme_registers */
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

/*
 * Byte offsets of the controller registers (presumably relative to the start
 * of the BAR 0 MMIO region; the register handlers are outside this view).
 * 64-bit registers (CAP, ASQ, ACQ) are split into LOW/HI 32-bit halves.
 */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
146 
/* Bit fields of Command Dword 11 as used by the Create IO SQ/CQ handlers */
enum nvme_cmd_cdw11 {
	/* Physically Contiguous: the only queue layout the handlers accept */
	NVME_CMD_CDW11_PC  = 0x0001,
	/* Interrupts Enabled (Create IO CQ) */
	NVME_CMD_CDW11_IEN = 0x0002,
	/* Upper 16 bits: the field Create IO CQ reads as the interrupt vector */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};
152 
/* Direction of a nvme_prp_memcpy() transfer */
enum nvme_copy_dir {
	/* Copy from the host buffer into the guest memory named by the PRPs */
	NVME_COPY_TO_PRP,
	/* Copy from the guest memory named by the PRPs into the host buffer */
	NVME_COPY_FROM_PRP,
};
157 
/* Values/flags for nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

/* Emulation state for one Completion Queue (Admin or IO) */
struct nvme_completion_queue {
	/* Host mapping of the guest's queue memory; NULL means "not created" */
	struct nvme_completion *qbase;
	/* Held by pci_nvme_cq_update() while an entry is written */
	pthread_mutex_t	mtx;
	/* Number of entries in the queue */
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	/* Interrupt vector taken from the Create IO CQ command */
	uint16_t	intr_vec;
	/* Interrupt enable state (see NVME_CQ_INTEN) */
	uint32_t	intr_en;
};
170 
/* Emulation state for one Submission Queue (Admin or IO) */
struct nvme_submission_queue {
	/* Host mapping of the guest's queue memory; NULL means "not created" */
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	/* Number of entries in the queue */
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	/* Priority field taken from bits 2:1 of CDW11 in Create IO SQ */
	int		qpriority;
};
180 
/* Kind of backing store behind the namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

/* Description of the storage backing the emulated namespace */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	/* Backend handle; passed to blockif_candelete() when type is BLOCKIF */
	void		*ctx;
	/* Capacity; divided by sectsz to obtain the namespace size in blocks */
	uint64_t	size;
	/* Sector size */
	uint32_t	sectsz;
	/* Written into the LBA format as LBADS (LBA data size = 2^lbads) */
	uint32_t	sectsz_bits;
	/* EUI-64 identifier; zero means none was provided and one is generated */
	uint64_t	eui64;
	/* Set when the blockif backend reports that it can delete (deallocate) */
	uint32_t	deallocate:1;
};
195 
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 *
 * The conditional expression is wrapped in parentheses so the macro
 * evaluates as a single value wherever it is used.
 */
#define MDTS_PAD_SIZE \
	(NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	 0)
207 
/* Per-request context for one guest IO command */
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	/* Linkage for the softc's ioreqs_free list */
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	/*
	 * Extra iovec's beyond those built into struct blockif_req.
	 * NOTE(review): presumably this must directly follow io_req so that it
	 * extends io_req's iovec array -- confirm against struct blockif_req.
	 */
	struct iovec	iovpadding[MDTS_PAD_SIZE];
};
227 
/* Policy for advertising Dataset Management (the "dsm" option) */
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
236 
struct pci_nvme_softc;
struct nvme_feature_obj;

/*
 * Handler for a Set Features or Get Features request. Receives the device
 * state, the feature's table entry, the guest command and the completion
 * the handler fills in.
 */
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

/* Per Feature Identifier state; the softc's feat[] is indexed by Feature ID */
struct nvme_feature_obj {
	/* Presumably the last CDW11 value set for this feature -- TODO confirm */
	uint32_t	cdw11;
	/* Optional Set Features handler */
	nvme_feature_cb	set;
	/* Optional Get Features handler */
	nvme_feature_cb	get;
	/* True when the feature is marked namespace specific */
	bool namespace_specific;
};
251 
/* Number of entries in the feature table (highest handled Feature ID + 1) */
#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

/* One outstanding Asynchronous Event Request submitted by the host */
struct pci_nvme_aer {
	/* Linkage for the softc's aer_list */
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
258 
/* Per-device state of one emulated NVMe controller */
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	/* Taken by pci_nvme_reset() around pci_nvme_reset_locked() */
	pthread_mutex_t	mtx;

	/* Controller register values */
	struct nvme_registers regs;

	/* Identify data and log pages */
	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	/* Backing storage of the namespace */
	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	/* Number of IO queues, excluding the Admin queue at index 0 */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set; /* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	/* Feature table, indexed by Feature ID */
	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	/* Outstanding Asynchronous Event Requests, oldest first */
	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	/* Number of entries on aer_list */
	uint32_t	aer_count;
};
308 
309 
310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
312 static void pci_nvme_io_done(struct blockif_req *, int);
313 
/*
 * Controller Configuration utils
 *
 * Each getter shifts the field down and then masks it; no extra parentheses
 * are needed because >> binds tighter than &.
 */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/*
 * In-place mask of the EN, IOSQES and IOCQES fields.
 * NOTE(review): presumably the CC bits a guest may always write -- confirm
 * against the CC register write handler (outside this view).
 */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/*
 * In-place mask of the CSS, MPS and AMS fields.
 * NOTE(review): "NEN" presumably means writable only while the controller is
 * Not ENabled -- confirm against the CC register write handler.
 */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

/* The RDY bit in its register position */
#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
/* The Phase bit in its position within the status word */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
/* In-place mask of the Status Code Type and Status Code fields */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/* The Dataset Management bit of ONCS in its position within the field */
#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
350 
351 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
352     struct nvme_feature_obj *,
353     struct nvme_command *,
354     struct nvme_completion *);
355 static void nvme_feature_num_queues(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_iv_config(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363 
/*
 * Fill the fixed-size field dst (dst_size bytes) with the leading bytes of
 * the string src and pad whatever space remains with the pad character.
 * At most dst_size bytes of src are used and dst is not NUL terminated.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	const char *nul;
	size_t ncopy;

	/* Length of src, capped at the size of the destination */
	nul = memchr(src, '\0', dst_size);
	ncopy = (nul != NULL) ? (size_t)(nul - src) : dst_size;

	memcpy(dst, src, ncopy);
	memset(dst + ncopy, pad, dst_size - ncopy);
}
373 
374 static __inline void
375 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
376 {
377 
378 	*status &= ~NVME_STATUS_MASK;
379 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
380 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
381 }
382 
/*
 * Store a status code of the Generic status code type (NVME_SCT_GENERIC)
 * into a completion status word.
 */
static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
389 
390 /*
391  * Initialize the requested number or IO Submission and Completion Queues.
392  * Admin queues are allocated implicitly.
393  */
394 static void
395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
396 {
397 	uint32_t i;
398 
399 	/*
400 	 * Allocate and initialize the Submission Queues
401 	 */
402 	if (nsq > NVME_QUEUES) {
403 		WPRINTF("%s: clamping number of SQ from %u to %u",
404 					__func__, nsq, NVME_QUEUES);
405 		nsq = NVME_QUEUES;
406 	}
407 
408 	sc->num_squeues = nsq;
409 
410 	sc->submit_queues = calloc(sc->num_squeues + 1,
411 				sizeof(struct nvme_submission_queue));
412 	if (sc->submit_queues == NULL) {
413 		WPRINTF("%s: SQ allocation failed", __func__);
414 		sc->num_squeues = 0;
415 	} else {
416 		struct nvme_submission_queue *sq = sc->submit_queues;
417 
418 		for (i = 0; i < sc->num_squeues; i++)
419 			pthread_mutex_init(&sq[i].mtx, NULL);
420 	}
421 
422 	/*
423 	 * Allocate and initialize the Completion Queues
424 	 */
425 	if (ncq > NVME_QUEUES) {
426 		WPRINTF("%s: clamping number of CQ from %u to %u",
427 					__func__, ncq, NVME_QUEUES);
428 		ncq = NVME_QUEUES;
429 	}
430 
431 	sc->num_cqueues = ncq;
432 
433 	sc->compl_queues = calloc(sc->num_cqueues + 1,
434 				sizeof(struct nvme_completion_queue));
435 	if (sc->compl_queues == NULL) {
436 		WPRINTF("%s: CQ allocation failed", __func__);
437 		sc->num_cqueues = 0;
438 	} else {
439 		struct nvme_completion_queue *cq = sc->compl_queues;
440 
441 		for (i = 0; i < sc->num_cqueues; i++)
442 			pthread_mutex_init(&cq[i].mtx, NULL);
443 	}
444 }
445 
446 static void
447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
448 {
449 	struct nvme_controller_data *cd = &sc->ctrldata;
450 
451 	cd->vid = 0xFB5D;
452 	cd->ssvid = 0x0000;
453 
454 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
455 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
456 
457 	/* Num of submission commands that we can handle at a time (2^rab) */
458 	cd->rab   = 4;
459 
460 	/* FreeBSD OUI */
461 	cd->ieee[0] = 0x58;
462 	cd->ieee[1] = 0x9c;
463 	cd->ieee[2] = 0xfc;
464 
465 	cd->mic = 0;
466 
467 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
468 
469 	cd->ver = 0x00010300;
470 
471 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
472 	cd->acl = 2;
473 	cd->aerl = 4;
474 
475 	/* Advertise 1, Read-only firmware slot */
476 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
477 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
478 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
479 	cd->elpe = 0;	/* max error log page entries */
480 	cd->npss = 1;	/* number of power states support */
481 
482 	/* Warning Composite Temperature Threshold */
483 	cd->wctemp = 0x0157;
484 
485 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
486 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
487 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
488 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
489 	cd->nn = 1;	/* number of namespaces */
490 
491 	cd->oncs = 0;
492 	switch (sc->dataset_management) {
493 	case NVME_DATASET_MANAGEMENT_AUTO:
494 		if (sc->nvstore.deallocate)
495 			cd->oncs |= NVME_ONCS_DSM;
496 		break;
497 	case NVME_DATASET_MANAGEMENT_ENABLE:
498 		cd->oncs |= NVME_ONCS_DSM;
499 		break;
500 	default:
501 		break;
502 	}
503 
504 	cd->fna = 0x03;
505 
506 	cd->power_state[0].mp = 10;
507 }
508 
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 *
 * crc is the starting value (pass the result of a previous call to continue
 * a running checksum), buffer/len describe the bytes to fold in. Returns
 * the updated CRC; with len == 0 the starting value is returned unchanged.
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	const unsigned char *bytes = buffer;
	unsigned int i;

	for (i = 0; i < len; i++) {
		/* The low byte of (crc ^ input) selects the table entry */
		uint16_t entry = crc16_table[(crc ^ bytes[i]) & 0xffU];
		uint16_t shifted = (crc >> 8) & 0xffU;

		crc = (shifted ^ entry) & 0x0000ffffU;
	}

	return crc;
}
558 
559 static void
560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
561     struct nvme_namespace_data *nd, uint32_t nsid,
562     struct pci_nvme_blockstore *nvstore)
563 {
564 
565 	/* Get capacity and block size information from backing store */
566 	nd->nsze = nvstore->size / nvstore->sectsz;
567 	nd->ncap = nd->nsze;
568 	nd->nuse = nd->nsze;
569 
570 	if (nvstore->type == NVME_STOR_BLOCKIF)
571 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
572 
573 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
574 	nd->flbas = 0;
575 
576 	/* Create an EUI-64 if user did not provide one */
577 	if (nvstore->eui64 == 0) {
578 		char *data = NULL;
579 		uint64_t eui64 = nvstore->eui64;
580 
581 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
582 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
583 
584 		if (data != NULL) {
585 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586 			free(data);
587 		}
588 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589 	}
590 	be64enc(nd->eui64, nvstore->eui64);
591 
592 	/* LBA data-sz = 2^lbads */
593 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595 
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599 
600 	memset(&sc->err_log, 0, sizeof(sc->err_log));
601 	memset(&sc->health_log, 0, sizeof(sc->health_log));
602 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603 
604 	/* Set read/write remainder to round up according to spec */
605 	sc->read_dunits_remainder = 999;
606 	sc->write_dunits_remainder = 999;
607 }
608 
609 static void
610 pci_nvme_init_features(struct pci_nvme_softc *sc)
611 {
612 
613 	sc->feat[0].set = nvme_feature_invalid_cb;
614 	sc->feat[0].get = nvme_feature_invalid_cb;
615 
616 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
617 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
618 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
619 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
620 	    nvme_feature_iv_config;
621 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
622 	    nvme_feature_invalid_cb;
623 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
624 	    nvme_feature_invalid_cb;
625 }
626 
627 static void
628 pci_nvme_aer_init(struct pci_nvme_softc *sc)
629 {
630 
631 	STAILQ_INIT(&sc->aer_list);
632 	sc->aer_count = 0;
633 }
634 
635 static void
636 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
637 {
638 	struct pci_nvme_aer *aer = NULL;
639 
640 	while (!STAILQ_EMPTY(&sc->aer_list)) {
641 		aer = STAILQ_FIRST(&sc->aer_list);
642 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
643 		free(aer);
644 	}
645 
646 	pci_nvme_aer_init(sc);
647 }
648 
649 static bool
650 pci_nvme_aer_available(struct pci_nvme_softc *sc)
651 {
652 
653 	return (!STAILQ_EMPTY(&sc->aer_list));
654 }
655 
656 static bool
657 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
658 {
659 	struct nvme_controller_data *cd = &sc->ctrldata;
660 
661 	/* AERL is a zero based value while aer_count is one's based */
662 	return (sc->aer_count == (cd->aerl + 1));
663 }
664 
665 /*
666  * Add an Async Event Request
667  *
668  * Stores an AER to be returned later if the Controller needs to notify the
669  * host of an event.
670  * Note that while the NVMe spec doesn't require Controllers to return AER's
671  * in order, this implementation does preserve the order.
672  */
673 static int
674 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
675 {
676 	struct pci_nvme_aer *aer = NULL;
677 
678 	if (pci_nvme_aer_limit_reached(sc))
679 		return (-1);
680 
681 	aer = calloc(1, sizeof(struct pci_nvme_aer));
682 	if (aer == NULL)
683 		return (-1);
684 
685 	sc->aer_count++;
686 
687 	/* Save the Command ID for use in the completion message */
688 	aer->cid = cid;
689 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
690 
691 	return (0);
692 }
693 
694 /*
695  * Get an Async Event Request structure
696  *
697  * Returns a pointer to an AER previously submitted by the host or NULL if
698  * no AER's exist. Caller is responsible for freeing the returned struct.
699  */
700 static struct pci_nvme_aer *
701 pci_nvme_aer_get(struct pci_nvme_softc *sc)
702 {
703 	struct pci_nvme_aer *aer = NULL;
704 
705 	aer = STAILQ_FIRST(&sc->aer_list);
706 	if (aer != NULL) {
707 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
708 		sc->aer_count--;
709 	}
710 
711 	return (aer);
712 }
713 
714 static void
715 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
716 {
717 	uint32_t i;
718 
719 	DPRINTF("%s", __func__);
720 
721 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
722 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
723 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
724 
725 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
726 
727 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
728 
729 	sc->regs.cc = 0;
730 	sc->regs.csts = 0;
731 
732 	assert(sc->submit_queues != NULL);
733 
734 	for (i = 0; i < sc->num_squeues + 1; i++) {
735 		sc->submit_queues[i].qbase = NULL;
736 		sc->submit_queues[i].size = 0;
737 		sc->submit_queues[i].cqid = 0;
738 		sc->submit_queues[i].tail = 0;
739 		sc->submit_queues[i].head = 0;
740 	}
741 
742 	assert(sc->compl_queues != NULL);
743 
744 	for (i = 0; i < sc->num_cqueues + 1; i++) {
745 		sc->compl_queues[i].qbase = NULL;
746 		sc->compl_queues[i].size = 0;
747 		sc->compl_queues[i].tail = 0;
748 		sc->compl_queues[i].head = 0;
749 	}
750 
751 	sc->num_q_is_set = false;
752 
753 	pci_nvme_aer_destroy(sc);
754 }
755 
/*
 * Reset the controller: runs pci_nvme_reset_locked() with sc->mtx held.
 */
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
763 
764 static void
765 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
766 {
767 	uint16_t acqs, asqs;
768 
769 	DPRINTF("%s", __func__);
770 
771 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
772 	sc->submit_queues[0].size = asqs;
773 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
774 	            sizeof(struct nvme_command) * asqs);
775 
776 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
777 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
778 
779 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
780 	    NVME_AQA_REG_ACQS_MASK) + 1;
781 	sc->compl_queues[0].size = acqs;
782 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
783 	         sizeof(struct nvme_completion) * acqs);
784 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
785 
786 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
787 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
788 }
789 
790 static int
791 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
792 	size_t len, enum nvme_copy_dir dir)
793 {
794 	uint8_t *p;
795 	size_t bytes;
796 
797 	if (len > (8 * 1024)) {
798 		return (-1);
799 	}
800 
801 	/* Copy from the start of prp1 to the end of the physical page */
802 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
803 	bytes = MIN(bytes, len);
804 
805 	p = vm_map_gpa(ctx, prp1, bytes);
806 	if (p == NULL) {
807 		return (-1);
808 	}
809 
810 	if (dir == NVME_COPY_TO_PRP)
811 		memcpy(p, b, bytes);
812 	else
813 		memcpy(b, p, bytes);
814 
815 	b += bytes;
816 
817 	len -= bytes;
818 	if (len == 0) {
819 		return (0);
820 	}
821 
822 	len = MIN(len, PAGE_SIZE);
823 
824 	p = vm_map_gpa(ctx, prp2, len);
825 	if (p == NULL) {
826 		return (-1);
827 	}
828 
829 	if (dir == NVME_COPY_TO_PRP)
830 		memcpy(p, b, len);
831 	else
832 		memcpy(b, p, len);
833 
834 	return (0);
835 }
836 
837 /*
838  * Write a Completion Queue Entry update
839  *
840  * Write the completion and update the doorbell value
841  */
842 static void
843 pci_nvme_cq_update(struct pci_nvme_softc *sc,
844 		struct nvme_completion_queue *cq,
845 		uint32_t cdw0,
846 		uint16_t cid,
847 		uint16_t sqid,
848 		uint16_t status)
849 {
850 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
851 	struct nvme_completion *cqe;
852 
853 	assert(cq->qbase != NULL);
854 
855 	pthread_mutex_lock(&cq->mtx);
856 
857 	cqe = &cq->qbase[cq->tail];
858 
859 	/* Flip the phase bit */
860 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
861 
862 	cqe->cdw0 = cdw0;
863 	cqe->sqhd = sq->head;
864 	cqe->sqid = sqid;
865 	cqe->cid = cid;
866 	cqe->status = status;
867 
868 	cq->tail++;
869 	if (cq->tail >= cq->size) {
870 		cq->tail = 0;
871 	}
872 
873 	pthread_mutex_unlock(&cq->mtx);
874 }
875 
876 static int
877 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
878 	struct nvme_completion* compl)
879 {
880 	uint16_t qid = command->cdw10 & 0xffff;
881 
882 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
883 	if (qid == 0 || qid > sc->num_squeues ||
884 	    (sc->submit_queues[qid].qbase == NULL)) {
885 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
886 		        __func__, qid, sc->num_squeues);
887 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
888 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
889 		return (1);
890 	}
891 
892 	sc->submit_queues[qid].qbase = NULL;
893 	sc->submit_queues[qid].cqid = 0;
894 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
895 	return (1);
896 }
897 
898 static int
899 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
900 	struct nvme_completion* compl)
901 {
902 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
903 		uint16_t qid = command->cdw10 & 0xffff;
904 		struct nvme_submission_queue *nsq;
905 
906 		if ((qid == 0) || (qid > sc->num_squeues) ||
907 		    (sc->submit_queues[qid].qbase != NULL)) {
908 			WPRINTF("%s queue index %u > num_squeues %u",
909 			        __func__, qid, sc->num_squeues);
910 			pci_nvme_status_tc(&compl->status,
911 			    NVME_SCT_COMMAND_SPECIFIC,
912 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
913 			return (1);
914 		}
915 
916 		nsq = &sc->submit_queues[qid];
917 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
918 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
919 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
920 			/*
921 			 * Queues must specify at least two entries
922 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
923 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
924 			 */
925 			pci_nvme_status_tc(&compl->status,
926 			    NVME_SCT_COMMAND_SPECIFIC,
927 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
928 			return (1);
929 		}
930 
931 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
932 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
933 			pci_nvme_status_tc(&compl->status,
934 			    NVME_SCT_COMMAND_SPECIFIC,
935 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
936 			return (1);
937 		}
938 
939 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
940 			pci_nvme_status_tc(&compl->status,
941 			    NVME_SCT_COMMAND_SPECIFIC,
942 			    NVME_SC_COMPLETION_QUEUE_INVALID);
943 			return (1);
944 		}
945 
946 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
947 
948 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
949 		              sizeof(struct nvme_command) * (size_t)nsq->size);
950 
951 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
952 		        qid, nsq->size, nsq->qbase, nsq->cqid);
953 
954 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
955 
956 		DPRINTF("%s completed creating IOSQ qid %u",
957 		         __func__, qid);
958 	} else {
959 		/*
960 		 * Guest sent non-cont submission queue request.
961 		 * This setting is unsupported by this emulation.
962 		 */
963 		WPRINTF("%s unsupported non-contig (list-based) "
964 		         "create i/o submission queue", __func__);
965 
966 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
967 	}
968 	return (1);
969 }
970 
971 static int
972 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
973 	struct nvme_completion* compl)
974 {
975 	uint16_t qid = command->cdw10 & 0xffff;
976 	uint16_t sqid;
977 
978 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
979 	if (qid == 0 || qid > sc->num_cqueues ||
980 	    (sc->compl_queues[qid].qbase == NULL)) {
981 		WPRINTF("%s queue index %u / num_cqueues %u",
982 		        __func__, qid, sc->num_cqueues);
983 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
984 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
985 		return (1);
986 	}
987 
988 	/* Deleting an Active CQ is an error */
989 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
990 		if (sc->submit_queues[sqid].cqid == qid) {
991 			pci_nvme_status_tc(&compl->status,
992 			    NVME_SCT_COMMAND_SPECIFIC,
993 			    NVME_SC_INVALID_QUEUE_DELETION);
994 			return (1);
995 		}
996 
997 	sc->compl_queues[qid].qbase = NULL;
998 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
999 	return (1);
1000 }
1001 
1002 static int
1003 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1004 	struct nvme_completion* compl)
1005 {
1006 	struct nvme_completion_queue *ncq;
1007 	uint16_t qid = command->cdw10 & 0xffff;
1008 
1009 	/* Only support Physically Contiguous queues */
1010 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1011 		WPRINTF("%s unsupported non-contig (list-based) "
1012 		         "create i/o completion queue",
1013 		         __func__);
1014 
1015 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1016 		return (1);
1017 	}
1018 
1019 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1020 	    (sc->compl_queues[qid].qbase != NULL)) {
1021 		WPRINTF("%s queue index %u > num_cqueues %u",
1022 			__func__, qid, sc->num_cqueues);
1023 		pci_nvme_status_tc(&compl->status,
1024 		    NVME_SCT_COMMAND_SPECIFIC,
1025 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1026 		return (1);
1027  	}
1028 
1029 	ncq = &sc->compl_queues[qid];
1030 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1031 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1032 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1033 		pci_nvme_status_tc(&compl->status,
1034 		    NVME_SCT_COMMAND_SPECIFIC,
1035 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1036 		return (1);
1037 	}
1038 
1039 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1040 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1041 		/*
1042 		 * Queues must specify at least two entries
1043 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1044 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1045 		 */
1046 		pci_nvme_status_tc(&compl->status,
1047 		    NVME_SCT_COMMAND_SPECIFIC,
1048 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1049 		return (1);
1050 	}
1051 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1052 		     command->prp1,
1053 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1054 
1055 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1056 
1057 
1058 	return (1);
1059 }
1060 
1061 static int
1062 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1063 	struct nvme_completion* compl)
1064 {
1065 	uint32_t logsize;
1066 	uint8_t logpage = command->cdw10 & 0xFF;
1067 
1068 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1069 
1070 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1071 
1072 	/*
1073 	 * Command specifies the number of dwords to return in fields NUMDU
1074 	 * and NUMDL. This is a zero-based value.
1075 	 */
1076 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1077 	logsize *= sizeof(uint32_t);
1078 
1079 	switch (logpage) {
1080 	case NVME_LOG_ERROR:
1081 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1082 		    command->prp2, (uint8_t *)&sc->err_log,
1083 		    MIN(logsize, sizeof(sc->err_log)),
1084 		    NVME_COPY_TO_PRP);
1085 		break;
1086 	case NVME_LOG_HEALTH_INFORMATION:
1087 		pthread_mutex_lock(&sc->mtx);
1088 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1089 		    sizeof(sc->health_log.data_units_read));
1090 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1091 		    sizeof(sc->health_log.data_units_written));
1092 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1093 		    sizeof(sc->health_log.host_read_commands));
1094 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1095 		    sizeof(sc->health_log.host_write_commands));
1096 		pthread_mutex_unlock(&sc->mtx);
1097 
1098 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1099 		    command->prp2, (uint8_t *)&sc->health_log,
1100 		    MIN(logsize, sizeof(sc->health_log)),
1101 		    NVME_COPY_TO_PRP);
1102 		break;
1103 	case NVME_LOG_FIRMWARE_SLOT:
1104 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1105 		    command->prp2, (uint8_t *)&sc->fw_log,
1106 		    MIN(logsize, sizeof(sc->fw_log)),
1107 		    NVME_COPY_TO_PRP);
1108 		break;
1109 	default:
1110 		DPRINTF("%s get log page %x command not supported",
1111 		        __func__, logpage);
1112 
1113 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1114 		    NVME_SC_INVALID_LOG_PAGE);
1115 	}
1116 
1117 	return (1);
1118 }
1119 
1120 static int
1121 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1122 	struct nvme_completion* compl)
1123 {
1124 	void *dest;
1125 	uint16_t status;
1126 
1127 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1128 	        command->cdw10 & 0xFF, command->nsid);
1129 
1130 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1131 
1132 	switch (command->cdw10 & 0xFF) {
1133 	case 0x00: /* return Identify Namespace data structure */
1134 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1135 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1136 		    NVME_COPY_TO_PRP);
1137 		break;
1138 	case 0x01: /* return Identify Controller data structure */
1139 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1140 		    command->prp2, (uint8_t *)&sc->ctrldata,
1141 		    sizeof(sc->ctrldata),
1142 		    NVME_COPY_TO_PRP);
1143 		break;
1144 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1145 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1146 		                  sizeof(uint32_t) * 1024);
1147 		/* All unused entries shall be zero */
1148 		bzero(dest, sizeof(uint32_t) * 1024);
1149 		((uint32_t *)dest)[0] = 1;
1150 		break;
1151 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1152 		if (command->nsid != 1) {
1153 			pci_nvme_status_genc(&status,
1154 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1155 			break;
1156 		}
1157 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1158 		                  sizeof(uint32_t) * 1024);
1159 		/* All bytes after the descriptor shall be zero */
1160 		bzero(dest, sizeof(uint32_t) * 1024);
1161 
1162 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1163 		((uint8_t *)dest)[0] = 1;
1164 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1165 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1166 		break;
1167 	default:
1168 		DPRINTF("%s unsupported identify command requested 0x%x",
1169 		         __func__, command->cdw10 & 0xFF);
1170 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1171 		break;
1172 	}
1173 
1174 	compl->status = status;
1175 	return (1);
1176 }
1177 
1178 static const char *
1179 nvme_fid_to_name(uint8_t fid)
1180 {
1181 	const char *name;
1182 
1183 	switch (fid) {
1184 	case NVME_FEAT_ARBITRATION:
1185 		name = "Arbitration";
1186 		break;
1187 	case NVME_FEAT_POWER_MANAGEMENT:
1188 		name = "Power Management";
1189 		break;
1190 	case NVME_FEAT_LBA_RANGE_TYPE:
1191 		name = "LBA Range Type";
1192 		break;
1193 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1194 		name = "Temperature Threshold";
1195 		break;
1196 	case NVME_FEAT_ERROR_RECOVERY:
1197 		name = "Error Recovery";
1198 		break;
1199 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1200 		name = "Volatile Write Cache";
1201 		break;
1202 	case NVME_FEAT_NUMBER_OF_QUEUES:
1203 		name = "Number of Queues";
1204 		break;
1205 	case NVME_FEAT_INTERRUPT_COALESCING:
1206 		name = "Interrupt Coalescing";
1207 		break;
1208 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1209 		name = "Interrupt Vector Configuration";
1210 		break;
1211 	case NVME_FEAT_WRITE_ATOMICITY:
1212 		name = "Write Atomicity Normal";
1213 		break;
1214 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1215 		name = "Asynchronous Event Configuration";
1216 		break;
1217 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1218 		name = "Autonomous Power State Transition";
1219 		break;
1220 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1221 		name = "Host Memory Buffer";
1222 		break;
1223 	case NVME_FEAT_TIMESTAMP:
1224 		name = "Timestamp";
1225 		break;
1226 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1227 		name = "Keep Alive Timer";
1228 		break;
1229 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1230 		name = "Host Controlled Thermal Management";
1231 		break;
1232 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1233 		name = "Non-Operation Power State Config";
1234 		break;
1235 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1236 		name = "Read Recovery Level Config";
1237 		break;
1238 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1239 		name = "Predictable Latency Mode Config";
1240 		break;
1241 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1242 		name = "Predictable Latency Mode Window";
1243 		break;
1244 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1245 		name = "LBA Status Information Report Interval";
1246 		break;
1247 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1248 		name = "Host Behavior Support";
1249 		break;
1250 	case NVME_FEAT_SANITIZE_CONFIG:
1251 		name = "Sanitize Config";
1252 		break;
1253 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1254 		name = "Endurance Group Event Configuration";
1255 		break;
1256 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1257 		name = "Software Progress Marker";
1258 		break;
1259 	case NVME_FEAT_HOST_IDENTIFIER:
1260 		name = "Host Identifier";
1261 		break;
1262 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1263 		name = "Reservation Notification Mask";
1264 		break;
1265 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1266 		name = "Reservation Persistence";
1267 		break;
1268 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1269 		name = "Namespace Write Protection Config";
1270 		break;
1271 	default:
1272 		name = "Unknown";
1273 		break;
1274 	}
1275 
1276 	return (name);
1277 }
1278 
1279 static void
1280 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1281     struct nvme_feature_obj *feat,
1282     struct nvme_command *command,
1283     struct nvme_completion *compl)
1284 {
1285 
1286 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1287 }
1288 
1289 static void
1290 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1291     struct nvme_feature_obj *feat,
1292     struct nvme_command *command,
1293     struct nvme_completion *compl)
1294 {
1295 	uint32_t i;
1296 	uint32_t cdw11 = command->cdw11;
1297 	uint16_t iv;
1298 	bool cd;
1299 
1300 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1301 
1302 	iv = cdw11 & 0xffff;
1303 	cd = cdw11 & (1 << 16);
1304 
1305 	if (iv > (sc->max_queues + 1)) {
1306 		return;
1307 	}
1308 
1309 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1310 	if ((iv == 0) && !cd)
1311 		return;
1312 
1313 	/* Requested Interrupt Vector must be used by a CQ */
1314 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1315 		if (sc->compl_queues[i].intr_vec == iv) {
1316 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1317 		}
1318 	}
1319 
1320 }
1321 
1322 static void
1323 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1324     struct nvme_feature_obj *feat,
1325     struct nvme_command *command,
1326     struct nvme_completion *compl)
1327 {
1328 	uint16_t nqr;	/* Number of Queues Requested */
1329 
1330 	if (sc->num_q_is_set) {
1331 		WPRINTF("%s: Number of Queues already set", __func__);
1332 		pci_nvme_status_genc(&compl->status,
1333 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1334 		return;
1335 	}
1336 
1337 	nqr = command->cdw11 & 0xFFFF;
1338 	if (nqr == 0xffff) {
1339 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1340 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1341 		return;
1342 	}
1343 
1344 	sc->num_squeues = ONE_BASED(nqr);
1345 	if (sc->num_squeues > sc->max_queues) {
1346 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1347 					sc->max_queues);
1348 		sc->num_squeues = sc->max_queues;
1349 	}
1350 
1351 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1352 	if (nqr == 0xffff) {
1353 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1354 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1355 		return;
1356 	}
1357 
1358 	sc->num_cqueues = ONE_BASED(nqr);
1359 	if (sc->num_cqueues > sc->max_queues) {
1360 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1361 					sc->max_queues);
1362 		sc->num_cqueues = sc->max_queues;
1363 	}
1364 
1365 	/* Patch the command value which will be saved on callback's return */
1366 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1367 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1368 
1369 	sc->num_q_is_set = true;
1370 }
1371 
1372 static int
1373 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1374 	struct nvme_completion *compl)
1375 {
1376 	struct nvme_feature_obj *feat;
1377 	uint32_t nsid = command->nsid;
1378 	uint8_t fid = command->cdw10 & 0xFF;
1379 
1380 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1381 
1382 	if (fid >= NVME_FID_MAX) {
1383 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1384 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1385 		return (1);
1386 	}
1387 	feat = &sc->feat[fid];
1388 
1389 	if (!feat->namespace_specific &&
1390 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1391 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1392 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1393 		return (1);
1394 	}
1395 
1396 	compl->cdw0 = 0;
1397 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1398 
1399 	if (feat->set)
1400 		feat->set(sc, feat, command, compl);
1401 
1402 	if (compl->status == NVME_SC_SUCCESS)
1403 		feat->cdw11 = command->cdw11;
1404 
1405 	return (0);
1406 }
1407 
1408 static int
1409 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1410 	struct nvme_completion* compl)
1411 {
1412 	struct nvme_feature_obj *feat;
1413 	uint8_t fid = command->cdw10 & 0xFF;
1414 
1415 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1416 
1417 	if (fid >= NVME_FID_MAX) {
1418 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1419 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1420 		return (1);
1421 	}
1422 
1423 	compl->cdw0 = 0;
1424 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1425 
1426 	feat = &sc->feat[fid];
1427 	if (feat->get) {
1428 		feat->get(sc, feat, command, compl);
1429 	}
1430 
1431 	if (compl->status == NVME_SC_SUCCESS) {
1432 		compl->cdw0 = feat->cdw11;
1433 	}
1434 
1435 	return (0);
1436 }
1437 
1438 static int
1439 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1440 	struct nvme_completion* compl)
1441 {
1442 	uint8_t	ses, lbaf, pi;
1443 
1444 	/* Only supports Secure Erase Setting - User Data Erase */
1445 	ses = (command->cdw10 >> 9) & 0x7;
1446 	if (ses > 0x1) {
1447 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1448 		return (1);
1449 	}
1450 
1451 	/* Only supports a single LBA Format */
1452 	lbaf = command->cdw10 & 0xf;
1453 	if (lbaf != 0) {
1454 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1455 		    NVME_SC_INVALID_FORMAT);
1456 		return (1);
1457 	}
1458 
1459 	/* Doesn't support Protection Infomation */
1460 	pi = (command->cdw10 >> 5) & 0x7;
1461 	if (pi != 0) {
1462 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1463 		return (1);
1464 	}
1465 
1466 	if (sc->nvstore.type == NVME_STOR_RAM) {
1467 		if (sc->nvstore.ctx)
1468 			free(sc->nvstore.ctx);
1469 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1470 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1471 	} else {
1472 		struct pci_nvme_ioreq *req;
1473 		int err;
1474 
1475 		req = pci_nvme_get_ioreq(sc);
1476 		if (req == NULL) {
1477 			pci_nvme_status_genc(&compl->status,
1478 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1479 			WPRINTF("%s: unable to allocate IO req", __func__);
1480 			return (1);
1481 		}
1482 		req->nvme_sq = &sc->submit_queues[0];
1483 		req->sqid = 0;
1484 		req->opc = command->opc;
1485 		req->cid = command->cid;
1486 		req->nsid = command->nsid;
1487 
1488 		req->io_req.br_offset = 0;
1489 		req->io_req.br_resid = sc->nvstore.size;
1490 		req->io_req.br_callback = pci_nvme_io_done;
1491 
1492 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1493 		if (err) {
1494 			pci_nvme_status_genc(&compl->status,
1495 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1496 			pci_nvme_release_ioreq(sc, req);
1497 		}
1498 	}
1499 
1500 	return (1);
1501 }
1502 
1503 static int
1504 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1505 	struct nvme_completion* compl)
1506 {
1507 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1508 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1509 
1510 	/* TODO: search for the command ID and abort it */
1511 
1512 	compl->cdw0 = 1;
1513 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1514 	return (1);
1515 }
1516 
1517 static int
1518 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1519 	struct nvme_command* command, struct nvme_completion* compl)
1520 {
1521 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1522 
1523 	/* Don't exceed the Async Event Request Limit (AERL). */
1524 	if (pci_nvme_aer_limit_reached(sc)) {
1525 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1526 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1527 		return (1);
1528 	}
1529 
1530 	if (pci_nvme_aer_add(sc, command->cid)) {
1531 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1532 				NVME_SC_INTERNAL_DEVICE_ERROR);
1533 		return (1);
1534 	}
1535 
1536 	/*
1537 	 * Raise events when they happen based on the Set Features cmd.
1538 	 * These events happen async, so only set completion successful if
1539 	 * there is an event reflective of the request to get event.
1540 	 */
1541 	compl->status = NVME_NO_STATUS;
1542 
1543 	return (0);
1544 }
1545 
1546 static void
1547 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1548 {
1549 	struct nvme_completion compl;
1550 	struct nvme_command *cmd;
1551 	struct nvme_submission_queue *sq;
1552 	struct nvme_completion_queue *cq;
1553 	uint16_t sqhead;
1554 
1555 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1556 
1557 	sq = &sc->submit_queues[0];
1558 	cq = &sc->compl_queues[0];
1559 
1560 	pthread_mutex_lock(&sq->mtx);
1561 
1562 	sqhead = sq->head;
1563 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1564 
1565 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1566 		cmd = &(sq->qbase)[sqhead];
1567 		compl.cdw0 = 0;
1568 		compl.status = 0;
1569 
1570 		switch (cmd->opc) {
1571 		case NVME_OPC_DELETE_IO_SQ:
1572 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1573 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1574 			break;
1575 		case NVME_OPC_CREATE_IO_SQ:
1576 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1577 			nvme_opc_create_io_sq(sc, cmd, &compl);
1578 			break;
1579 		case NVME_OPC_DELETE_IO_CQ:
1580 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1581 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1582 			break;
1583 		case NVME_OPC_CREATE_IO_CQ:
1584 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1585 			nvme_opc_create_io_cq(sc, cmd, &compl);
1586 			break;
1587 		case NVME_OPC_GET_LOG_PAGE:
1588 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1589 			nvme_opc_get_log_page(sc, cmd, &compl);
1590 			break;
1591 		case NVME_OPC_IDENTIFY:
1592 			DPRINTF("%s command IDENTIFY", __func__);
1593 			nvme_opc_identify(sc, cmd, &compl);
1594 			break;
1595 		case NVME_OPC_ABORT:
1596 			DPRINTF("%s command ABORT", __func__);
1597 			nvme_opc_abort(sc, cmd, &compl);
1598 			break;
1599 		case NVME_OPC_SET_FEATURES:
1600 			DPRINTF("%s command SET_FEATURES", __func__);
1601 			nvme_opc_set_features(sc, cmd, &compl);
1602 			break;
1603 		case NVME_OPC_GET_FEATURES:
1604 			DPRINTF("%s command GET_FEATURES", __func__);
1605 			nvme_opc_get_features(sc, cmd, &compl);
1606 			break;
1607 		case NVME_OPC_FIRMWARE_ACTIVATE:
1608 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1609 			pci_nvme_status_tc(&compl.status,
1610 			    NVME_SCT_COMMAND_SPECIFIC,
1611 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1612 			break;
1613 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1614 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1615 			nvme_opc_async_event_req(sc, cmd, &compl);
1616 			break;
1617 		case NVME_OPC_FORMAT_NVM:
1618 			DPRINTF("%s command FORMAT_NVM", __func__);
1619 			if ((sc->ctrldata.oacs &
1620 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1621 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1622 			}
1623 			compl.status = NVME_NO_STATUS;
1624 			nvme_opc_format_nvm(sc, cmd, &compl);
1625 			break;
1626 		default:
1627 			DPRINTF("0x%x command is not implemented",
1628 			    cmd->opc);
1629 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1630 		}
1631 		sqhead = (sqhead + 1) % sq->size;
1632 
1633 		if (NVME_COMPLETION_VALID(compl)) {
1634 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1635 			    compl.cdw0,
1636 			    cmd->cid,
1637 			    0,		/* SQID */
1638 			    compl.status);
1639 		}
1640 	}
1641 
1642 	DPRINTF("setting sqhead %u", sqhead);
1643 	sq->head = sqhead;
1644 
1645 	if (cq->head != cq->tail)
1646 		pci_generate_msix(sc->nsc_pi, 0);
1647 
1648 	pthread_mutex_unlock(&sq->mtx);
1649 }
1650 
1651 /*
1652  * Update the Write and Read statistics reported in SMART data
1653  *
1654  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1655  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1656  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
1657  */
1658 static void
1659 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1660     size_t bytes, uint16_t status)
1661 {
1662 
1663 	pthread_mutex_lock(&sc->mtx);
1664 	switch (opc) {
1665 	case NVME_OPC_WRITE:
1666 		sc->write_commands++;
1667 		if (status != NVME_SC_SUCCESS)
1668 			break;
1669 		sc->write_dunits_remainder += (bytes / 512);
1670 		while (sc->write_dunits_remainder >= 1000) {
1671 			sc->write_data_units++;
1672 			sc->write_dunits_remainder -= 1000;
1673 		}
1674 		break;
1675 	case NVME_OPC_READ:
1676 		sc->read_commands++;
1677 		if (status != NVME_SC_SUCCESS)
1678 			break;
1679 		sc->read_dunits_remainder += (bytes / 512);
1680 		while (sc->read_dunits_remainder >= 1000) {
1681 			sc->read_data_units++;
1682 			sc->read_dunits_remainder -= 1000;
1683 		}
1684 		break;
1685 	default:
1686 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1687 		break;
1688 	}
1689 	pthread_mutex_unlock(&sc->mtx);
1690 }
1691 
1692 /*
1693  * Check if the combination of Starting LBA (slba) and Number of Logical
1694  * Blocks (nlb) exceeds the range of the underlying storage.
1695  *
1696  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1697  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1698  * overflow.
1699  */
1700 static bool
1701 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1702     uint32_t nlb)
1703 {
1704 	size_t	offset, bytes;
1705 
1706 	/* Overflow check of multiplying Starting LBA by the sector size */
1707 	if (slba >> (64 - nvstore->sectsz_bits))
1708 		return (true);
1709 
1710 	offset = slba << nvstore->sectsz_bits;
1711 	bytes = nlb << nvstore->sectsz_bits;
1712 
1713 	/* Overflow check of Number of Logical Blocks */
1714 	if ((nvstore->size - offset) < bytes)
1715 		return (true);
1716 
1717 	return (false);
1718 }
1719 
1720 static int
1721 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1722 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1723 {
1724 	int iovidx;
1725 
1726 	if (req == NULL)
1727 		return (-1);
1728 
1729 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1730 		return (-1);
1731 	}
1732 
1733 	/* concatenate contig block-iovs to minimize number of iovs */
1734 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1735 		iovidx = req->io_req.br_iovcnt - 1;
1736 
1737 		req->io_req.br_iov[iovidx].iov_base =
1738 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1739 				     req->prev_gpaddr, size);
1740 
1741 		req->prev_size += size;
1742 		req->io_req.br_resid += size;
1743 
1744 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1745 	} else {
1746 		iovidx = req->io_req.br_iovcnt;
1747 		if (iovidx == 0) {
1748 			req->io_req.br_offset = lba;
1749 			req->io_req.br_resid = 0;
1750 			req->io_req.br_param = req;
1751 		}
1752 
1753 		req->io_req.br_iov[iovidx].iov_base =
1754 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1755 				     gpaddr, size);
1756 
1757 		req->io_req.br_iov[iovidx].iov_len = size;
1758 
1759 		req->prev_gpaddr = gpaddr;
1760 		req->prev_size = size;
1761 		req->io_req.br_resid += size;
1762 
1763 		req->io_req.br_iovcnt++;
1764 	}
1765 
1766 	return (0);
1767 }
1768 
1769 static void
1770 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1771 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1772 	uint32_t cdw0, uint16_t status)
1773 {
1774 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1775 
1776 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1777 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1778 		 NVME_STATUS_GET_SC(status));
1779 
1780 	pci_nvme_cq_update(sc, cq,
1781 	    0,		/* CDW0 */
1782 	    cid,
1783 	    sqid,
1784 	    status);
1785 
1786 	if (cq->head != cq->tail) {
1787 		if (cq->intr_en & NVME_CQ_INTEN) {
1788 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1789 		} else {
1790 			DPRINTF("%s: CQ%u interrupt disabled",
1791 						__func__, sq->cqid);
1792 		}
1793 	}
1794 }
1795 
1796 static void
1797 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1798 {
1799 	req->sc = NULL;
1800 	req->nvme_sq = NULL;
1801 	req->sqid = 0;
1802 
1803 	pthread_mutex_lock(&sc->mtx);
1804 
1805 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1806 	sc->pending_ios--;
1807 
1808 	/* when no more IO pending, can set to ready if device reset/enabled */
1809 	if (sc->pending_ios == 0 &&
1810 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1811 		sc->regs.csts |= NVME_CSTS_RDY;
1812 
1813 	pthread_mutex_unlock(&sc->mtx);
1814 
1815 	sem_post(&sc->iosemlock);
1816 }
1817 
1818 static struct pci_nvme_ioreq *
1819 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1820 {
1821 	struct pci_nvme_ioreq *req = NULL;;
1822 
1823 	sem_wait(&sc->iosemlock);
1824 	pthread_mutex_lock(&sc->mtx);
1825 
1826 	req = STAILQ_FIRST(&sc->ioreqs_free);
1827 	assert(req != NULL);
1828 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1829 
1830 	req->sc = sc;
1831 
1832 	sc->pending_ios++;
1833 
1834 	pthread_mutex_unlock(&sc->mtx);
1835 
1836 	req->io_req.br_iovcnt = 0;
1837 	req->io_req.br_offset = 0;
1838 	req->io_req.br_resid = 0;
1839 	req->io_req.br_param = req;
1840 	req->prev_gpaddr = 0;
1841 	req->prev_size = 0;
1842 
1843 	return req;
1844 }
1845 
1846 static void
1847 pci_nvme_io_done(struct blockif_req *br, int err)
1848 {
1849 	struct pci_nvme_ioreq *req = br->br_param;
1850 	struct nvme_submission_queue *sq = req->nvme_sq;
1851 	uint16_t code, status;
1852 
1853 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1854 
1855 	/* TODO return correct error */
1856 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1857 	pci_nvme_status_genc(&status, code);
1858 
1859 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1860 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1861 	    req->bytes, status);
1862 	pci_nvme_release_ioreq(req->sc, req);
1863 }
1864 
1865 /*
1866  * Implements the Flush command. The specification states:
1867  *    If a volatile write cache is not present, Flush commands complete
1868  *    successfully and have no effect
1869  * in the description of the Volatile Write Cache (VWC) field of the Identify
1870  * Controller data. Therefore, set status to Success if the command is
1871  * not supported (i.e. RAM or as indicated by the blockif).
1872  */
1873 static bool
1874 nvme_opc_flush(struct pci_nvme_softc *sc,
1875     struct nvme_command *cmd,
1876     struct pci_nvme_blockstore *nvstore,
1877     struct pci_nvme_ioreq *req,
1878     uint16_t *status)
1879 {
1880 	bool pending = false;
1881 
1882 	if (nvstore->type == NVME_STOR_RAM) {
1883 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1884 	} else {
1885 		int err;
1886 
1887 		req->io_req.br_callback = pci_nvme_io_done;
1888 
1889 		err = blockif_flush(nvstore->ctx, &req->io_req);
1890 		switch (err) {
1891 		case 0:
1892 			pending = true;
1893 			break;
1894 		case EOPNOTSUPP:
1895 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1896 			break;
1897 		default:
1898 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1899 		}
1900 	}
1901 
1902 	return (pending);
1903 }
1904 
1905 static uint16_t
1906 nvme_write_read_ram(struct pci_nvme_softc *sc,
1907     struct pci_nvme_blockstore *nvstore,
1908     uint64_t prp1, uint64_t prp2,
1909     size_t offset, uint64_t bytes,
1910     bool is_write)
1911 {
1912 	uint8_t *buf = nvstore->ctx;
1913 	enum nvme_copy_dir dir;
1914 	uint16_t status;
1915 
1916 	if (is_write)
1917 		dir = NVME_COPY_TO_PRP;
1918 	else
1919 		dir = NVME_COPY_FROM_PRP;
1920 
1921 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1922 	    buf + offset, bytes, dir))
1923 		pci_nvme_status_genc(&status,
1924 		    NVME_SC_DATA_TRANSFER_ERROR);
1925 	else
1926 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1927 
1928 	return (status);
1929 }
1930 
1931 static uint16_t
1932 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1933     struct pci_nvme_blockstore *nvstore,
1934     struct pci_nvme_ioreq *req,
1935     uint64_t prp1, uint64_t prp2,
1936     size_t offset, uint64_t bytes,
1937     bool is_write)
1938 {
1939 	uint64_t size;
1940 	int err;
1941 	uint16_t status = NVME_NO_STATUS;
1942 
1943 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1944 	if (pci_nvme_append_iov_req(sc, req, prp1,
1945 	    size, is_write, offset)) {
1946 		pci_nvme_status_genc(&status,
1947 		    NVME_SC_DATA_TRANSFER_ERROR);
1948 		goto out;
1949 	}
1950 
1951 	offset += size;
1952 	bytes  -= size;
1953 
1954 	if (bytes == 0) {
1955 		;
1956 	} else if (bytes <= PAGE_SIZE) {
1957 		size = bytes;
1958 		if (pci_nvme_append_iov_req(sc, req, prp2,
1959 		    size, is_write, offset)) {
1960 			pci_nvme_status_genc(&status,
1961 			    NVME_SC_DATA_TRANSFER_ERROR);
1962 			goto out;
1963 		}
1964 	} else {
1965 		void *vmctx = sc->nsc_pi->pi_vmctx;
1966 		uint64_t *prp_list = &prp2;
1967 		uint64_t *last = prp_list;
1968 
1969 		/* PRP2 is pointer to a physical region page list */
1970 		while (bytes) {
1971 			/* Last entry in list points to the next list */
1972 			if (prp_list == last) {
1973 				uint64_t prp = *prp_list;
1974 
1975 				prp_list = paddr_guest2host(vmctx, prp,
1976 				    PAGE_SIZE - (prp % PAGE_SIZE));
1977 				last = prp_list + (NVME_PRP2_ITEMS - 1);
1978 			}
1979 
1980 			size = MIN(bytes, PAGE_SIZE);
1981 
1982 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
1983 			    size, is_write, offset)) {
1984 				pci_nvme_status_genc(&status,
1985 				    NVME_SC_DATA_TRANSFER_ERROR);
1986 				goto out;
1987 			}
1988 
1989 			offset += size;
1990 			bytes  -= size;
1991 
1992 			prp_list++;
1993 		}
1994 	}
1995 	req->io_req.br_callback = pci_nvme_io_done;
1996 	if (is_write)
1997 		err = blockif_write(nvstore->ctx, &req->io_req);
1998 	else
1999 		err = blockif_read(nvstore->ctx, &req->io_req);
2000 
2001 	if (err)
2002 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2003 out:
2004 	return (status);
2005 }
2006 
2007 static bool
2008 nvme_opc_write_read(struct pci_nvme_softc *sc,
2009     struct nvme_command *cmd,
2010     struct pci_nvme_blockstore *nvstore,
2011     struct pci_nvme_ioreq *req,
2012     uint16_t *status)
2013 {
2014 	uint64_t lba, nblocks, bytes;
2015 	size_t offset;
2016 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2017 	bool pending = false;
2018 
2019 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2020 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2021 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2022 		WPRINTF("%s command would exceed LBA range", __func__);
2023 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2024 		goto out;
2025 	}
2026 
2027 	bytes  = nblocks << nvstore->sectsz_bits;
2028 	if (bytes > NVME_MAX_DATA_SIZE) {
2029 		WPRINTF("%s command would exceed MDTS", __func__);
2030 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2031 		goto out;
2032 	}
2033 
2034 	offset = lba << nvstore->sectsz_bits;
2035 
2036 	req->bytes = bytes;
2037 	req->io_req.br_offset = lba;
2038 
2039 	/* PRP bits 1:0 must be zero */
2040 	cmd->prp1 &= ~0x3UL;
2041 	cmd->prp2 &= ~0x3UL;
2042 
2043 	if (nvstore->type == NVME_STOR_RAM) {
2044 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2045 		    cmd->prp2, offset, bytes, is_write);
2046 	} else {
2047 		*status = nvme_write_read_blockif(sc, nvstore, req,
2048 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2049 
2050 		if (*status == NVME_NO_STATUS)
2051 			pending = true;
2052 	}
2053 out:
2054 	if (!pending)
2055 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2056 
2057 	return (pending);
2058 }
2059 
2060 static void
2061 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2062 {
2063 	struct pci_nvme_ioreq *req = br->br_param;
2064 	struct pci_nvme_softc *sc = req->sc;
2065 	bool done = true;
2066 	uint16_t status;
2067 
2068 	if (err) {
2069 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2070 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2071 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2072 	} else {
2073 		struct iovec *iov = req->io_req.br_iov;
2074 
2075 		req->prev_gpaddr++;
2076 		iov += req->prev_gpaddr;
2077 
2078 		/* The iov_* values already include the sector size */
2079 		req->io_req.br_offset = (off_t)iov->iov_base;
2080 		req->io_req.br_resid = iov->iov_len;
2081 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2082 			pci_nvme_status_genc(&status,
2083 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2084 		} else
2085 			done = false;
2086 	}
2087 
2088 	if (done) {
2089 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2090 		    req->cid, 0, status);
2091 		pci_nvme_release_ioreq(sc, req);
2092 	}
2093 }
2094 
2095 static bool
2096 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2097     struct nvme_command *cmd,
2098     struct pci_nvme_blockstore *nvstore,
2099     struct pci_nvme_ioreq *req,
2100     uint16_t *status)
2101 {
2102 	struct nvme_dsm_range *range;
2103 	uint32_t nr, r, non_zero, dr;
2104 	int err;
2105 	bool pending = false;
2106 
2107 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2108 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2109 		goto out;
2110 	}
2111 
2112 	nr = cmd->cdw10 & 0xff;
2113 
2114 	/* copy locally because a range entry could straddle PRPs */
2115 	range = calloc(1, NVME_MAX_DSM_TRIM);
2116 	if (range == NULL) {
2117 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2118 		goto out;
2119 	}
2120 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2121 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2122 
2123 	/* Check for invalid ranges and the number of non-zero lengths */
2124 	non_zero = 0;
2125 	for (r = 0; r <= nr; r++) {
2126 		if (pci_nvme_out_of_range(nvstore,
2127 		    range[r].starting_lba, range[r].length)) {
2128 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2129 			goto out;
2130 		}
2131 		if (range[r].length != 0)
2132 			non_zero++;
2133 	}
2134 
2135 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2136 		size_t offset, bytes;
2137 		int sectsz_bits = sc->nvstore.sectsz_bits;
2138 
2139 		/*
2140 		 * DSM calls are advisory only, and compliant controllers
2141 		 * may choose to take no actions (i.e. return Success).
2142 		 */
2143 		if (!nvstore->deallocate) {
2144 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2145 			goto out;
2146 		}
2147 
2148 		/* If all ranges have a zero length, return Success */
2149 		if (non_zero == 0) {
2150 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2151 			goto out;
2152 		}
2153 
2154 		if (req == NULL) {
2155 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2156 			goto out;
2157 		}
2158 
2159 		offset = range[0].starting_lba << sectsz_bits;
2160 		bytes = range[0].length << sectsz_bits;
2161 
2162 		/*
2163 		 * If the request is for more than a single range, store
2164 		 * the ranges in the br_iov. Optimize for the common case
2165 		 * of a single range.
2166 		 *
2167 		 * Note that NVMe Number of Ranges is a zero based value
2168 		 */
2169 		req->io_req.br_iovcnt = 0;
2170 		req->io_req.br_offset = offset;
2171 		req->io_req.br_resid = bytes;
2172 
2173 		if (nr == 0) {
2174 			req->io_req.br_callback = pci_nvme_io_done;
2175 		} else {
2176 			struct iovec *iov = req->io_req.br_iov;
2177 
2178 			for (r = 0, dr = 0; r <= nr; r++) {
2179 				offset = range[r].starting_lba << sectsz_bits;
2180 				bytes = range[r].length << sectsz_bits;
2181 				if (bytes == 0)
2182 					continue;
2183 
2184 				if ((nvstore->size - offset) < bytes) {
2185 					pci_nvme_status_genc(status,
2186 					    NVME_SC_LBA_OUT_OF_RANGE);
2187 					goto out;
2188 				}
2189 				iov[dr].iov_base = (void *)offset;
2190 				iov[dr].iov_len = bytes;
2191 				dr++;
2192 			}
2193 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2194 
2195 			/*
2196 			 * Use prev_gpaddr to track the current entry and
2197 			 * prev_size to track the number of entries
2198 			 */
2199 			req->prev_gpaddr = 0;
2200 			req->prev_size = dr;
2201 		}
2202 
2203 		err = blockif_delete(nvstore->ctx, &req->io_req);
2204 		if (err)
2205 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2206 		else
2207 			pending = true;
2208 	}
2209 out:
2210 	free(range);
2211 	return (pending);
2212 }
2213 
2214 static void
2215 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2216 {
2217 	struct nvme_submission_queue *sq;
2218 	uint16_t status;
2219 	uint16_t sqhead;
2220 
2221 	/* handle all submissions up to sq->tail index */
2222 	sq = &sc->submit_queues[idx];
2223 
2224 	pthread_mutex_lock(&sq->mtx);
2225 
2226 	sqhead = sq->head;
2227 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2228 	         idx, sqhead, sq->tail, sq->qbase);
2229 
2230 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2231 		struct nvme_command *cmd;
2232 		struct pci_nvme_ioreq *req;
2233 		uint32_t nsid;
2234 		bool pending;
2235 
2236 		pending = false;
2237 		req = NULL;
2238 		status = 0;
2239 
2240 		cmd = &sq->qbase[sqhead];
2241 		sqhead = (sqhead + 1) % sq->size;
2242 
2243 		nsid = le32toh(cmd->nsid);
2244 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2245 			pci_nvme_status_genc(&status,
2246 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2247 			status |=
2248 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2249 			goto complete;
2250  		}
2251 
2252 		req = pci_nvme_get_ioreq(sc);
2253 		if (req == NULL) {
2254 			pci_nvme_status_genc(&status,
2255 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2256 			WPRINTF("%s: unable to allocate IO req", __func__);
2257 			goto complete;
2258 		}
2259 		req->nvme_sq = sq;
2260 		req->sqid = idx;
2261 		req->opc = cmd->opc;
2262 		req->cid = cmd->cid;
2263 		req->nsid = cmd->nsid;
2264 
2265 		switch (cmd->opc) {
2266 		case NVME_OPC_FLUSH:
2267 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2268 			    req, &status);
2269  			break;
2270 		case NVME_OPC_WRITE:
2271 		case NVME_OPC_READ:
2272 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2273 			    req, &status);
2274 			break;
2275 		case NVME_OPC_WRITE_ZEROES:
2276 			/* TODO: write zeroes
2277 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2278 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2279 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2280 			break;
2281 		case NVME_OPC_DATASET_MANAGEMENT:
2282  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2283 			    req, &status);
2284 			break;
2285  		default:
2286  			WPRINTF("%s unhandled io command 0x%x",
2287 			    __func__, cmd->opc);
2288 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2289 		}
2290 complete:
2291 		if (!pending) {
2292 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2293 			    status);
2294 			if (req != NULL)
2295 				pci_nvme_release_ioreq(sc, req);
2296 		}
2297 	}
2298 
2299 	sq->head = sqhead;
2300 
2301 	pthread_mutex_unlock(&sq->mtx);
2302 }
2303 
2304 static void
2305 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2306 	uint64_t idx, int is_sq, uint64_t value)
2307 {
2308 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2309 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2310 
2311 	if (is_sq) {
2312 		if (idx > sc->num_squeues) {
2313 			WPRINTF("%s queue index %lu overflow from "
2314 			         "guest (max %u)",
2315 			         __func__, idx, sc->num_squeues);
2316 			return;
2317 		}
2318 
2319 		atomic_store_short(&sc->submit_queues[idx].tail,
2320 		                   (uint16_t)value);
2321 
2322 		if (idx == 0) {
2323 			pci_nvme_handle_admin_cmd(sc, value);
2324 		} else {
2325 			/* submission queue; handle new entries in SQ */
2326 			if (idx > sc->num_squeues) {
2327 				WPRINTF("%s SQ index %lu overflow from "
2328 				         "guest (max %u)",
2329 				         __func__, idx, sc->num_squeues);
2330 				return;
2331 			}
2332 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2333 		}
2334 	} else {
2335 		if (idx > sc->num_cqueues) {
2336 			WPRINTF("%s queue index %lu overflow from "
2337 			         "guest (max %u)",
2338 			         __func__, idx, sc->num_cqueues);
2339 			return;
2340 		}
2341 
2342 		atomic_store_short(&sc->compl_queues[idx].head,
2343 				(uint16_t)value);
2344 	}
2345 }
2346 
2347 static void
2348 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2349 {
2350 	const char *s = iswrite ? "WRITE" : "READ";
2351 
2352 	switch (offset) {
2353 	case NVME_CR_CAP_LOW:
2354 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2355 		break;
2356 	case NVME_CR_CAP_HI:
2357 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2358 		break;
2359 	case NVME_CR_VS:
2360 		DPRINTF("%s %s NVME_CR_VS", func, s);
2361 		break;
2362 	case NVME_CR_INTMS:
2363 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2364 		break;
2365 	case NVME_CR_INTMC:
2366 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2367 		break;
2368 	case NVME_CR_CC:
2369 		DPRINTF("%s %s NVME_CR_CC", func, s);
2370 		break;
2371 	case NVME_CR_CSTS:
2372 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2373 		break;
2374 	case NVME_CR_NSSR:
2375 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2376 		break;
2377 	case NVME_CR_AQA:
2378 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2379 		break;
2380 	case NVME_CR_ASQ_LOW:
2381 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2382 		break;
2383 	case NVME_CR_ASQ_HI:
2384 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2385 		break;
2386 	case NVME_CR_ACQ_LOW:
2387 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2388 		break;
2389 	case NVME_CR_ACQ_HI:
2390 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2391 		break;
2392 	default:
2393 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2394 	}
2395 
2396 }
2397 
2398 static void
2399 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2400 	uint64_t offset, int size, uint64_t value)
2401 {
2402 	uint32_t ccreg;
2403 
2404 	if (offset >= NVME_DOORBELL_OFFSET) {
2405 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2406 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2407 		int is_sq = (belloffset % 8) < 4;
2408 
2409 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2410 			WPRINTF("guest attempted an overflow write offset "
2411 			         "0x%lx, val 0x%lx in %s",
2412 			         offset, value, __func__);
2413 			return;
2414 		}
2415 
2416 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2417 		return;
2418 	}
2419 
2420 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2421 	        offset, size, value);
2422 
2423 	if (size != 4) {
2424 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2425 		         "val 0x%lx) to bar0 in %s",
2426 		         size, offset, value, __func__);
2427 		/* TODO: shutdown device */
2428 		return;
2429 	}
2430 
2431 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2432 
2433 	pthread_mutex_lock(&sc->mtx);
2434 
2435 	switch (offset) {
2436 	case NVME_CR_CAP_LOW:
2437 	case NVME_CR_CAP_HI:
2438 		/* readonly */
2439 		break;
2440 	case NVME_CR_VS:
2441 		/* readonly */
2442 		break;
2443 	case NVME_CR_INTMS:
2444 		/* MSI-X, so ignore */
2445 		break;
2446 	case NVME_CR_INTMC:
2447 		/* MSI-X, so ignore */
2448 		break;
2449 	case NVME_CR_CC:
2450 		ccreg = (uint32_t)value;
2451 
2452 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2453 		         "iocqes %u",
2454 		        __func__,
2455 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2456 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2457 			 NVME_CC_GET_IOCQES(ccreg));
2458 
2459 		if (NVME_CC_GET_SHN(ccreg)) {
2460 			/* perform shutdown - flush out data to backend */
2461 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2462 			    NVME_CSTS_REG_SHST_SHIFT);
2463 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2464 			    NVME_CSTS_REG_SHST_SHIFT;
2465 		}
2466 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2467 			if (NVME_CC_GET_EN(ccreg) == 0)
2468 				/* transition 1-> causes controller reset */
2469 				pci_nvme_reset_locked(sc);
2470 			else
2471 				pci_nvme_init_controller(ctx, sc);
2472 		}
2473 
2474 		/* Insert the iocqes, iosqes and en bits from the write */
2475 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2476 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2477 		if (NVME_CC_GET_EN(ccreg) == 0) {
2478 			/* Insert the ams, mps and css bit fields */
2479 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2480 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2481 			sc->regs.csts &= ~NVME_CSTS_RDY;
2482 		} else if (sc->pending_ios == 0) {
2483 			sc->regs.csts |= NVME_CSTS_RDY;
2484 		}
2485 		break;
2486 	case NVME_CR_CSTS:
2487 		break;
2488 	case NVME_CR_NSSR:
2489 		/* ignore writes; don't support subsystem reset */
2490 		break;
2491 	case NVME_CR_AQA:
2492 		sc->regs.aqa = (uint32_t)value;
2493 		break;
2494 	case NVME_CR_ASQ_LOW:
2495 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2496 		               (0xFFFFF000 & value);
2497 		break;
2498 	case NVME_CR_ASQ_HI:
2499 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2500 		               (value << 32);
2501 		break;
2502 	case NVME_CR_ACQ_LOW:
2503 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2504 		               (0xFFFFF000 & value);
2505 		break;
2506 	case NVME_CR_ACQ_HI:
2507 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2508 		               (value << 32);
2509 		break;
2510 	default:
2511 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2512 		         __func__, offset, value, size);
2513 	}
2514 	pthread_mutex_unlock(&sc->mtx);
2515 }
2516 
2517 static void
2518 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2519                 int baridx, uint64_t offset, int size, uint64_t value)
2520 {
2521 	struct pci_nvme_softc* sc = pi->pi_arg;
2522 
2523 	if (baridx == pci_msix_table_bar(pi) ||
2524 	    baridx == pci_msix_pba_bar(pi)) {
2525 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2526 		         " value 0x%lx", baridx, offset, size, value);
2527 
2528 		pci_emul_msix_twrite(pi, offset, size, value);
2529 		return;
2530 	}
2531 
2532 	switch (baridx) {
2533 	case 0:
2534 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2535 		break;
2536 
2537 	default:
2538 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2539 		         __func__, baridx, value);
2540 	}
2541 }
2542 
2543 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2544 	uint64_t offset, int size)
2545 {
2546 	uint64_t value;
2547 
2548 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2549 
2550 	if (offset < NVME_DOORBELL_OFFSET) {
2551 		void *p = &(sc->regs);
2552 		pthread_mutex_lock(&sc->mtx);
2553 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2554 		pthread_mutex_unlock(&sc->mtx);
2555 	} else {
2556 		value = 0;
2557                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2558 	}
2559 
2560 	switch (size) {
2561 	case 1:
2562 		value &= 0xFF;
2563 		break;
2564 	case 2:
2565 		value &= 0xFFFF;
2566 		break;
2567 	case 4:
2568 		value &= 0xFFFFFFFF;
2569 		break;
2570 	}
2571 
2572 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2573 	         offset, size, (uint32_t)value);
2574 
2575 	return (value);
2576 }
2577 
2578 
2579 
2580 static uint64_t
2581 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2582     uint64_t offset, int size)
2583 {
2584 	struct pci_nvme_softc* sc = pi->pi_arg;
2585 
2586 	if (baridx == pci_msix_table_bar(pi) ||
2587 	    baridx == pci_msix_pba_bar(pi)) {
2588 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2589 		        baridx, offset, size);
2590 
2591 		return pci_emul_msix_tread(pi, offset, size);
2592 	}
2593 
2594 	switch (baridx) {
2595 	case 0:
2596        		return pci_nvme_read_bar_0(sc, offset, size);
2597 
2598 	default:
2599 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2600 	}
2601 
2602 	return (0);
2603 }
2604 
2605 
2606 static int
2607 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2608 {
2609 	char bident[sizeof("XX:X:X")];
2610 	char	*uopt, *xopts, *config;
2611 	uint32_t sectsz;
2612 	int optidx;
2613 
2614 	sc->max_queues = NVME_QUEUES;
2615 	sc->max_qentries = NVME_MAX_QENTRIES;
2616 	sc->ioslots = NVME_IOSLOTS;
2617 	sc->num_squeues = sc->max_queues;
2618 	sc->num_cqueues = sc->max_queues;
2619 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2620 	sectsz = 0;
2621 
2622 	uopt = strdup(opts);
2623 	optidx = 0;
2624 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2625 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2626 	for (xopts = strtok(uopt, ",");
2627 	     xopts != NULL;
2628 	     xopts = strtok(NULL, ",")) {
2629 
2630 		if ((config = strchr(xopts, '=')) != NULL)
2631 			*config++ = '\0';
2632 
2633 		if (!strcmp("maxq", xopts)) {
2634 			sc->max_queues = atoi(config);
2635 		} else if (!strcmp("qsz", xopts)) {
2636 			sc->max_qentries = atoi(config);
2637 		} else if (!strcmp("ioslots", xopts)) {
2638 			sc->ioslots = atoi(config);
2639 		} else if (!strcmp("sectsz", xopts)) {
2640 			sectsz = atoi(config);
2641 		} else if (!strcmp("ser", xopts)) {
2642 			/*
2643 			 * This field indicates the Product Serial Number in
2644 			 * 7-bit ASCII, unused bytes should be space characters.
2645 			 * Ref: NVMe v1.3c.
2646 			 */
2647 			cpywithpad((char *)sc->ctrldata.sn,
2648 			           sizeof(sc->ctrldata.sn), config, ' ');
2649 		} else if (!strcmp("ram", xopts)) {
2650 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
2651 
2652 			sc->nvstore.type = NVME_STOR_RAM;
2653 			sc->nvstore.size = sz * 1024 * 1024;
2654 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2655 			sc->nvstore.sectsz = 4096;
2656 			sc->nvstore.sectsz_bits = 12;
2657 			if (sc->nvstore.ctx == NULL) {
2658 				perror("Unable to allocate RAM");
2659 				free(uopt);
2660 				return (-1);
2661 			}
2662 		} else if (!strcmp("eui64", xopts)) {
2663 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2664 		} else if (!strcmp("dsm", xopts)) {
2665 			if (!strcmp("auto", config))
2666 				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2667 			else if (!strcmp("enable", config))
2668 				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2669 			else if (!strcmp("disable", config))
2670 				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2671 		} else if (optidx == 0) {
2672 			snprintf(bident, sizeof(bident), "%d:%d",
2673 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2674 			sc->nvstore.ctx = blockif_open(xopts, bident);
2675 			if (sc->nvstore.ctx == NULL) {
2676 				perror("Could not open backing file");
2677 				free(uopt);
2678 				return (-1);
2679 			}
2680 			sc->nvstore.type = NVME_STOR_BLOCKIF;
2681 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2682 		} else {
2683 			EPRINTLN("Invalid option %s", xopts);
2684 			free(uopt);
2685 			return (-1);
2686 		}
2687 
2688 		optidx++;
2689 	}
2690 	free(uopt);
2691 
2692 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2693 		EPRINTLN("backing store not specified");
2694 		return (-1);
2695 	}
2696 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2697 		sc->nvstore.sectsz = sectsz;
2698 	else if (sc->nvstore.type != NVME_STOR_RAM)
2699 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2700 	for (sc->nvstore.sectsz_bits = 9;
2701 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2702 	     sc->nvstore.sectsz_bits++);
2703 
2704 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2705 		sc->max_queues = NVME_QUEUES;
2706 
2707 	if (sc->max_qentries <= 0) {
2708 		EPRINTLN("Invalid qsz option");
2709 		return (-1);
2710 	}
2711 	if (sc->ioslots <= 0) {
2712 		EPRINTLN("Invalid ioslots option");
2713 		return (-1);
2714 	}
2715 
2716 	return (0);
2717 }
2718 
2719 static int
2720 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2721 {
2722 	struct pci_nvme_softc *sc;
2723 	uint32_t pci_membar_sz;
2724 	int	error;
2725 
2726 	error = 0;
2727 
2728 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2729 	pi->pi_arg = sc;
2730 	sc->nsc_pi = pi;
2731 
2732 	error = pci_nvme_parse_opts(sc, opts);
2733 	if (error < 0)
2734 		goto done;
2735 	else
2736 		error = 0;
2737 
2738 	STAILQ_INIT(&sc->ioreqs_free);
2739 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2740 	for (int i = 0; i < sc->ioslots; i++) {
2741 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2742 	}
2743 
2744 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2745 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2746 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2747 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2748 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2749 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2750 
2751 	/*
2752 	 * Allocate size of NVMe registers + doorbell space for all queues.
2753 	 *
2754 	 * The specification requires a minimum memory I/O window size of 16K.
2755 	 * The Windows driver will refuse to start a device with a smaller
2756 	 * window.
2757 	 */
2758 	pci_membar_sz = sizeof(struct nvme_registers) +
2759 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2760 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2761 
2762 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2763 
2764 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2765 	if (error) {
2766 		WPRINTF("%s pci alloc mem bar failed", __func__);
2767 		goto done;
2768 	}
2769 
2770 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2771 	if (error) {
2772 		WPRINTF("%s pci add msixcap failed", __func__);
2773 		goto done;
2774 	}
2775 
2776 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2777 	if (error) {
2778 		WPRINTF("%s pci add Express capability failed", __func__);
2779 		goto done;
2780 	}
2781 
2782 	pthread_mutex_init(&sc->mtx, NULL);
2783 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2784 
2785 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2786 	/*
2787 	 * Controller data depends on Namespace data so initialize Namespace
2788 	 * data first.
2789 	 */
2790 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2791 	pci_nvme_init_ctrldata(sc);
2792 	pci_nvme_init_logpages(sc);
2793 	pci_nvme_init_features(sc);
2794 
2795 	pci_nvme_aer_init(sc);
2796 
2797 	pci_nvme_reset(sc);
2798 
2799 	pci_lintr_request(pi);
2800 
2801 done:
2802 	return (error);
2803 }
2804 
2805 
2806 struct pci_devemu pci_de_nvme = {
2807 	.pe_emu =	"nvme",
2808 	.pe_init =	pci_nvme_init,
2809 	.pe_barwrite =	pci_nvme_write,
2810 	.pe_barread =	pci_nvme_read
2811 };
2812 PCI_EMUL_SET(pci_de_nvme);
2813