xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision dce5f3abed7181cc533ca5ed3de44517775e78dd)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable,disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "config.h"
85 #include "debug.h"
86 #include "pci_emul.h"
87 
88 
/* Set to non-zero to enable the DPRINTF debug output */
static int nvme_debug = 0;
/*
 * Debug print; emits only when nvme_debug is non-zero.
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * cannot capture an "else" that follows the macro invocation.
 */
#define	DPRINTF(fmt, args...) do {					\
	if (nvme_debug)							\
		PRINTLN(fmt, ##args);					\
} while (0)
/* Warning print; always emitted */
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
92 
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

/* NOTE(review): presumably the default for the "ioslots" option - confirm */
#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

/* Upper bound applied to the number of IO SQ's and CQ's at init time */
#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

/* Number of 64-bit entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
/* Value reported in the Identify Controller MDTS field */
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
/* Largest data transfer in bytes: 2^MDTS pages of NVME_MPSMIN_BYTES each */
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
/* True when the completion's status differs from NVME_NO_STATUS */
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
117 
/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features
 *
 * Bits 15:0 hold the zero-based number of submission queues and bits 31:16
 * the zero-based number of completion queues. The expansion is fully
 * parenthesized and carries no trailing semicolon so that it behaves as an
 * ordinary expression wherever it is used.
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

/* Offset of the first doorbell register within struct nvme_registers */
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
131 
/*
 * Offsets of the NVMe controller registers.
 * NOTE(review): presumably byte offsets into the register BAR used by the
 * MMIO read/write handlers, which are not visible here - confirm.
 */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
147 
/* Bit masks for Command Dword 11 of the Create IO Queue commands */
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,	/* queue is Physically Contiguous */
	NVME_CMD_CDW11_IEN = 0x0002,	/* interrupts enabled (CQ only) */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,	/* upper 16 bits (interrupt vector on CQ create) */
};
153 
/* Direction of the copy performed by nvme_prp_memcpy() */
enum nvme_copy_dir {
	NVME_COPY_TO_PRP,	/* host buffer -> guest memory named by the PRPs */
	NVME_COPY_FROM_PRP,	/* guest memory named by the PRPs -> host buffer */
};
158 
/* Flag values for nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
161 
/* Emulation state of one Completion Queue (Admin or IO) */
struct nvme_completion_queue {
	struct nvme_completion *qbase;	/* host mapping of the guest queue; NULL if not created */
	pthread_mutex_t	mtx;		/* held while a completion entry is written */
	uint32_t	size;		/* number of entries in the queue */
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;	/* interrupt vector from Create IO CQ */
	uint32_t	intr_en;	/* NVME_CQ_* flags */
};
171 
/* Emulation state of one Submission Queue (Admin or IO) */
struct nvme_submission_queue {
	struct nvme_command *qbase;	/* host mapping of the guest queue; NULL if not created */
	pthread_mutex_t	mtx;
	uint32_t	size;		/* number of entries in the queue */
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;	/* bits 2:1 of CDW11 from Create IO SQ */
};
181 
/* Kind of backing store behind the namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,	/* backed by a blockif context */
	NVME_STOR_RAM = 1,	/* presumably the "ram=size_in_MiB" devpath - confirm */
};
186 
/* Description of the storage backing the namespace */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;		/* handed to blockif_*() when type is BLOCKIF */
	uint64_t	size;		/* capacity; size / sectsz gives the namespace size */
	uint32_t	sectsz;		/* sector size */
	uint32_t	sectsz_bits;	/* reported as LBADS, i.e. 2^sectsz_bits is the LBA data size */
	uint64_t	eui64;		/* 0 until supplied or generated in pci_nvme_init_nsdata() */
	uint32_t	deallocate:1;	/* backing store can delete (blockif_candelete()) */
};
196 
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )

/*
 * State tracked for one in-flight IO request.
 */
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;	/* linkage for the softc ioreqs_free list */
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	/*
	 * NOTE(review): presumably must directly follow io_req so these
	 * entries extend the iovec array inside struct blockif_req - confirm
	 * against block_if.h before reordering fields.
	 */
	struct iovec	iovpadding[MDTS_PAD_SIZE];
};
226 
/* Policy selected for advertising Dataset Management in ONCS */
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};
235 
struct pci_nvme_softc;
struct nvme_feature_obj;

/*
 * Signature of a per-feature handler. It receives the controller state, the
 * feature's table entry, the command and the completion to fill in.
 * NOTE(review): presumably invoked by the Set/Get Features admin command
 * handlers, which are not visible here - confirm.
 */
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

/* Table entry describing one Feature Identifier */
struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;	/* handler for setting the feature */
	nvme_feature_cb	get;	/* handler for reading the feature */
	bool namespace_specific;
};

/* Number of entries in the per-controller feature table */
#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
252 
/* One outstanding Asynchronous Event Request, queued on the softc aer_list */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
257 
/*
 * Per-device state of one emulated NVMe controller.
 */
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;	/* PCI device instance of this controller */

	pthread_mutex_t	mtx;		/* taken by pci_nvme_reset() around the reset */

	struct nvme_registers regs;	/* controller register values */

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;	/* number of IO CQ's (Admin CQ not counted) */
	uint32_t	num_squeues;	/* number of IO SQ's (Admin SQ not counted) */
	bool		num_q_is_set; /* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];	/* indexed by Feature Identifier */

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;	/* outstanding AERs in submission order */
	uint32_t	aer_count;		/* number of entries on aer_list */
};
307 
308 
/* Forward declarations for the IO request helpers defined later in the file */
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
312 
/* Controller Configuration utils */
/* Each NVME_CC_GET_*() extracts one field from a CC register value */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/* Mask covering the EN, IOSQES and IOCQES fields of CC */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/*
 * Mask covering the CSS, MPS and AMS fields of CC
 * NOTE(review): presumably the fields writable only while the controller
 * is Not ENabled - confirm against the CC write handler.
 */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

/* The RDY bit positioned within CSTS */
#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
/* The Phase bit positioned within the status word */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
/* Mask covering the Status Code Type and Status Code fields */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/* The Dataset Management bit(s) positioned within ONCS */
#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
349 
/*
 * Forward declarations of the feature handlers installed by
 * pci_nvme_init_features(); all share the nvme_feature_cb signature.
 */
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
362 
/*
 * Fill the fixed-size field dst with the characters of src followed by the
 * pad character. No terminating NUL is stored, and at most dst_size
 * characters of src are used.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t ncopy = 0;

	/* Length of src, capped at the size of the destination field */
	while (ncopy < dst_size && src[ncopy] != '\0')
		ncopy++;

	memset(dst, pad, dst_size);
	memcpy(dst, src, ncopy);
}
372 
373 static __inline void
374 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
375 {
376 
377 	*status &= ~NVME_STATUS_MASK;
378 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
379 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
380 }
381 
/*
 * Store a Status Code of the Generic Command Status type (NVME_SCT_GENERIC)
 * into a completion status word.
 */
static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
388 
389 /*
390  * Initialize the requested number or IO Submission and Completion Queues.
391  * Admin queues are allocated implicitly.
392  */
393 static void
394 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
395 {
396 	uint32_t i;
397 
398 	/*
399 	 * Allocate and initialize the Submission Queues
400 	 */
401 	if (nsq > NVME_QUEUES) {
402 		WPRINTF("%s: clamping number of SQ from %u to %u",
403 					__func__, nsq, NVME_QUEUES);
404 		nsq = NVME_QUEUES;
405 	}
406 
407 	sc->num_squeues = nsq;
408 
409 	sc->submit_queues = calloc(sc->num_squeues + 1,
410 				sizeof(struct nvme_submission_queue));
411 	if (sc->submit_queues == NULL) {
412 		WPRINTF("%s: SQ allocation failed", __func__);
413 		sc->num_squeues = 0;
414 	} else {
415 		struct nvme_submission_queue *sq = sc->submit_queues;
416 
417 		for (i = 0; i < sc->num_squeues; i++)
418 			pthread_mutex_init(&sq[i].mtx, NULL);
419 	}
420 
421 	/*
422 	 * Allocate and initialize the Completion Queues
423 	 */
424 	if (ncq > NVME_QUEUES) {
425 		WPRINTF("%s: clamping number of CQ from %u to %u",
426 					__func__, ncq, NVME_QUEUES);
427 		ncq = NVME_QUEUES;
428 	}
429 
430 	sc->num_cqueues = ncq;
431 
432 	sc->compl_queues = calloc(sc->num_cqueues + 1,
433 				sizeof(struct nvme_completion_queue));
434 	if (sc->compl_queues == NULL) {
435 		WPRINTF("%s: CQ allocation failed", __func__);
436 		sc->num_cqueues = 0;
437 	} else {
438 		struct nvme_completion_queue *cq = sc->compl_queues;
439 
440 		for (i = 0; i < sc->num_cqueues; i++)
441 			pthread_mutex_init(&cq[i].mtx, NULL);
442 	}
443 }
444 
445 static void
446 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
447 {
448 	struct nvme_controller_data *cd = &sc->ctrldata;
449 
450 	cd->vid = 0xFB5D;
451 	cd->ssvid = 0x0000;
452 
453 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
454 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
455 
456 	/* Num of submission commands that we can handle at a time (2^rab) */
457 	cd->rab   = 4;
458 
459 	/* FreeBSD OUI */
460 	cd->ieee[0] = 0x58;
461 	cd->ieee[1] = 0x9c;
462 	cd->ieee[2] = 0xfc;
463 
464 	cd->mic = 0;
465 
466 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
467 
468 	cd->ver = 0x00010300;
469 
470 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
471 	cd->acl = 2;
472 	cd->aerl = 4;
473 
474 	/* Advertise 1, Read-only firmware slot */
475 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
476 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
477 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
478 	cd->elpe = 0;	/* max error log page entries */
479 	cd->npss = 1;	/* number of power states support */
480 
481 	/* Warning Composite Temperature Threshold */
482 	cd->wctemp = 0x0157;
483 
484 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
485 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
486 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
487 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
488 	cd->nn = 1;	/* number of namespaces */
489 
490 	cd->oncs = 0;
491 	switch (sc->dataset_management) {
492 	case NVME_DATASET_MANAGEMENT_AUTO:
493 		if (sc->nvstore.deallocate)
494 			cd->oncs |= NVME_ONCS_DSM;
495 		break;
496 	case NVME_DATASET_MANAGEMENT_ENABLE:
497 		cd->oncs |= NVME_ONCS_DSM;
498 		break;
499 	default:
500 		break;
501 	}
502 
503 	cd->fna = 0x03;
504 
505 	cd->power_state[0].mp = 10;
506 }
507 
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 *
 * crc is the starting value, which allows a checksum to be continued across
 * several buffers; len is the number of bytes to process. Returns the
 * updated CRC.
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	const unsigned char *byte = buffer;
	const unsigned char *end = byte + len;

	/* One table lookup per input byte, indexed by low CRC byte ^ data */
	for (; byte != end; byte++) {
		unsigned int idx = (crc ^ *byte) & 0xffU;

		crc = (uint16_t)(((crc >> 8) & 0xffU) ^ crc16_table[idx]);
	}

	return crc;
}
557 
558 static void
559 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
560     struct nvme_namespace_data *nd, uint32_t nsid,
561     struct pci_nvme_blockstore *nvstore)
562 {
563 
564 	/* Get capacity and block size information from backing store */
565 	nd->nsze = nvstore->size / nvstore->sectsz;
566 	nd->ncap = nd->nsze;
567 	nd->nuse = nd->nsze;
568 
569 	if (nvstore->type == NVME_STOR_BLOCKIF)
570 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
571 
572 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
573 	nd->flbas = 0;
574 
575 	/* Create an EUI-64 if user did not provide one */
576 	if (nvstore->eui64 == 0) {
577 		char *data = NULL;
578 		uint64_t eui64 = nvstore->eui64;
579 
580 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
581 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
582 		    sc->nsc_pi->pi_func);
583 
584 		if (data != NULL) {
585 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
586 			free(data);
587 		}
588 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
589 	}
590 	be64enc(nd->eui64, nvstore->eui64);
591 
592 	/* LBA data-sz = 2^lbads */
593 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
594 }
595 
596 static void
597 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
598 {
599 
600 	memset(&sc->err_log, 0, sizeof(sc->err_log));
601 	memset(&sc->health_log, 0, sizeof(sc->health_log));
602 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
603 
604 	/* Set read/write remainder to round up according to spec */
605 	sc->read_dunits_remainder = 999;
606 	sc->write_dunits_remainder = 999;
607 
608 	/* Set nominal Health values checked by implementations */
609 	sc->health_log.temperature = 310;
610 	sc->health_log.available_spare = 100;
611 	sc->health_log.available_spare_threshold = 10;
612 }
613 
614 static void
615 pci_nvme_init_features(struct pci_nvme_softc *sc)
616 {
617 
618 	sc->feat[0].set = nvme_feature_invalid_cb;
619 	sc->feat[0].get = nvme_feature_invalid_cb;
620 
621 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
622 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
623 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
624 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
625 	    nvme_feature_iv_config;
626 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
627 	    nvme_feature_invalid_cb;
628 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
629 	    nvme_feature_invalid_cb;
630 }
631 
632 static void
633 pci_nvme_aer_init(struct pci_nvme_softc *sc)
634 {
635 
636 	STAILQ_INIT(&sc->aer_list);
637 	sc->aer_count = 0;
638 }
639 
640 static void
641 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
642 {
643 	struct pci_nvme_aer *aer = NULL;
644 
645 	while (!STAILQ_EMPTY(&sc->aer_list)) {
646 		aer = STAILQ_FIRST(&sc->aer_list);
647 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
648 		free(aer);
649 	}
650 
651 	pci_nvme_aer_init(sc);
652 }
653 
654 static bool
655 pci_nvme_aer_available(struct pci_nvme_softc *sc)
656 {
657 
658 	return (!STAILQ_EMPTY(&sc->aer_list));
659 }
660 
661 static bool
662 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
663 {
664 	struct nvme_controller_data *cd = &sc->ctrldata;
665 
666 	/* AERL is a zero based value while aer_count is one's based */
667 	return (sc->aer_count == (cd->aerl + 1));
668 }
669 
670 /*
671  * Add an Async Event Request
672  *
673  * Stores an AER to be returned later if the Controller needs to notify the
674  * host of an event.
675  * Note that while the NVMe spec doesn't require Controllers to return AER's
676  * in order, this implementation does preserve the order.
677  */
678 static int
679 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
680 {
681 	struct pci_nvme_aer *aer = NULL;
682 
683 	if (pci_nvme_aer_limit_reached(sc))
684 		return (-1);
685 
686 	aer = calloc(1, sizeof(struct pci_nvme_aer));
687 	if (aer == NULL)
688 		return (-1);
689 
690 	sc->aer_count++;
691 
692 	/* Save the Command ID for use in the completion message */
693 	aer->cid = cid;
694 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
695 
696 	return (0);
697 }
698 
699 /*
700  * Get an Async Event Request structure
701  *
702  * Returns a pointer to an AER previously submitted by the host or NULL if
703  * no AER's exist. Caller is responsible for freeing the returned struct.
704  */
705 static struct pci_nvme_aer *
706 pci_nvme_aer_get(struct pci_nvme_softc *sc)
707 {
708 	struct pci_nvme_aer *aer = NULL;
709 
710 	aer = STAILQ_FIRST(&sc->aer_list);
711 	if (aer != NULL) {
712 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
713 		sc->aer_count--;
714 	}
715 
716 	return (aer);
717 }
718 
719 static void
720 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
721 {
722 	uint32_t i;
723 
724 	DPRINTF("%s", __func__);
725 
726 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
727 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
728 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
729 
730 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
731 
732 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
733 
734 	sc->regs.cc = 0;
735 	sc->regs.csts = 0;
736 
737 	assert(sc->submit_queues != NULL);
738 
739 	for (i = 0; i < sc->num_squeues + 1; i++) {
740 		sc->submit_queues[i].qbase = NULL;
741 		sc->submit_queues[i].size = 0;
742 		sc->submit_queues[i].cqid = 0;
743 		sc->submit_queues[i].tail = 0;
744 		sc->submit_queues[i].head = 0;
745 	}
746 
747 	assert(sc->compl_queues != NULL);
748 
749 	for (i = 0; i < sc->num_cqueues + 1; i++) {
750 		sc->compl_queues[i].qbase = NULL;
751 		sc->compl_queues[i].size = 0;
752 		sc->compl_queues[i].tail = 0;
753 		sc->compl_queues[i].head = 0;
754 	}
755 
756 	sc->num_q_is_set = false;
757 
758 	pci_nvme_aer_destroy(sc);
759 }
760 
/*
 * Reset the controller while holding the softc mutex.
 */
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
768 
769 static void
770 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
771 {
772 	uint16_t acqs, asqs;
773 
774 	DPRINTF("%s", __func__);
775 
776 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
777 	sc->submit_queues[0].size = asqs;
778 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
779 	            sizeof(struct nvme_command) * asqs);
780 
781 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
782 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
783 
784 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
785 	    NVME_AQA_REG_ACQS_MASK) + 1;
786 	sc->compl_queues[0].size = acqs;
787 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
788 	         sizeof(struct nvme_completion) * acqs);
789 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
790 
791 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
792 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
793 }
794 
795 static int
796 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
797 	size_t len, enum nvme_copy_dir dir)
798 {
799 	uint8_t *p;
800 	size_t bytes;
801 
802 	if (len > (8 * 1024)) {
803 		return (-1);
804 	}
805 
806 	/* Copy from the start of prp1 to the end of the physical page */
807 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
808 	bytes = MIN(bytes, len);
809 
810 	p = vm_map_gpa(ctx, prp1, bytes);
811 	if (p == NULL) {
812 		return (-1);
813 	}
814 
815 	if (dir == NVME_COPY_TO_PRP)
816 		memcpy(p, b, bytes);
817 	else
818 		memcpy(b, p, bytes);
819 
820 	b += bytes;
821 
822 	len -= bytes;
823 	if (len == 0) {
824 		return (0);
825 	}
826 
827 	len = MIN(len, PAGE_SIZE);
828 
829 	p = vm_map_gpa(ctx, prp2, len);
830 	if (p == NULL) {
831 		return (-1);
832 	}
833 
834 	if (dir == NVME_COPY_TO_PRP)
835 		memcpy(p, b, len);
836 	else
837 		memcpy(b, p, len);
838 
839 	return (0);
840 }
841 
842 /*
843  * Write a Completion Queue Entry update
844  *
845  * Write the completion and update the doorbell value
846  */
847 static void
848 pci_nvme_cq_update(struct pci_nvme_softc *sc,
849 		struct nvme_completion_queue *cq,
850 		uint32_t cdw0,
851 		uint16_t cid,
852 		uint16_t sqid,
853 		uint16_t status)
854 {
855 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
856 	struct nvme_completion *cqe;
857 
858 	assert(cq->qbase != NULL);
859 
860 	pthread_mutex_lock(&cq->mtx);
861 
862 	cqe = &cq->qbase[cq->tail];
863 
864 	/* Flip the phase bit */
865 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
866 
867 	cqe->cdw0 = cdw0;
868 	cqe->sqhd = sq->head;
869 	cqe->sqid = sqid;
870 	cqe->cid = cid;
871 	cqe->status = status;
872 
873 	cq->tail++;
874 	if (cq->tail >= cq->size) {
875 		cq->tail = 0;
876 	}
877 
878 	pthread_mutex_unlock(&cq->mtx);
879 }
880 
881 static int
882 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
883 	struct nvme_completion* compl)
884 {
885 	uint16_t qid = command->cdw10 & 0xffff;
886 
887 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
888 	if (qid == 0 || qid > sc->num_squeues ||
889 	    (sc->submit_queues[qid].qbase == NULL)) {
890 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
891 		        __func__, qid, sc->num_squeues);
892 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
893 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
894 		return (1);
895 	}
896 
897 	sc->submit_queues[qid].qbase = NULL;
898 	sc->submit_queues[qid].cqid = 0;
899 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
900 	return (1);
901 }
902 
903 static int
904 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
905 	struct nvme_completion* compl)
906 {
907 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
908 		uint16_t qid = command->cdw10 & 0xffff;
909 		struct nvme_submission_queue *nsq;
910 
911 		if ((qid == 0) || (qid > sc->num_squeues) ||
912 		    (sc->submit_queues[qid].qbase != NULL)) {
913 			WPRINTF("%s queue index %u > num_squeues %u",
914 			        __func__, qid, sc->num_squeues);
915 			pci_nvme_status_tc(&compl->status,
916 			    NVME_SCT_COMMAND_SPECIFIC,
917 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
918 			return (1);
919 		}
920 
921 		nsq = &sc->submit_queues[qid];
922 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
923 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
924 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
925 			/*
926 			 * Queues must specify at least two entries
927 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
928 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
929 			 */
930 			pci_nvme_status_tc(&compl->status,
931 			    NVME_SCT_COMMAND_SPECIFIC,
932 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
933 			return (1);
934 		}
935 		nsq->head = nsq->tail = 0;
936 
937 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
938 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
939 			pci_nvme_status_tc(&compl->status,
940 			    NVME_SCT_COMMAND_SPECIFIC,
941 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
942 			return (1);
943 		}
944 
945 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
946 			pci_nvme_status_tc(&compl->status,
947 			    NVME_SCT_COMMAND_SPECIFIC,
948 			    NVME_SC_COMPLETION_QUEUE_INVALID);
949 			return (1);
950 		}
951 
952 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
953 
954 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
955 		              sizeof(struct nvme_command) * (size_t)nsq->size);
956 
957 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
958 		        qid, nsq->size, nsq->qbase, nsq->cqid);
959 
960 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
961 
962 		DPRINTF("%s completed creating IOSQ qid %u",
963 		         __func__, qid);
964 	} else {
965 		/*
966 		 * Guest sent non-cont submission queue request.
967 		 * This setting is unsupported by this emulation.
968 		 */
969 		WPRINTF("%s unsupported non-contig (list-based) "
970 		         "create i/o submission queue", __func__);
971 
972 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
973 	}
974 	return (1);
975 }
976 
977 static int
978 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
979 	struct nvme_completion* compl)
980 {
981 	uint16_t qid = command->cdw10 & 0xffff;
982 	uint16_t sqid;
983 
984 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
985 	if (qid == 0 || qid > sc->num_cqueues ||
986 	    (sc->compl_queues[qid].qbase == NULL)) {
987 		WPRINTF("%s queue index %u / num_cqueues %u",
988 		        __func__, qid, sc->num_cqueues);
989 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
990 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
991 		return (1);
992 	}
993 
994 	/* Deleting an Active CQ is an error */
995 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
996 		if (sc->submit_queues[sqid].cqid == qid) {
997 			pci_nvme_status_tc(&compl->status,
998 			    NVME_SCT_COMMAND_SPECIFIC,
999 			    NVME_SC_INVALID_QUEUE_DELETION);
1000 			return (1);
1001 		}
1002 
1003 	sc->compl_queues[qid].qbase = NULL;
1004 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1005 	return (1);
1006 }
1007 
1008 static int
1009 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1010 	struct nvme_completion* compl)
1011 {
1012 	struct nvme_completion_queue *ncq;
1013 	uint16_t qid = command->cdw10 & 0xffff;
1014 
1015 	/* Only support Physically Contiguous queues */
1016 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1017 		WPRINTF("%s unsupported non-contig (list-based) "
1018 		         "create i/o completion queue",
1019 		         __func__);
1020 
1021 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1022 		return (1);
1023 	}
1024 
1025 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1026 	    (sc->compl_queues[qid].qbase != NULL)) {
1027 		WPRINTF("%s queue index %u > num_cqueues %u",
1028 			__func__, qid, sc->num_cqueues);
1029 		pci_nvme_status_tc(&compl->status,
1030 		    NVME_SCT_COMMAND_SPECIFIC,
1031 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1032 		return (1);
1033  	}
1034 
1035 	ncq = &sc->compl_queues[qid];
1036 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1037 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1038 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1039 		pci_nvme_status_tc(&compl->status,
1040 		    NVME_SCT_COMMAND_SPECIFIC,
1041 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1042 		return (1);
1043 	}
1044 
1045 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1046 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1047 		/*
1048 		 * Queues must specify at least two entries
1049 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1050 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1051 		 */
1052 		pci_nvme_status_tc(&compl->status,
1053 		    NVME_SCT_COMMAND_SPECIFIC,
1054 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1055 		return (1);
1056 	}
1057 	ncq->head = ncq->tail = 0;
1058 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1059 		     command->prp1,
1060 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1061 
1062 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1063 
1064 
1065 	return (1);
1066 }
1067 
1068 static int
1069 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1070 	struct nvme_completion* compl)
1071 {
1072 	uint32_t logsize;
1073 	uint8_t logpage = command->cdw10 & 0xFF;
1074 
1075 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1076 
1077 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1078 
1079 	/*
1080 	 * Command specifies the number of dwords to return in fields NUMDU
1081 	 * and NUMDL. This is a zero-based value.
1082 	 */
1083 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1084 	logsize *= sizeof(uint32_t);
1085 
1086 	switch (logpage) {
1087 	case NVME_LOG_ERROR:
1088 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1089 		    command->prp2, (uint8_t *)&sc->err_log,
1090 		    MIN(logsize, sizeof(sc->err_log)),
1091 		    NVME_COPY_TO_PRP);
1092 		break;
1093 	case NVME_LOG_HEALTH_INFORMATION:
1094 		pthread_mutex_lock(&sc->mtx);
1095 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1096 		    sizeof(sc->health_log.data_units_read));
1097 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1098 		    sizeof(sc->health_log.data_units_written));
1099 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1100 		    sizeof(sc->health_log.host_read_commands));
1101 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1102 		    sizeof(sc->health_log.host_write_commands));
1103 		pthread_mutex_unlock(&sc->mtx);
1104 
1105 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1106 		    command->prp2, (uint8_t *)&sc->health_log,
1107 		    MIN(logsize, sizeof(sc->health_log)),
1108 		    NVME_COPY_TO_PRP);
1109 		break;
1110 	case NVME_LOG_FIRMWARE_SLOT:
1111 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1112 		    command->prp2, (uint8_t *)&sc->fw_log,
1113 		    MIN(logsize, sizeof(sc->fw_log)),
1114 		    NVME_COPY_TO_PRP);
1115 		break;
1116 	default:
1117 		DPRINTF("%s get log page %x command not supported",
1118 		        __func__, logpage);
1119 
1120 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1121 		    NVME_SC_INVALID_LOG_PAGE);
1122 	}
1123 
1124 	return (1);
1125 }
1126 
1127 static int
1128 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1129 	struct nvme_completion* compl)
1130 {
1131 	void *dest;
1132 	uint16_t status;
1133 
1134 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1135 	        command->cdw10 & 0xFF, command->nsid);
1136 
1137 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1138 
1139 	switch (command->cdw10 & 0xFF) {
1140 	case 0x00: /* return Identify Namespace data structure */
1141 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1142 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1143 		    NVME_COPY_TO_PRP);
1144 		break;
1145 	case 0x01: /* return Identify Controller data structure */
1146 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1147 		    command->prp2, (uint8_t *)&sc->ctrldata,
1148 		    sizeof(sc->ctrldata),
1149 		    NVME_COPY_TO_PRP);
1150 		break;
1151 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1152 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1153 		                  sizeof(uint32_t) * 1024);
1154 		/* All unused entries shall be zero */
1155 		bzero(dest, sizeof(uint32_t) * 1024);
1156 		((uint32_t *)dest)[0] = 1;
1157 		break;
1158 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1159 		if (command->nsid != 1) {
1160 			pci_nvme_status_genc(&status,
1161 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1162 			break;
1163 		}
1164 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1165 		                  sizeof(uint32_t) * 1024);
1166 		/* All bytes after the descriptor shall be zero */
1167 		bzero(dest, sizeof(uint32_t) * 1024);
1168 
1169 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1170 		((uint8_t *)dest)[0] = 1;
1171 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1172 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1173 		break;
1174 	default:
1175 		DPRINTF("%s unsupported identify command requested 0x%x",
1176 		         __func__, command->cdw10 & 0xFF);
1177 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1178 		break;
1179 	}
1180 
1181 	compl->status = status;
1182 	return (1);
1183 }
1184 
1185 static const char *
1186 nvme_fid_to_name(uint8_t fid)
1187 {
1188 	const char *name;
1189 
1190 	switch (fid) {
1191 	case NVME_FEAT_ARBITRATION:
1192 		name = "Arbitration";
1193 		break;
1194 	case NVME_FEAT_POWER_MANAGEMENT:
1195 		name = "Power Management";
1196 		break;
1197 	case NVME_FEAT_LBA_RANGE_TYPE:
1198 		name = "LBA Range Type";
1199 		break;
1200 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1201 		name = "Temperature Threshold";
1202 		break;
1203 	case NVME_FEAT_ERROR_RECOVERY:
1204 		name = "Error Recovery";
1205 		break;
1206 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1207 		name = "Volatile Write Cache";
1208 		break;
1209 	case NVME_FEAT_NUMBER_OF_QUEUES:
1210 		name = "Number of Queues";
1211 		break;
1212 	case NVME_FEAT_INTERRUPT_COALESCING:
1213 		name = "Interrupt Coalescing";
1214 		break;
1215 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1216 		name = "Interrupt Vector Configuration";
1217 		break;
1218 	case NVME_FEAT_WRITE_ATOMICITY:
1219 		name = "Write Atomicity Normal";
1220 		break;
1221 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1222 		name = "Asynchronous Event Configuration";
1223 		break;
1224 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1225 		name = "Autonomous Power State Transition";
1226 		break;
1227 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1228 		name = "Host Memory Buffer";
1229 		break;
1230 	case NVME_FEAT_TIMESTAMP:
1231 		name = "Timestamp";
1232 		break;
1233 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1234 		name = "Keep Alive Timer";
1235 		break;
1236 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1237 		name = "Host Controlled Thermal Management";
1238 		break;
1239 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1240 		name = "Non-Operation Power State Config";
1241 		break;
1242 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1243 		name = "Read Recovery Level Config";
1244 		break;
1245 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1246 		name = "Predictable Latency Mode Config";
1247 		break;
1248 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1249 		name = "Predictable Latency Mode Window";
1250 		break;
1251 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1252 		name = "LBA Status Information Report Interval";
1253 		break;
1254 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1255 		name = "Host Behavior Support";
1256 		break;
1257 	case NVME_FEAT_SANITIZE_CONFIG:
1258 		name = "Sanitize Config";
1259 		break;
1260 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1261 		name = "Endurance Group Event Configuration";
1262 		break;
1263 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1264 		name = "Software Progress Marker";
1265 		break;
1266 	case NVME_FEAT_HOST_IDENTIFIER:
1267 		name = "Host Identifier";
1268 		break;
1269 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1270 		name = "Reservation Notification Mask";
1271 		break;
1272 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1273 		name = "Reservation Persistence";
1274 		break;
1275 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1276 		name = "Namespace Write Protection Config";
1277 		break;
1278 	default:
1279 		name = "Unknown";
1280 		break;
1281 	}
1282 
1283 	return (name);
1284 }
1285 
1286 static void
1287 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1288     struct nvme_feature_obj *feat,
1289     struct nvme_command *command,
1290     struct nvme_completion *compl)
1291 {
1292 
1293 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1294 }
1295 
1296 static void
1297 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1298     struct nvme_feature_obj *feat,
1299     struct nvme_command *command,
1300     struct nvme_completion *compl)
1301 {
1302 	uint32_t i;
1303 	uint32_t cdw11 = command->cdw11;
1304 	uint16_t iv;
1305 	bool cd;
1306 
1307 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1308 
1309 	iv = cdw11 & 0xffff;
1310 	cd = cdw11 & (1 << 16);
1311 
1312 	if (iv > (sc->max_queues + 1)) {
1313 		return;
1314 	}
1315 
1316 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1317 	if ((iv == 0) && !cd)
1318 		return;
1319 
1320 	/* Requested Interrupt Vector must be used by a CQ */
1321 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1322 		if (sc->compl_queues[i].intr_vec == iv) {
1323 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1324 		}
1325 	}
1326 
1327 }
1328 
1329 static void
1330 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1331     struct nvme_feature_obj *feat,
1332     struct nvme_command *command,
1333     struct nvme_completion *compl)
1334 {
1335 	uint16_t nqr;	/* Number of Queues Requested */
1336 
1337 	if (sc->num_q_is_set) {
1338 		WPRINTF("%s: Number of Queues already set", __func__);
1339 		pci_nvme_status_genc(&compl->status,
1340 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1341 		return;
1342 	}
1343 
1344 	nqr = command->cdw11 & 0xFFFF;
1345 	if (nqr == 0xffff) {
1346 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1347 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1348 		return;
1349 	}
1350 
1351 	sc->num_squeues = ONE_BASED(nqr);
1352 	if (sc->num_squeues > sc->max_queues) {
1353 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1354 					sc->max_queues);
1355 		sc->num_squeues = sc->max_queues;
1356 	}
1357 
1358 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1359 	if (nqr == 0xffff) {
1360 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1361 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1362 		return;
1363 	}
1364 
1365 	sc->num_cqueues = ONE_BASED(nqr);
1366 	if (sc->num_cqueues > sc->max_queues) {
1367 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1368 					sc->max_queues);
1369 		sc->num_cqueues = sc->max_queues;
1370 	}
1371 
1372 	/* Patch the command value which will be saved on callback's return */
1373 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1374 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1375 
1376 	sc->num_q_is_set = true;
1377 }
1378 
1379 static int
1380 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1381 	struct nvme_completion *compl)
1382 {
1383 	struct nvme_feature_obj *feat;
1384 	uint32_t nsid = command->nsid;
1385 	uint8_t fid = command->cdw10 & 0xFF;
1386 
1387 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1388 
1389 	if (fid >= NVME_FID_MAX) {
1390 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1391 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1392 		return (1);
1393 	}
1394 	feat = &sc->feat[fid];
1395 
1396 	if (!feat->namespace_specific &&
1397 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1398 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1399 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1400 		return (1);
1401 	}
1402 
1403 	compl->cdw0 = 0;
1404 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1405 
1406 	if (feat->set)
1407 		feat->set(sc, feat, command, compl);
1408 
1409 	if (compl->status == NVME_SC_SUCCESS)
1410 		feat->cdw11 = command->cdw11;
1411 
1412 	return (0);
1413 }
1414 
1415 static int
1416 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1417 	struct nvme_completion* compl)
1418 {
1419 	struct nvme_feature_obj *feat;
1420 	uint8_t fid = command->cdw10 & 0xFF;
1421 
1422 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1423 
1424 	if (fid >= NVME_FID_MAX) {
1425 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1426 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1427 		return (1);
1428 	}
1429 
1430 	compl->cdw0 = 0;
1431 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1432 
1433 	feat = &sc->feat[fid];
1434 	if (feat->get) {
1435 		feat->get(sc, feat, command, compl);
1436 	}
1437 
1438 	if (compl->status == NVME_SC_SUCCESS) {
1439 		compl->cdw0 = feat->cdw11;
1440 	}
1441 
1442 	return (0);
1443 }
1444 
1445 static int
1446 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1447 	struct nvme_completion* compl)
1448 {
1449 	uint8_t	ses, lbaf, pi;
1450 
1451 	/* Only supports Secure Erase Setting - User Data Erase */
1452 	ses = (command->cdw10 >> 9) & 0x7;
1453 	if (ses > 0x1) {
1454 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1455 		return (1);
1456 	}
1457 
1458 	/* Only supports a single LBA Format */
1459 	lbaf = command->cdw10 & 0xf;
1460 	if (lbaf != 0) {
1461 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1462 		    NVME_SC_INVALID_FORMAT);
1463 		return (1);
1464 	}
1465 
1466 	/* Doesn't support Protection Infomation */
1467 	pi = (command->cdw10 >> 5) & 0x7;
1468 	if (pi != 0) {
1469 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1470 		return (1);
1471 	}
1472 
1473 	if (sc->nvstore.type == NVME_STOR_RAM) {
1474 		if (sc->nvstore.ctx)
1475 			free(sc->nvstore.ctx);
1476 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1477 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1478 	} else {
1479 		struct pci_nvme_ioreq *req;
1480 		int err;
1481 
1482 		req = pci_nvme_get_ioreq(sc);
1483 		if (req == NULL) {
1484 			pci_nvme_status_genc(&compl->status,
1485 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1486 			WPRINTF("%s: unable to allocate IO req", __func__);
1487 			return (1);
1488 		}
1489 		req->nvme_sq = &sc->submit_queues[0];
1490 		req->sqid = 0;
1491 		req->opc = command->opc;
1492 		req->cid = command->cid;
1493 		req->nsid = command->nsid;
1494 
1495 		req->io_req.br_offset = 0;
1496 		req->io_req.br_resid = sc->nvstore.size;
1497 		req->io_req.br_callback = pci_nvme_io_done;
1498 
1499 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1500 		if (err) {
1501 			pci_nvme_status_genc(&compl->status,
1502 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1503 			pci_nvme_release_ioreq(sc, req);
1504 		}
1505 	}
1506 
1507 	return (1);
1508 }
1509 
1510 static int
1511 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1512 	struct nvme_completion* compl)
1513 {
1514 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1515 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1516 
1517 	/* TODO: search for the command ID and abort it */
1518 
1519 	compl->cdw0 = 1;
1520 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1521 	return (1);
1522 }
1523 
1524 static int
1525 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1526 	struct nvme_command* command, struct nvme_completion* compl)
1527 {
1528 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1529 
1530 	/* Don't exceed the Async Event Request Limit (AERL). */
1531 	if (pci_nvme_aer_limit_reached(sc)) {
1532 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1533 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1534 		return (1);
1535 	}
1536 
1537 	if (pci_nvme_aer_add(sc, command->cid)) {
1538 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1539 				NVME_SC_INTERNAL_DEVICE_ERROR);
1540 		return (1);
1541 	}
1542 
1543 	/*
1544 	 * Raise events when they happen based on the Set Features cmd.
1545 	 * These events happen async, so only set completion successful if
1546 	 * there is an event reflective of the request to get event.
1547 	 */
1548 	compl->status = NVME_NO_STATUS;
1549 
1550 	return (0);
1551 }
1552 
1553 static void
1554 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1555 {
1556 	struct nvme_completion compl;
1557 	struct nvme_command *cmd;
1558 	struct nvme_submission_queue *sq;
1559 	struct nvme_completion_queue *cq;
1560 	uint16_t sqhead;
1561 
1562 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1563 
1564 	sq = &sc->submit_queues[0];
1565 	cq = &sc->compl_queues[0];
1566 
1567 	pthread_mutex_lock(&sq->mtx);
1568 
1569 	sqhead = sq->head;
1570 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1571 
1572 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1573 		cmd = &(sq->qbase)[sqhead];
1574 		compl.cdw0 = 0;
1575 		compl.status = 0;
1576 
1577 		switch (cmd->opc) {
1578 		case NVME_OPC_DELETE_IO_SQ:
1579 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1580 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1581 			break;
1582 		case NVME_OPC_CREATE_IO_SQ:
1583 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1584 			nvme_opc_create_io_sq(sc, cmd, &compl);
1585 			break;
1586 		case NVME_OPC_DELETE_IO_CQ:
1587 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1588 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1589 			break;
1590 		case NVME_OPC_CREATE_IO_CQ:
1591 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1592 			nvme_opc_create_io_cq(sc, cmd, &compl);
1593 			break;
1594 		case NVME_OPC_GET_LOG_PAGE:
1595 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1596 			nvme_opc_get_log_page(sc, cmd, &compl);
1597 			break;
1598 		case NVME_OPC_IDENTIFY:
1599 			DPRINTF("%s command IDENTIFY", __func__);
1600 			nvme_opc_identify(sc, cmd, &compl);
1601 			break;
1602 		case NVME_OPC_ABORT:
1603 			DPRINTF("%s command ABORT", __func__);
1604 			nvme_opc_abort(sc, cmd, &compl);
1605 			break;
1606 		case NVME_OPC_SET_FEATURES:
1607 			DPRINTF("%s command SET_FEATURES", __func__);
1608 			nvme_opc_set_features(sc, cmd, &compl);
1609 			break;
1610 		case NVME_OPC_GET_FEATURES:
1611 			DPRINTF("%s command GET_FEATURES", __func__);
1612 			nvme_opc_get_features(sc, cmd, &compl);
1613 			break;
1614 		case NVME_OPC_FIRMWARE_ACTIVATE:
1615 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1616 			pci_nvme_status_tc(&compl.status,
1617 			    NVME_SCT_COMMAND_SPECIFIC,
1618 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1619 			break;
1620 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1621 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1622 			nvme_opc_async_event_req(sc, cmd, &compl);
1623 			break;
1624 		case NVME_OPC_FORMAT_NVM:
1625 			DPRINTF("%s command FORMAT_NVM", __func__);
1626 			if ((sc->ctrldata.oacs &
1627 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1628 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1629 			}
1630 			compl.status = NVME_NO_STATUS;
1631 			nvme_opc_format_nvm(sc, cmd, &compl);
1632 			break;
1633 		default:
1634 			DPRINTF("0x%x command is not implemented",
1635 			    cmd->opc);
1636 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1637 		}
1638 		sqhead = (sqhead + 1) % sq->size;
1639 
1640 		if (NVME_COMPLETION_VALID(compl)) {
1641 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1642 			    compl.cdw0,
1643 			    cmd->cid,
1644 			    0,		/* SQID */
1645 			    compl.status);
1646 		}
1647 	}
1648 
1649 	DPRINTF("setting sqhead %u", sqhead);
1650 	sq->head = sqhead;
1651 
1652 	if (cq->head != cq->tail)
1653 		pci_generate_msix(sc->nsc_pi, 0);
1654 
1655 	pthread_mutex_unlock(&sq->mtx);
1656 }
1657 
1658 /*
1659  * Update the Write and Read statistics reported in SMART data
1660  *
1661  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1662  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1663  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
1664  */
1665 static void
1666 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1667     size_t bytes, uint16_t status)
1668 {
1669 
1670 	pthread_mutex_lock(&sc->mtx);
1671 	switch (opc) {
1672 	case NVME_OPC_WRITE:
1673 		sc->write_commands++;
1674 		if (status != NVME_SC_SUCCESS)
1675 			break;
1676 		sc->write_dunits_remainder += (bytes / 512);
1677 		while (sc->write_dunits_remainder >= 1000) {
1678 			sc->write_data_units++;
1679 			sc->write_dunits_remainder -= 1000;
1680 		}
1681 		break;
1682 	case NVME_OPC_READ:
1683 		sc->read_commands++;
1684 		if (status != NVME_SC_SUCCESS)
1685 			break;
1686 		sc->read_dunits_remainder += (bytes / 512);
1687 		while (sc->read_dunits_remainder >= 1000) {
1688 			sc->read_data_units++;
1689 			sc->read_dunits_remainder -= 1000;
1690 		}
1691 		break;
1692 	default:
1693 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1694 		break;
1695 	}
1696 	pthread_mutex_unlock(&sc->mtx);
1697 }
1698 
1699 /*
1700  * Check if the combination of Starting LBA (slba) and Number of Logical
1701  * Blocks (nlb) exceeds the range of the underlying storage.
1702  *
1703  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1704  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1705  * overflow.
1706  */
1707 static bool
1708 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1709     uint32_t nlb)
1710 {
1711 	size_t	offset, bytes;
1712 
1713 	/* Overflow check of multiplying Starting LBA by the sector size */
1714 	if (slba >> (64 - nvstore->sectsz_bits))
1715 		return (true);
1716 
1717 	offset = slba << nvstore->sectsz_bits;
1718 	bytes = nlb << nvstore->sectsz_bits;
1719 
1720 	/* Overflow check of Number of Logical Blocks */
1721 	if ((nvstore->size - offset) < bytes)
1722 		return (true);
1723 
1724 	return (false);
1725 }
1726 
1727 static int
1728 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1729 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1730 {
1731 	int iovidx;
1732 
1733 	if (req == NULL)
1734 		return (-1);
1735 
1736 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1737 		return (-1);
1738 	}
1739 
1740 	/* concatenate contig block-iovs to minimize number of iovs */
1741 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1742 		iovidx = req->io_req.br_iovcnt - 1;
1743 
1744 		req->io_req.br_iov[iovidx].iov_base =
1745 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1746 				     req->prev_gpaddr, size);
1747 
1748 		req->prev_size += size;
1749 		req->io_req.br_resid += size;
1750 
1751 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1752 	} else {
1753 		iovidx = req->io_req.br_iovcnt;
1754 		if (iovidx == 0) {
1755 			req->io_req.br_offset = lba;
1756 			req->io_req.br_resid = 0;
1757 			req->io_req.br_param = req;
1758 		}
1759 
1760 		req->io_req.br_iov[iovidx].iov_base =
1761 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1762 				     gpaddr, size);
1763 
1764 		req->io_req.br_iov[iovidx].iov_len = size;
1765 
1766 		req->prev_gpaddr = gpaddr;
1767 		req->prev_size = size;
1768 		req->io_req.br_resid += size;
1769 
1770 		req->io_req.br_iovcnt++;
1771 	}
1772 
1773 	return (0);
1774 }
1775 
1776 static void
1777 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1778 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1779 	uint32_t cdw0, uint16_t status)
1780 {
1781 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1782 
1783 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1784 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1785 		 NVME_STATUS_GET_SC(status));
1786 
1787 	pci_nvme_cq_update(sc, cq,
1788 	    0,		/* CDW0 */
1789 	    cid,
1790 	    sqid,
1791 	    status);
1792 
1793 	if (cq->head != cq->tail) {
1794 		if (cq->intr_en & NVME_CQ_INTEN) {
1795 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1796 		} else {
1797 			DPRINTF("%s: CQ%u interrupt disabled",
1798 						__func__, sq->cqid);
1799 		}
1800 	}
1801 }
1802 
1803 static void
1804 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1805 {
1806 	req->sc = NULL;
1807 	req->nvme_sq = NULL;
1808 	req->sqid = 0;
1809 
1810 	pthread_mutex_lock(&sc->mtx);
1811 
1812 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1813 	sc->pending_ios--;
1814 
1815 	/* when no more IO pending, can set to ready if device reset/enabled */
1816 	if (sc->pending_ios == 0 &&
1817 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1818 		sc->regs.csts |= NVME_CSTS_RDY;
1819 
1820 	pthread_mutex_unlock(&sc->mtx);
1821 
1822 	sem_post(&sc->iosemlock);
1823 }
1824 
1825 static struct pci_nvme_ioreq *
1826 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1827 {
1828 	struct pci_nvme_ioreq *req = NULL;
1829 
1830 	sem_wait(&sc->iosemlock);
1831 	pthread_mutex_lock(&sc->mtx);
1832 
1833 	req = STAILQ_FIRST(&sc->ioreqs_free);
1834 	assert(req != NULL);
1835 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1836 
1837 	req->sc = sc;
1838 
1839 	sc->pending_ios++;
1840 
1841 	pthread_mutex_unlock(&sc->mtx);
1842 
1843 	req->io_req.br_iovcnt = 0;
1844 	req->io_req.br_offset = 0;
1845 	req->io_req.br_resid = 0;
1846 	req->io_req.br_param = req;
1847 	req->prev_gpaddr = 0;
1848 	req->prev_size = 0;
1849 
1850 	return req;
1851 }
1852 
1853 static void
1854 pci_nvme_io_done(struct blockif_req *br, int err)
1855 {
1856 	struct pci_nvme_ioreq *req = br->br_param;
1857 	struct nvme_submission_queue *sq = req->nvme_sq;
1858 	uint16_t code, status;
1859 
1860 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1861 
1862 	/* TODO return correct error */
1863 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1864 	pci_nvme_status_genc(&status, code);
1865 
1866 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1867 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1868 	    req->bytes, status);
1869 	pci_nvme_release_ioreq(req->sc, req);
1870 }
1871 
1872 /*
1873  * Implements the Flush command. The specification states:
1874  *    If a volatile write cache is not present, Flush commands complete
1875  *    successfully and have no effect
1876  * in the description of the Volatile Write Cache (VWC) field of the Identify
1877  * Controller data. Therefore, set status to Success if the command is
1878  * not supported (i.e. RAM or as indicated by the blockif).
1879  */
1880 static bool
1881 nvme_opc_flush(struct pci_nvme_softc *sc,
1882     struct nvme_command *cmd,
1883     struct pci_nvme_blockstore *nvstore,
1884     struct pci_nvme_ioreq *req,
1885     uint16_t *status)
1886 {
1887 	bool pending = false;
1888 
1889 	if (nvstore->type == NVME_STOR_RAM) {
1890 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1891 	} else {
1892 		int err;
1893 
1894 		req->io_req.br_callback = pci_nvme_io_done;
1895 
1896 		err = blockif_flush(nvstore->ctx, &req->io_req);
1897 		switch (err) {
1898 		case 0:
1899 			pending = true;
1900 			break;
1901 		case EOPNOTSUPP:
1902 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1903 			break;
1904 		default:
1905 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1906 		}
1907 	}
1908 
1909 	return (pending);
1910 }
1911 
1912 static uint16_t
1913 nvme_write_read_ram(struct pci_nvme_softc *sc,
1914     struct pci_nvme_blockstore *nvstore,
1915     uint64_t prp1, uint64_t prp2,
1916     size_t offset, uint64_t bytes,
1917     bool is_write)
1918 {
1919 	uint8_t *buf = nvstore->ctx;
1920 	enum nvme_copy_dir dir;
1921 	uint16_t status;
1922 
1923 	if (is_write)
1924 		dir = NVME_COPY_TO_PRP;
1925 	else
1926 		dir = NVME_COPY_FROM_PRP;
1927 
1928 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1929 	    buf + offset, bytes, dir))
1930 		pci_nvme_status_genc(&status,
1931 		    NVME_SC_DATA_TRANSFER_ERROR);
1932 	else
1933 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1934 
1935 	return (status);
1936 }
1937 
1938 static uint16_t
1939 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1940     struct pci_nvme_blockstore *nvstore,
1941     struct pci_nvme_ioreq *req,
1942     uint64_t prp1, uint64_t prp2,
1943     size_t offset, uint64_t bytes,
1944     bool is_write)
1945 {
1946 	uint64_t size;
1947 	int err;
1948 	uint16_t status = NVME_NO_STATUS;
1949 
1950 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1951 	if (pci_nvme_append_iov_req(sc, req, prp1,
1952 	    size, is_write, offset)) {
1953 		pci_nvme_status_genc(&status,
1954 		    NVME_SC_DATA_TRANSFER_ERROR);
1955 		goto out;
1956 	}
1957 
1958 	offset += size;
1959 	bytes  -= size;
1960 
1961 	if (bytes == 0) {
1962 		;
1963 	} else if (bytes <= PAGE_SIZE) {
1964 		size = bytes;
1965 		if (pci_nvme_append_iov_req(sc, req, prp2,
1966 		    size, is_write, offset)) {
1967 			pci_nvme_status_genc(&status,
1968 			    NVME_SC_DATA_TRANSFER_ERROR);
1969 			goto out;
1970 		}
1971 	} else {
1972 		void *vmctx = sc->nsc_pi->pi_vmctx;
1973 		uint64_t *prp_list = &prp2;
1974 		uint64_t *last = prp_list;
1975 
1976 		/* PRP2 is pointer to a physical region page list */
1977 		while (bytes) {
1978 			/* Last entry in list points to the next list */
1979 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
1980 				uint64_t prp = *prp_list;
1981 
1982 				prp_list = paddr_guest2host(vmctx, prp,
1983 				    PAGE_SIZE - (prp % PAGE_SIZE));
1984 				last = prp_list + (NVME_PRP2_ITEMS - 1);
1985 			}
1986 
1987 			size = MIN(bytes, PAGE_SIZE);
1988 
1989 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
1990 			    size, is_write, offset)) {
1991 				pci_nvme_status_genc(&status,
1992 				    NVME_SC_DATA_TRANSFER_ERROR);
1993 				goto out;
1994 			}
1995 
1996 			offset += size;
1997 			bytes  -= size;
1998 
1999 			prp_list++;
2000 		}
2001 	}
2002 	req->io_req.br_callback = pci_nvme_io_done;
2003 	if (is_write)
2004 		err = blockif_write(nvstore->ctx, &req->io_req);
2005 	else
2006 		err = blockif_read(nvstore->ctx, &req->io_req);
2007 
2008 	if (err)
2009 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2010 out:
2011 	return (status);
2012 }
2013 
2014 static bool
2015 nvme_opc_write_read(struct pci_nvme_softc *sc,
2016     struct nvme_command *cmd,
2017     struct pci_nvme_blockstore *nvstore,
2018     struct pci_nvme_ioreq *req,
2019     uint16_t *status)
2020 {
2021 	uint64_t lba, nblocks, bytes;
2022 	size_t offset;
2023 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2024 	bool pending = false;
2025 
2026 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2027 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2028 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2029 		WPRINTF("%s command would exceed LBA range", __func__);
2030 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2031 		goto out;
2032 	}
2033 
2034 	bytes  = nblocks << nvstore->sectsz_bits;
2035 	if (bytes > NVME_MAX_DATA_SIZE) {
2036 		WPRINTF("%s command would exceed MDTS", __func__);
2037 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2038 		goto out;
2039 	}
2040 
2041 	offset = lba << nvstore->sectsz_bits;
2042 
2043 	req->bytes = bytes;
2044 	req->io_req.br_offset = lba;
2045 
2046 	/* PRP bits 1:0 must be zero */
2047 	cmd->prp1 &= ~0x3UL;
2048 	cmd->prp2 &= ~0x3UL;
2049 
2050 	if (nvstore->type == NVME_STOR_RAM) {
2051 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2052 		    cmd->prp2, offset, bytes, is_write);
2053 	} else {
2054 		*status = nvme_write_read_blockif(sc, nvstore, req,
2055 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2056 
2057 		if (*status == NVME_NO_STATUS)
2058 			pending = true;
2059 	}
2060 out:
2061 	if (!pending)
2062 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2063 
2064 	return (pending);
2065 }
2066 
2067 static void
2068 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2069 {
2070 	struct pci_nvme_ioreq *req = br->br_param;
2071 	struct pci_nvme_softc *sc = req->sc;
2072 	bool done = true;
2073 	uint16_t status;
2074 
2075 	if (err) {
2076 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2077 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2078 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2079 	} else {
2080 		struct iovec *iov = req->io_req.br_iov;
2081 
2082 		req->prev_gpaddr++;
2083 		iov += req->prev_gpaddr;
2084 
2085 		/* The iov_* values already include the sector size */
2086 		req->io_req.br_offset = (off_t)iov->iov_base;
2087 		req->io_req.br_resid = iov->iov_len;
2088 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2089 			pci_nvme_status_genc(&status,
2090 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2091 		} else
2092 			done = false;
2093 	}
2094 
2095 	if (done) {
2096 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2097 		    req->cid, 0, status);
2098 		pci_nvme_release_ioreq(sc, req);
2099 	}
2100 }
2101 
2102 static bool
2103 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2104     struct nvme_command *cmd,
2105     struct pci_nvme_blockstore *nvstore,
2106     struct pci_nvme_ioreq *req,
2107     uint16_t *status)
2108 {
2109 	struct nvme_dsm_range *range;
2110 	uint32_t nr, r, non_zero, dr;
2111 	int err;
2112 	bool pending = false;
2113 
2114 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2115 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2116 		goto out;
2117 	}
2118 
2119 	nr = cmd->cdw10 & 0xff;
2120 
2121 	/* copy locally because a range entry could straddle PRPs */
2122 	range = calloc(1, NVME_MAX_DSM_TRIM);
2123 	if (range == NULL) {
2124 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2125 		goto out;
2126 	}
2127 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2128 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2129 
2130 	/* Check for invalid ranges and the number of non-zero lengths */
2131 	non_zero = 0;
2132 	for (r = 0; r <= nr; r++) {
2133 		if (pci_nvme_out_of_range(nvstore,
2134 		    range[r].starting_lba, range[r].length)) {
2135 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2136 			goto out;
2137 		}
2138 		if (range[r].length != 0)
2139 			non_zero++;
2140 	}
2141 
2142 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2143 		size_t offset, bytes;
2144 		int sectsz_bits = sc->nvstore.sectsz_bits;
2145 
2146 		/*
2147 		 * DSM calls are advisory only, and compliant controllers
2148 		 * may choose to take no actions (i.e. return Success).
2149 		 */
2150 		if (!nvstore->deallocate) {
2151 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2152 			goto out;
2153 		}
2154 
2155 		/* If all ranges have a zero length, return Success */
2156 		if (non_zero == 0) {
2157 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2158 			goto out;
2159 		}
2160 
2161 		if (req == NULL) {
2162 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2163 			goto out;
2164 		}
2165 
2166 		offset = range[0].starting_lba << sectsz_bits;
2167 		bytes = range[0].length << sectsz_bits;
2168 
2169 		/*
2170 		 * If the request is for more than a single range, store
2171 		 * the ranges in the br_iov. Optimize for the common case
2172 		 * of a single range.
2173 		 *
2174 		 * Note that NVMe Number of Ranges is a zero based value
2175 		 */
2176 		req->io_req.br_iovcnt = 0;
2177 		req->io_req.br_offset = offset;
2178 		req->io_req.br_resid = bytes;
2179 
2180 		if (nr == 0) {
2181 			req->io_req.br_callback = pci_nvme_io_done;
2182 		} else {
2183 			struct iovec *iov = req->io_req.br_iov;
2184 
2185 			for (r = 0, dr = 0; r <= nr; r++) {
2186 				offset = range[r].starting_lba << sectsz_bits;
2187 				bytes = range[r].length << sectsz_bits;
2188 				if (bytes == 0)
2189 					continue;
2190 
2191 				if ((nvstore->size - offset) < bytes) {
2192 					pci_nvme_status_genc(status,
2193 					    NVME_SC_LBA_OUT_OF_RANGE);
2194 					goto out;
2195 				}
2196 				iov[dr].iov_base = (void *)offset;
2197 				iov[dr].iov_len = bytes;
2198 				dr++;
2199 			}
2200 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2201 
2202 			/*
2203 			 * Use prev_gpaddr to track the current entry and
2204 			 * prev_size to track the number of entries
2205 			 */
2206 			req->prev_gpaddr = 0;
2207 			req->prev_size = dr;
2208 		}
2209 
2210 		err = blockif_delete(nvstore->ctx, &req->io_req);
2211 		if (err)
2212 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2213 		else
2214 			pending = true;
2215 	}
2216 out:
2217 	free(range);
2218 	return (pending);
2219 }
2220 
2221 static void
2222 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2223 {
2224 	struct nvme_submission_queue *sq;
2225 	uint16_t status;
2226 	uint16_t sqhead;
2227 
2228 	/* handle all submissions up to sq->tail index */
2229 	sq = &sc->submit_queues[idx];
2230 
2231 	pthread_mutex_lock(&sq->mtx);
2232 
2233 	sqhead = sq->head;
2234 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2235 	         idx, sqhead, sq->tail, sq->qbase);
2236 
2237 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2238 		struct nvme_command *cmd;
2239 		struct pci_nvme_ioreq *req;
2240 		uint32_t nsid;
2241 		bool pending;
2242 
2243 		pending = false;
2244 		req = NULL;
2245 		status = 0;
2246 
2247 		cmd = &sq->qbase[sqhead];
2248 		sqhead = (sqhead + 1) % sq->size;
2249 
2250 		nsid = le32toh(cmd->nsid);
2251 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2252 			pci_nvme_status_genc(&status,
2253 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2254 			status |=
2255 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2256 			goto complete;
2257  		}
2258 
2259 		req = pci_nvme_get_ioreq(sc);
2260 		if (req == NULL) {
2261 			pci_nvme_status_genc(&status,
2262 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2263 			WPRINTF("%s: unable to allocate IO req", __func__);
2264 			goto complete;
2265 		}
2266 		req->nvme_sq = sq;
2267 		req->sqid = idx;
2268 		req->opc = cmd->opc;
2269 		req->cid = cmd->cid;
2270 		req->nsid = cmd->nsid;
2271 
2272 		switch (cmd->opc) {
2273 		case NVME_OPC_FLUSH:
2274 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2275 			    req, &status);
2276  			break;
2277 		case NVME_OPC_WRITE:
2278 		case NVME_OPC_READ:
2279 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2280 			    req, &status);
2281 			break;
2282 		case NVME_OPC_WRITE_ZEROES:
2283 			/* TODO: write zeroes
2284 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2285 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2286 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2287 			break;
2288 		case NVME_OPC_DATASET_MANAGEMENT:
2289  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2290 			    req, &status);
2291 			break;
2292  		default:
2293  			WPRINTF("%s unhandled io command 0x%x",
2294 			    __func__, cmd->opc);
2295 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2296 		}
2297 complete:
2298 		if (!pending) {
2299 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2300 			    status);
2301 			if (req != NULL)
2302 				pci_nvme_release_ioreq(sc, req);
2303 		}
2304 	}
2305 
2306 	sq->head = sqhead;
2307 
2308 	pthread_mutex_unlock(&sq->mtx);
2309 }
2310 
2311 static void
2312 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2313 	uint64_t idx, int is_sq, uint64_t value)
2314 {
2315 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2316 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2317 
2318 	if (is_sq) {
2319 		if (idx > sc->num_squeues) {
2320 			WPRINTF("%s queue index %lu overflow from "
2321 			         "guest (max %u)",
2322 			         __func__, idx, sc->num_squeues);
2323 			return;
2324 		}
2325 
2326 		atomic_store_short(&sc->submit_queues[idx].tail,
2327 		                   (uint16_t)value);
2328 
2329 		if (idx == 0) {
2330 			pci_nvme_handle_admin_cmd(sc, value);
2331 		} else {
2332 			/* submission queue; handle new entries in SQ */
2333 			if (idx > sc->num_squeues) {
2334 				WPRINTF("%s SQ index %lu overflow from "
2335 				         "guest (max %u)",
2336 				         __func__, idx, sc->num_squeues);
2337 				return;
2338 			}
2339 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2340 		}
2341 	} else {
2342 		if (idx > sc->num_cqueues) {
2343 			WPRINTF("%s queue index %lu overflow from "
2344 			         "guest (max %u)",
2345 			         __func__, idx, sc->num_cqueues);
2346 			return;
2347 		}
2348 
2349 		atomic_store_short(&sc->compl_queues[idx].head,
2350 				(uint16_t)value);
2351 	}
2352 }
2353 
2354 static void
2355 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2356 {
2357 	const char *s = iswrite ? "WRITE" : "READ";
2358 
2359 	switch (offset) {
2360 	case NVME_CR_CAP_LOW:
2361 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2362 		break;
2363 	case NVME_CR_CAP_HI:
2364 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2365 		break;
2366 	case NVME_CR_VS:
2367 		DPRINTF("%s %s NVME_CR_VS", func, s);
2368 		break;
2369 	case NVME_CR_INTMS:
2370 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2371 		break;
2372 	case NVME_CR_INTMC:
2373 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2374 		break;
2375 	case NVME_CR_CC:
2376 		DPRINTF("%s %s NVME_CR_CC", func, s);
2377 		break;
2378 	case NVME_CR_CSTS:
2379 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2380 		break;
2381 	case NVME_CR_NSSR:
2382 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2383 		break;
2384 	case NVME_CR_AQA:
2385 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2386 		break;
2387 	case NVME_CR_ASQ_LOW:
2388 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2389 		break;
2390 	case NVME_CR_ASQ_HI:
2391 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2392 		break;
2393 	case NVME_CR_ACQ_LOW:
2394 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2395 		break;
2396 	case NVME_CR_ACQ_HI:
2397 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2398 		break;
2399 	default:
2400 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2401 	}
2402 
2403 }
2404 
/*
 * Handle a guest write to BAR 0.
 *
 * Offsets at or above NVME_DOORBELL_OFFSET are doorbell writes: the queue
 * index and SQ/CQ selection are derived from the offset and the write is
 * forwarded to pci_nvme_handle_doorbell(). Lower offsets are controller
 * registers; those must be written 4 bytes at a time and are applied with
 * sc->mtx held.
 */
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		/* First 4 bytes of each pair are the SQ bell, the rest CQ */
		int is_sq = (belloffset % 8) < 4;

		/* Reject offsets past the last queue's CQ doorbell */
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			         "0x%lx, val 0x%lx in %s",
			         offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	        offset, size, value);

	/* Controller registers only accept 4 byte writes */
	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		         "val 0x%lx) to bar0 in %s",
		         size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		         "iocqes %u",
		        __func__,
			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
			 NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/*
			 * Shutdown requested: this code only updates
			 * CSTS.SHST to report the shutdown as complete.
			 */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		/* Act only when the enable bit actually changes */
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			/* Enabled and no I/O outstanding: report ready */
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		/* writes are ignored */
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		/* Replace the low 32 bits, keeping the low 12 bits zero */
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		               (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		/* Replace the high 32 bits */
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		               (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		/* Replace the low 32 bits, keeping the low 12 bits zero */
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		               (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		/* Replace the high 32 bits */
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		               (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		         __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
2523 
2524 static void
2525 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2526                 int baridx, uint64_t offset, int size, uint64_t value)
2527 {
2528 	struct pci_nvme_softc* sc = pi->pi_arg;
2529 
2530 	if (baridx == pci_msix_table_bar(pi) ||
2531 	    baridx == pci_msix_pba_bar(pi)) {
2532 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2533 		         " value 0x%lx", baridx, offset, size, value);
2534 
2535 		pci_emul_msix_twrite(pi, offset, size, value);
2536 		return;
2537 	}
2538 
2539 	switch (baridx) {
2540 	case 0:
2541 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2542 		break;
2543 
2544 	default:
2545 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2546 		         __func__, baridx, value);
2547 	}
2548 }
2549 
2550 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2551 	uint64_t offset, int size)
2552 {
2553 	uint64_t value;
2554 
2555 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2556 
2557 	if (offset < NVME_DOORBELL_OFFSET) {
2558 		void *p = &(sc->regs);
2559 		pthread_mutex_lock(&sc->mtx);
2560 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2561 		pthread_mutex_unlock(&sc->mtx);
2562 	} else {
2563 		value = 0;
2564                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2565 	}
2566 
2567 	switch (size) {
2568 	case 1:
2569 		value &= 0xFF;
2570 		break;
2571 	case 2:
2572 		value &= 0xFFFF;
2573 		break;
2574 	case 4:
2575 		value &= 0xFFFFFFFF;
2576 		break;
2577 	}
2578 
2579 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2580 	         offset, size, (uint32_t)value);
2581 
2582 	return (value);
2583 }
2584 
2585 
2586 
2587 static uint64_t
2588 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2589     uint64_t offset, int size)
2590 {
2591 	struct pci_nvme_softc* sc = pi->pi_arg;
2592 
2593 	if (baridx == pci_msix_table_bar(pi) ||
2594 	    baridx == pci_msix_pba_bar(pi)) {
2595 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2596 		        baridx, offset, size);
2597 
2598 		return pci_emul_msix_tread(pi, offset, size);
2599 	}
2600 
2601 	switch (baridx) {
2602 	case 0:
2603        		return pci_nvme_read_bar_0(sc, offset, size);
2604 
2605 	default:
2606 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2607 	}
2608 
2609 	return (0);
2610 }
2611 
2612 static int
2613 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2614 {
2615 	char bident[sizeof("XX:X:X")];
2616 	const char *value;
2617 	uint32_t sectsz;
2618 
2619 	sc->max_queues = NVME_QUEUES;
2620 	sc->max_qentries = NVME_MAX_QENTRIES;
2621 	sc->ioslots = NVME_IOSLOTS;
2622 	sc->num_squeues = sc->max_queues;
2623 	sc->num_cqueues = sc->max_queues;
2624 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2625 	sectsz = 0;
2626 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2627 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2628 
2629 	value = get_config_value_node(nvl, "maxq");
2630 	if (value != NULL)
2631 		sc->max_queues = atoi(value);
2632 	value = get_config_value_node(nvl, "qsz");
2633 	if (value != NULL) {
2634 		sc->max_qentries = atoi(value);
2635 		if (sc->max_qentries <= 0) {
2636 			EPRINTLN("nvme: Invalid qsz option %d",
2637 			    sc->max_qentries);
2638 			return (-1);
2639 		}
2640 	}
2641 	value = get_config_value_node(nvl, "ioslots");
2642 	if (value != NULL) {
2643 		sc->ioslots = atoi(value);
2644 		if (sc->ioslots <= 0) {
2645 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2646 			return (-1);
2647 		}
2648 	}
2649 	value = get_config_value_node(nvl, "sectsz");
2650 	if (value != NULL)
2651 		sectsz = atoi(value);
2652 	value = get_config_value_node(nvl, "ser");
2653 	if (value != NULL) {
2654 		/*
2655 		 * This field indicates the Product Serial Number in
2656 		 * 7-bit ASCII, unused bytes should be space characters.
2657 		 * Ref: NVMe v1.3c.
2658 		 */
2659 		cpywithpad((char *)sc->ctrldata.sn,
2660 		    sizeof(sc->ctrldata.sn), value, ' ');
2661 	}
2662 	value = get_config_value_node(nvl, "eui64");
2663 	if (value != NULL)
2664 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2665 	value = get_config_value_node(nvl, "dsm");
2666 	if (value != NULL) {
2667 		if (strcmp(value, "auto") == 0)
2668 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2669 		else if (strcmp(value, "enable") == 0)
2670 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2671 		else if (strcmp(value, "disable") == 0)
2672 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2673 	}
2674 
2675 	value = get_config_value_node(nvl, "ram");
2676 	if (value != NULL) {
2677 		uint64_t sz = strtoull(value, NULL, 10);
2678 
2679 		sc->nvstore.type = NVME_STOR_RAM;
2680 		sc->nvstore.size = sz * 1024 * 1024;
2681 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2682 		sc->nvstore.sectsz = 4096;
2683 		sc->nvstore.sectsz_bits = 12;
2684 		if (sc->nvstore.ctx == NULL) {
2685 			EPRINTLN("nvme: Unable to allocate RAM");
2686 			return (-1);
2687 		}
2688 	} else {
2689 		snprintf(bident, sizeof(bident), "%d:%d",
2690 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2691 		sc->nvstore.ctx = blockif_open(nvl, bident);
2692 		if (sc->nvstore.ctx == NULL) {
2693 			EPRINTLN("nvme: Could not open backing file: %s",
2694 			    strerror(errno));
2695 			return (-1);
2696 		}
2697 		sc->nvstore.type = NVME_STOR_BLOCKIF;
2698 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2699 	}
2700 
2701 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2702 		sc->nvstore.sectsz = sectsz;
2703 	else if (sc->nvstore.type != NVME_STOR_RAM)
2704 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2705 	for (sc->nvstore.sectsz_bits = 9;
2706 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2707 	     sc->nvstore.sectsz_bits++);
2708 
2709 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2710 		sc->max_queues = NVME_QUEUES;
2711 
2712 	return (0);
2713 }
2714 
2715 static int
2716 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
2717 {
2718 	struct pci_nvme_softc *sc;
2719 	uint32_t pci_membar_sz;
2720 	int	error;
2721 
2722 	error = 0;
2723 
2724 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2725 	pi->pi_arg = sc;
2726 	sc->nsc_pi = pi;
2727 
2728 	error = pci_nvme_parse_config(sc, nvl);
2729 	if (error < 0)
2730 		goto done;
2731 	else
2732 		error = 0;
2733 
2734 	STAILQ_INIT(&sc->ioreqs_free);
2735 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2736 	for (int i = 0; i < sc->ioslots; i++) {
2737 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2738 	}
2739 
2740 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2741 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2742 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2743 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2744 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2745 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2746 
2747 	/*
2748 	 * Allocate size of NVMe registers + doorbell space for all queues.
2749 	 *
2750 	 * The specification requires a minimum memory I/O window size of 16K.
2751 	 * The Windows driver will refuse to start a device with a smaller
2752 	 * window.
2753 	 */
2754 	pci_membar_sz = sizeof(struct nvme_registers) +
2755 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2756 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2757 
2758 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2759 
2760 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2761 	if (error) {
2762 		WPRINTF("%s pci alloc mem bar failed", __func__);
2763 		goto done;
2764 	}
2765 
2766 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2767 	if (error) {
2768 		WPRINTF("%s pci add msixcap failed", __func__);
2769 		goto done;
2770 	}
2771 
2772 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2773 	if (error) {
2774 		WPRINTF("%s pci add Express capability failed", __func__);
2775 		goto done;
2776 	}
2777 
2778 	pthread_mutex_init(&sc->mtx, NULL);
2779 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2780 
2781 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2782 	/*
2783 	 * Controller data depends on Namespace data so initialize Namespace
2784 	 * data first.
2785 	 */
2786 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2787 	pci_nvme_init_ctrldata(sc);
2788 	pci_nvme_init_logpages(sc);
2789 	pci_nvme_init_features(sc);
2790 
2791 	pci_nvme_aer_init(sc);
2792 
2793 	pci_nvme_reset(sc);
2794 
2795 	pci_lintr_request(pi);
2796 
2797 done:
2798 	return (error);
2799 }
2800 
2801 static int
2802 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
2803 {
2804 	char *cp, *ram;
2805 
2806 	if (opts == NULL)
2807 		return (0);
2808 
2809 	if (strncmp(opts, "ram=", 4) == 0) {
2810 		cp = strchr(opts, ',');
2811 		if (cp == NULL) {
2812 			set_config_value_node(nvl, "ram", opts + 4);
2813 			return (0);
2814 		}
2815 		ram = strndup(opts + 4, cp - opts - 4);
2816 		set_config_value_node(nvl, "ram", ram);
2817 		free(ram);
2818 		return (pci_parse_legacy_config(nvl, cp + 1));
2819 	} else
2820 		return (blockif_legacy_config(nvl, opts));
2821 }
2822 
/*
 * Device model descriptor for the "nvme" emulation: names the device and
 * supplies its init, legacy option parsing and BAR read/write handlers.
 * PCI_EMUL_SET() registers the descriptor with the PCI emulation code.
 */
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = pci_nvme_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
2831