xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision 25a9a7aaf35c7e4a2b5a57d3875af906147710d5)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
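/*
 * Example configurations (illustrative only; the slot numbers and backing
 * device paths below are hypothetical):
 *
 *  -s 4,nvme,/dev/zvol/rdsk/tank/nvmedisk,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=BHYVENVME,dsm=auto
 *  -s 5,nvme,ram=1024,ser=BHYVENVMERAM
 */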
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <semaphore.h>
72 #include <stdbool.h>
73 #include <stddef.h>
74 #include <stdint.h>
75 #include <stdio.h>
76 #include <stdlib.h>
77 #include <string.h>
78 
79 #include <machine/atomic.h>
80 #include <machine/vmm.h>
81 #include <vmmapi.h>
82 
83 #include <dev/nvme/nvme.h>
84 
85 #include "bhyverun.h"
86 #include "block_if.h"
87 #include "debug.h"
88 #include "pci_emul.h"
89 
90 
91 static int nvme_debug = 0;
92 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
93 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
94 
95 /* defaults; can be overridden */
96 #define	NVME_MSIX_BAR		4
97 
98 #define	NVME_IOSLOTS		8
99 
100 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
101 #define NVME_MMIO_SPACE_MIN	(1 << 14)
102 
103 #define	NVME_QUEUES		16
104 #define	NVME_MAX_QENTRIES	2048
105 /* Memory Page size Minimum reported in CAP register */
106 #define	NVME_MPSMIN		0
107 /* MPSMIN converted to bytes */
108 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
109 
110 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
111 #define	NVME_MDTS		9
112 /* Note the + 1 allows for the initial descriptor to not be page aligned */
113 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
114 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
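/*
 * With the defaults above this works out to: NVME_MPSMIN_BYTES = 4 KiB,
 * NVME_MAX_DATA_SIZE = (1 << 9) * 4 KiB = 2 MiB, and NVME_MAX_IOVEC =
 * 512 + 1 = 513 page descriptors.
 */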
115 
116 /* This is a synthetic status code to indicate there is no status */
117 #define NVME_NO_STATUS		0xffff
118 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
119 
120 /* helpers */
121 
122 /* Convert a zero-based value into a one-based value */
123 #define ONE_BASED(zero)		((zero) + 1)
124 /* Convert a one-based value into a zero-based value */
125 #define ZERO_BASED(one)		((one)  - 1)
126 
127 /* Encode number of SQ's and CQ's for Set/Get Features */
128 #define NVME_FEATURE_NUM_QUEUES(sc) \
129 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
130 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
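/*
 * For example, when num_squeues and num_cqueues are both 4, the macro above
 * encodes each count as a zero-based 16-bit field and yields 0x00030003.
 */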
131 
132 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
133 
134 enum nvme_controller_register_offsets {
135 	NVME_CR_CAP_LOW = 0x00,
136 	NVME_CR_CAP_HI  = 0x04,
137 	NVME_CR_VS      = 0x08,
138 	NVME_CR_INTMS   = 0x0c,
139 	NVME_CR_INTMC   = 0x10,
140 	NVME_CR_CC      = 0x14,
141 	NVME_CR_CSTS    = 0x1c,
142 	NVME_CR_NSSR    = 0x20,
143 	NVME_CR_AQA     = 0x24,
144 	NVME_CR_ASQ_LOW = 0x28,
145 	NVME_CR_ASQ_HI  = 0x2c,
146 	NVME_CR_ACQ_LOW = 0x30,
147 	NVME_CR_ACQ_HI  = 0x34,
148 };
149 
150 enum nvme_cmd_cdw11 {
151 	NVME_CMD_CDW11_PC  = 0x0001,
152 	NVME_CMD_CDW11_IEN = 0x0002,
153 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
154 };
155 
156 enum nvme_copy_dir {
157 	NVME_COPY_TO_PRP,
158 	NVME_COPY_FROM_PRP,
159 };
160 
161 #define	NVME_CQ_INTEN	0x01
162 #define	NVME_CQ_INTCOAL	0x02
163 
164 struct nvme_completion_queue {
165 	struct nvme_completion *qbase;
166 	pthread_mutex_t	mtx;
167 	uint32_t	size;
168 	uint16_t	tail; /* nvme progress */
169 	uint16_t	head; /* guest progress */
170 	uint16_t	intr_vec;
171 	uint32_t	intr_en;
172 };
173 
174 struct nvme_submission_queue {
175 	struct nvme_command *qbase;
176 	pthread_mutex_t	mtx;
177 	uint32_t	size;
178 	uint16_t	head; /* nvme progress */
179 	uint16_t	tail; /* guest progress */
180 	uint16_t	cqid; /* completion queue id */
181 	int		qpriority;
182 };
183 
184 enum nvme_storage_type {
185 	NVME_STOR_BLOCKIF = 0,
186 	NVME_STOR_RAM = 1,
187 };
188 
189 struct pci_nvme_blockstore {
190 	enum nvme_storage_type type;
191 	void		*ctx;
192 	uint64_t	size;
193 	uint32_t	sectsz;
194 	uint32_t	sectsz_bits;
195 	uint64_t	eui64;
196 	uint32_t	deallocate:1;
197 };
198 
199 /*
200  * Calculate the number of additional page descriptors for guest IO requests
201  * based on the advertised Max Data Transfer (MDTS) and given the number of
202  * default iovec's in a struct blockif_req.
203  *
204  * Note the + 1 allows for the initial descriptor to not be page aligned.
205  */
206 #define MDTS_PAD_SIZE \
207 	NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	0
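/*
 * Illustrative sizing only (BLOCKIF_IOV_MAX is defined in block_if.h; the
 * value assumed here is hypothetical): if BLOCKIF_IOV_MAX were 128 and
 * NVME_MAX_IOVEC is 513, MDTS_PAD_SIZE adds 385 iovec entries so that a
 * maximally fragmented MDTS-sized request still fits in one blockif_req.
 */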
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 struct pci_nvme_aer {
258 	STAILQ_ENTRY(pci_nvme_aer) link;
259 	uint16_t	cid;	/* Command ID of the submitted AER */
260 };
261 
262 struct pci_nvme_softc {
263 	struct pci_devinst *nsc_pi;
264 
265 	pthread_mutex_t	mtx;
266 
267 	struct nvme_registers regs;
268 
269 	struct nvme_namespace_data  nsdata;
270 	struct nvme_controller_data ctrldata;
271 	struct nvme_error_information_entry err_log;
272 	struct nvme_health_information_page health_log;
273 	struct nvme_firmware_page fw_log;
274 
275 	struct pci_nvme_blockstore nvstore;
276 
277 	uint16_t	max_qentries;	/* max entries per queue */
278 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
279 	uint32_t	num_cqueues;
280 	uint32_t	num_squeues;
281 	bool		num_q_is_set; /* Has host set Number of Queues */
282 
283 	struct pci_nvme_ioreq *ioreqs;
284 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
285 	uint32_t	pending_ios;
286 	uint32_t	ioslots;
287 	sem_t		iosemlock;
288 
289 	/*
290 	 * Memory mapped Submission and Completion queues
291 	 * Each array includes both Admin and IO queues
292 	 */
293 	struct nvme_completion_queue *compl_queues;
294 	struct nvme_submission_queue *submit_queues;
295 
296 	struct nvme_feature_obj feat[NVME_FID_MAX];
297 
298 	enum nvme_dsm_type dataset_management;
299 
300 	/* Accounting for SMART data */
301 	__uint128_t	read_data_units;
302 	__uint128_t	write_data_units;
303 	__uint128_t	read_commands;
304 	__uint128_t	write_commands;
305 	uint32_t	read_dunits_remainder;
306 	uint32_t	write_dunits_remainder;
307 
308 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
309 	uint32_t	aer_count;
310 };
311 
312 
313 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
314 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
315 static void pci_nvme_io_done(struct blockif_req *, int);
316 
317 /* Controller Configuration utils */
318 #define	NVME_CC_GET_EN(cc) \
319 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
320 #define	NVME_CC_GET_CSS(cc) \
321 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
322 #define	NVME_CC_GET_SHN(cc) \
323 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
324 #define	NVME_CC_GET_IOSQES(cc) \
325 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
326 #define	NVME_CC_GET_IOCQES(cc) \
327 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
328 
329 #define	NVME_CC_WRITE_MASK \
330 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
331 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
332 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
333 
334 #define	NVME_CC_NEN_WRITE_MASK \
335 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
336 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
337 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
338 
339 /* Controller Status utils */
340 #define	NVME_CSTS_GET_RDY(sts) \
341 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
342 
343 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
344 
345 /* Completion Queue status word utils */
346 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
347 #define	NVME_STATUS_MASK \
348 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
349 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
350 
351 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
352 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
353 
354 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
355     struct nvme_feature_obj *,
356     struct nvme_command *,
357     struct nvme_completion *);
358 static void nvme_feature_num_queues(struct pci_nvme_softc *,
359     struct nvme_feature_obj *,
360     struct nvme_command *,
361     struct nvme_completion *);
362 static void nvme_feature_iv_config(struct pci_nvme_softc *,
363     struct nvme_feature_obj *,
364     struct nvme_command *,
365     struct nvme_completion *);
366 
367 static __inline void
368 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
369 {
370 	size_t len;
371 
372 	len = strnlen(src, dst_size);
373 	memset(dst, pad, dst_size);
374 	memcpy(dst, src, len);
375 }
376 
377 static __inline void
378 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
379 {
380 
381 	*status &= ~NVME_STATUS_MASK;
382 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
383 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
384 }
385 
386 static __inline void
387 pci_nvme_status_genc(uint16_t *status, uint16_t code)
388 {
389 
390 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
391 }
392 
393 /*
394  * Initialize the requested number of IO Submission and Completion Queues.
395  * Admin queues are allocated implicitly.
396  */
397 static void
398 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
399 {
400 	uint32_t i;
401 
402 	/*
403 	 * Allocate and initialize the Submission Queues
404 	 */
405 	if (nsq > NVME_QUEUES) {
406 		WPRINTF("%s: clamping number of SQ from %u to %u",
407 					__func__, nsq, NVME_QUEUES);
408 		nsq = NVME_QUEUES;
409 	}
410 
411 	sc->num_squeues = nsq;
412 
413 	sc->submit_queues = calloc(sc->num_squeues + 1,
414 				sizeof(struct nvme_submission_queue));
415 	if (sc->submit_queues == NULL) {
416 		WPRINTF("%s: SQ allocation failed", __func__);
417 		sc->num_squeues = 0;
418 	} else {
419 		struct nvme_submission_queue *sq = sc->submit_queues;
420 
421 		for (i = 0; i < sc->num_squeues; i++)
422 			pthread_mutex_init(&sq[i].mtx, NULL);
423 	}
424 
425 	/*
426 	 * Allocate and initialize the Completion Queues
427 	 */
428 	if (ncq > NVME_QUEUES) {
429 		WPRINTF("%s: clamping number of CQ from %u to %u",
430 					__func__, ncq, NVME_QUEUES);
431 		ncq = NVME_QUEUES;
432 	}
433 
434 	sc->num_cqueues = ncq;
435 
436 	sc->compl_queues = calloc(sc->num_cqueues + 1,
437 				sizeof(struct nvme_completion_queue));
438 	if (sc->compl_queues == NULL) {
439 		WPRINTF("%s: CQ allocation failed", __func__);
440 		sc->num_cqueues = 0;
441 	} else {
442 		struct nvme_completion_queue *cq = sc->compl_queues;
443 
444 		for (i = 0; i < sc->num_cqueues; i++)
445 			pthread_mutex_init(&cq[i].mtx, NULL);
446 	}
447 }
448 
449 static void
450 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
451 {
452 	struct nvme_controller_data *cd = &sc->ctrldata;
453 
454 	cd->vid = 0xFB5D;
455 	cd->ssvid = 0x0000;
456 
457 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
458 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
459 
460 	/* Num of submission commands that we can handle at a time (2^rab) */
461 	cd->rab   = 4;
462 
463 	/* FreeBSD OUI */
464 	cd->ieee[0] = 0x58;
465 	cd->ieee[1] = 0x9c;
466 	cd->ieee[2] = 0xfc;
467 
468 	cd->mic = 0;
469 
470 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
471 
472 	cd->ver = 0x00010300;
473 
474 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
475 	cd->acl = 2;
476 	cd->aerl = 4;
477 
478 	/* Advertise 1, Read-only firmware slot */
479 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
480 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
481 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
482 	cd->elpe = 0;	/* max error log page entries */
483 	cd->npss = 1;	/* number of power states supported */
484 
485 	/* Warning Composite Temperature Threshold */
486 	cd->wctemp = 0x0157;
487 
488 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
489 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
490 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
491 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
492 	cd->nn = 1;	/* number of namespaces */
493 
494 	cd->oncs = 0;
495 	switch (sc->dataset_management) {
496 	case NVME_DATASET_MANAGEMENT_AUTO:
497 		if (sc->nvstore.deallocate)
498 			cd->oncs |= NVME_ONCS_DSM;
499 		break;
500 	case NVME_DATASET_MANAGEMENT_ENABLE:
501 		cd->oncs |= NVME_ONCS_DSM;
502 		break;
503 	default:
504 		break;
505 	}
506 
507 	cd->fna = 0x03;
508 
509 	cd->power_state[0].mp = 10;
510 }
511 
512 /*
513  * Calculate the CRC-16 of the given buffer
514  * See copyright attribution at top of file
515  */
516 static uint16_t
517 crc16(uint16_t crc, const void *buffer, unsigned int len)
518 {
519 	const unsigned char *cp = buffer;
520 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
521 	static uint16_t const crc16_table[256] = {
522 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
523 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
524 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
525 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
526 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
527 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
528 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
529 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
530 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
531 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
532 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
533 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
534 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
535 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
536 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
537 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
538 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
539 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
540 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
541 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
542 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
543 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
544 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
545 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
546 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
547 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
548 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
549 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
550 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
551 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
552 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
553 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
554 	};
555 
556 	while (len--)
557 		crc = (((crc >> 8) & 0xffU) ^
558 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
559 	return crc;
560 }
561 
562 static void
563 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
564     struct nvme_namespace_data *nd, uint32_t nsid,
565     struct pci_nvme_blockstore *nvstore)
566 {
567 
568 	/* Get capacity and block size information from backing store */
569 	nd->nsze = nvstore->size / nvstore->sectsz;
570 	nd->ncap = nd->nsze;
571 	nd->nuse = nd->nsze;
572 
573 	if (nvstore->type == NVME_STOR_BLOCKIF)
574 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
575 
576 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
577 	nd->flbas = 0;
578 
579 	/* Create an EUI-64 if user did not provide one */
580 	if (nvstore->eui64 == 0) {
581 		char *data = NULL;
582 		uint64_t eui64 = nvstore->eui64;
583 
584 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
585 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
586 
587 		if (data != NULL) {
588 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
589 			free(data);
590 		}
591 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
592 	}
593 	be64enc(nd->eui64, nvstore->eui64);
594 
595 	/* LBA data-sz = 2^lbads */
596 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
597 }
598 
599 static void
600 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
601 {
602 
603 	memset(&sc->err_log, 0, sizeof(sc->err_log));
604 	memset(&sc->health_log, 0, sizeof(sc->health_log));
605 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
606 
607 	/* Set read/write remainder to round up according to spec */
608 	sc->read_dunits_remainder = 999;
609 	sc->write_dunits_remainder = 999;
610 
611 	/* Set nominal Health values checked by implementations */
612 	sc->health_log.temperature = 310;
613 	sc->health_log.available_spare = 100;
614 	sc->health_log.available_spare_threshold = 10;
615 }
616 
617 static void
618 pci_nvme_init_features(struct pci_nvme_softc *sc)
619 {
620 
621 	sc->feat[0].set = nvme_feature_invalid_cb;
622 	sc->feat[0].get = nvme_feature_invalid_cb;
623 
624 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
625 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
626 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
627 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
628 	    nvme_feature_iv_config;
629 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
630 	    nvme_feature_invalid_cb;
631 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
632 	    nvme_feature_invalid_cb;
633 }
634 
635 static void
636 pci_nvme_aer_init(struct pci_nvme_softc *sc)
637 {
638 
639 	STAILQ_INIT(&sc->aer_list);
640 	sc->aer_count = 0;
641 }
642 
643 static void
644 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
645 {
646 	struct pci_nvme_aer *aer = NULL;
647 
648 	while (!STAILQ_EMPTY(&sc->aer_list)) {
649 		aer = STAILQ_FIRST(&sc->aer_list);
650 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
651 		free(aer);
652 	}
653 
654 	pci_nvme_aer_init(sc);
655 }
656 
657 #ifdef __FreeBSD__
658 static bool
659 pci_nvme_aer_available(struct pci_nvme_softc *sc)
660 {
661 
662 	return (!STAILQ_EMPTY(&sc->aer_list));
663 }
664 #else
665 /* This is kept behind an ifdef while it's unused to appease the compiler. */
666 #endif
667 
668 static bool
669 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
670 {
671 	struct nvme_controller_data *cd = &sc->ctrldata;
672 
673 	/* AERL is a zero-based value while aer_count is one-based */
674 	return (sc->aer_count == (cd->aerl + 1));
675 }
676 
677 /*
678  * Add an Async Event Request
679  *
680  * Stores an AER to be returned later if the Controller needs to notify the
681  * host of an event.
682  * Note that while the NVMe spec doesn't require Controllers to return AER's
683  * in order, this implementation does preserve the order.
684  */
685 static int
686 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
687 {
688 	struct pci_nvme_aer *aer = NULL;
689 
690 	if (pci_nvme_aer_limit_reached(sc))
691 		return (-1);
692 
693 	aer = calloc(1, sizeof(struct pci_nvme_aer));
694 	if (aer == NULL)
695 		return (-1);
696 
697 	sc->aer_count++;
698 
699 	/* Save the Command ID for use in the completion message */
700 	aer->cid = cid;
701 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
702 
703 	return (0);
704 }
705 
706 /*
707  * Get an Async Event Request structure
708  *
709  * Returns a pointer to an AER previously submitted by the host or NULL if
710  * no AER's exist. Caller is responsible for freeing the returned struct.
711  */
712 #ifdef __FreeBSD__
713 static struct pci_nvme_aer *
714 pci_nvme_aer_get(struct pci_nvme_softc *sc)
715 {
716 	struct pci_nvme_aer *aer = NULL;
717 
718 	aer = STAILQ_FIRST(&sc->aer_list);
719 	if (aer != NULL) {
720 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
721 		sc->aer_count--;
722 	}
723 
724 	return (aer);
725 }
726 #else
727 /* This is kept behind an ifdef while it's unused to appease the compiler. */
728 #endif
729 
730 static void
731 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
732 {
733 	uint32_t i;
734 
735 	DPRINTF("%s", __func__);
736 
737 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
738 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
739 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
740 
741 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
742 
743 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
744 
745 	sc->regs.cc = 0;
746 	sc->regs.csts = 0;
747 
748 	assert(sc->submit_queues != NULL);
749 
750 	for (i = 0; i < sc->num_squeues + 1; i++) {
751 		sc->submit_queues[i].qbase = NULL;
752 		sc->submit_queues[i].size = 0;
753 		sc->submit_queues[i].cqid = 0;
754 		sc->submit_queues[i].tail = 0;
755 		sc->submit_queues[i].head = 0;
756 	}
757 
758 	assert(sc->compl_queues != NULL);
759 
760 	for (i = 0; i < sc->num_cqueues + 1; i++) {
761 		sc->compl_queues[i].qbase = NULL;
762 		sc->compl_queues[i].size = 0;
763 		sc->compl_queues[i].tail = 0;
764 		sc->compl_queues[i].head = 0;
765 	}
766 
767 	sc->num_q_is_set = false;
768 
769 	pci_nvme_aer_destroy(sc);
770 }
771 
772 static void
773 pci_nvme_reset(struct pci_nvme_softc *sc)
774 {
775 	pthread_mutex_lock(&sc->mtx);
776 	pci_nvme_reset_locked(sc);
777 	pthread_mutex_unlock(&sc->mtx);
778 }
779 
780 static void
781 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
782 {
783 	uint16_t acqs, asqs;
784 
785 	DPRINTF("%s", __func__);
786 
787 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
788 	sc->submit_queues[0].size = asqs;
789 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
790 	            sizeof(struct nvme_command) * asqs);
791 
792 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
793 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
794 
795 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
796 	    NVME_AQA_REG_ACQS_MASK) + 1;
797 	sc->compl_queues[0].size = acqs;
798 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
799 	         sizeof(struct nvme_completion) * acqs);
800 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
801 
802 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
803 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
804 }
805 
806 static int
807 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
808 	size_t len, enum nvme_copy_dir dir)
809 {
810 	uint8_t *p;
811 	size_t bytes;
812 
813 	if (len > (8 * 1024)) {
814 		return (-1);
815 	}
816 
817 	/* Copy from the start of prp1 to the end of the physical page */
818 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
819 	bytes = MIN(bytes, len);
820 
821 	p = vm_map_gpa(ctx, prp1, bytes);
822 	if (p == NULL) {
823 		return (-1);
824 	}
825 
826 	if (dir == NVME_COPY_TO_PRP)
827 		memcpy(p, b, bytes);
828 	else
829 		memcpy(b, p, bytes);
830 
831 	b += bytes;
832 
833 	len -= bytes;
834 	if (len == 0) {
835 		return (0);
836 	}
837 
838 	len = MIN(len, PAGE_SIZE);
839 
840 	p = vm_map_gpa(ctx, prp2, len);
841 	if (p == NULL) {
842 		return (-1);
843 	}
844 
845 	if (dir == NVME_COPY_TO_PRP)
846 		memcpy(p, b, len);
847 	else
848 		memcpy(b, p, len);
849 
850 	return (0);
851 }
852 
853 /*
854  * Write a Completion Queue Entry update
855  *
856  * Write the completion and update the doorbell value
857  */
858 static void
859 pci_nvme_cq_update(struct pci_nvme_softc *sc,
860 		struct nvme_completion_queue *cq,
861 		uint32_t cdw0,
862 		uint16_t cid,
863 		uint16_t sqid,
864 		uint16_t status)
865 {
866 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
867 	struct nvme_completion *cqe;
868 
869 	assert(cq->qbase != NULL);
870 
871 	pthread_mutex_lock(&cq->mtx);
872 
873 	cqe = &cq->qbase[cq->tail];
874 
875 	/* Flip the phase bit */
876 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
877 
878 	cqe->cdw0 = cdw0;
879 	cqe->sqhd = sq->head;
880 	cqe->sqid = sqid;
881 	cqe->cid = cid;
882 	cqe->status = status;
883 
884 	cq->tail++;
885 	if (cq->tail >= cq->size) {
886 		cq->tail = 0;
887 	}
888 
889 	pthread_mutex_unlock(&cq->mtx);
890 }
891 
892 static int
893 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
894 	struct nvme_completion* compl)
895 {
896 	uint16_t qid = command->cdw10 & 0xffff;
897 
898 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
899 	if (qid == 0 || qid > sc->num_squeues ||
900 	    (sc->submit_queues[qid].qbase == NULL)) {
901 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
902 		        __func__, qid, sc->num_squeues);
903 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
904 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
905 		return (1);
906 	}
907 
908 	sc->submit_queues[qid].qbase = NULL;
909 	sc->submit_queues[qid].cqid = 0;
910 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
911 	return (1);
912 }
913 
914 static int
915 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
916 	struct nvme_completion* compl)
917 {
918 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
919 		uint16_t qid = command->cdw10 & 0xffff;
920 		struct nvme_submission_queue *nsq;
921 
922 		if ((qid == 0) || (qid > sc->num_squeues) ||
923 		    (sc->submit_queues[qid].qbase != NULL)) {
924 			WPRINTF("%s queue index %u > num_squeues %u",
925 			        __func__, qid, sc->num_squeues);
926 			pci_nvme_status_tc(&compl->status,
927 			    NVME_SCT_COMMAND_SPECIFIC,
928 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
929 			return (1);
930 		}
931 
932 		nsq = &sc->submit_queues[qid];
933 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
934 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
935 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
936 			/*
937 			 * Queues must specify at least two entries
938 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
939 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
940 			 */
941 			pci_nvme_status_tc(&compl->status,
942 			    NVME_SCT_COMMAND_SPECIFIC,
943 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
944 			return (1);
945 		}
946 		nsq->head = nsq->tail = 0;
947 
948 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
949 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
950 			pci_nvme_status_tc(&compl->status,
951 			    NVME_SCT_COMMAND_SPECIFIC,
952 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
953 			return (1);
954 		}
955 
956 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
957 			pci_nvme_status_tc(&compl->status,
958 			    NVME_SCT_COMMAND_SPECIFIC,
959 			    NVME_SC_COMPLETION_QUEUE_INVALID);
960 			return (1);
961 		}
962 
963 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
964 
965 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
966 		              sizeof(struct nvme_command) * (size_t)nsq->size);
967 
968 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
969 		        qid, nsq->size, nsq->qbase, nsq->cqid);
970 
971 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
972 
973 		DPRINTF("%s completed creating IOSQ qid %u",
974 		         __func__, qid);
975 	} else {
976 		/*
977 		 * Guest sent non-cont submission queue request.
978 		 * This setting is unsupported by this emulation.
979 		 */
980 		WPRINTF("%s unsupported non-contig (list-based) "
981 		         "create i/o submission queue", __func__);
982 
983 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
984 	}
985 	return (1);
986 }
987 
988 static int
989 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
990 	struct nvme_completion* compl)
991 {
992 	uint16_t qid = command->cdw10 & 0xffff;
993 	uint16_t sqid;
994 
995 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
996 	if (qid == 0 || qid > sc->num_cqueues ||
997 	    (sc->compl_queues[qid].qbase == NULL)) {
998 		WPRINTF("%s queue index %u / num_cqueues %u",
999 		        __func__, qid, sc->num_cqueues);
1000 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1001 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1002 		return (1);
1003 	}
1004 
1005 	/* Deleting an Active CQ is an error */
1006 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1007 		if (sc->submit_queues[sqid].cqid == qid) {
1008 			pci_nvme_status_tc(&compl->status,
1009 			    NVME_SCT_COMMAND_SPECIFIC,
1010 			    NVME_SC_INVALID_QUEUE_DELETION);
1011 			return (1);
1012 		}
1013 
1014 	sc->compl_queues[qid].qbase = NULL;
1015 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1016 	return (1);
1017 }
1018 
1019 static int
1020 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1021 	struct nvme_completion* compl)
1022 {
1023 	struct nvme_completion_queue *ncq;
1024 	uint16_t qid = command->cdw10 & 0xffff;
1025 
1026 	/* Only support Physically Contiguous queues */
1027 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1028 		WPRINTF("%s unsupported non-contig (list-based) "
1029 		         "create i/o completion queue",
1030 		         __func__);
1031 
1032 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1033 		return (1);
1034 	}
1035 
1036 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1037 	    (sc->compl_queues[qid].qbase != NULL)) {
1038 		WPRINTF("%s queue index %u > num_cqueues %u",
1039 			__func__, qid, sc->num_cqueues);
1040 		pci_nvme_status_tc(&compl->status,
1041 		    NVME_SCT_COMMAND_SPECIFIC,
1042 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1043 		return (1);
1044  	}
1045 
1046 	ncq = &sc->compl_queues[qid];
1047 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1048 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1049 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1050 		pci_nvme_status_tc(&compl->status,
1051 		    NVME_SCT_COMMAND_SPECIFIC,
1052 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1053 		return (1);
1054 	}
1055 
1056 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1057 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1058 		/*
1059 		 * Queues must specify at least two entries
1060 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1061 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1062 		 */
1063 		pci_nvme_status_tc(&compl->status,
1064 		    NVME_SCT_COMMAND_SPECIFIC,
1065 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1066 		return (1);
1067 	}
1068 	ncq->head = ncq->tail = 0;
1069 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1070 		     command->prp1,
1071 		     sizeof(struct nvme_completion) * (size_t)ncq->size);
1072 
1073 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1074 
1075 
1076 	return (1);
1077 }
1078 
1079 static int
1080 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1081 	struct nvme_completion* compl)
1082 {
1083 	uint32_t logsize = 0;
1084 	uint8_t logpage = command->cdw10 & 0xFF;
1085 
1086 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1087 
1088 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1089 
1090 	/*
1091 	 * Command specifies the number of dwords to return in fields NUMDU
1092 	 * and NUMDL. This is a zero-based value.
1093 	 */
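	/*
	 * Worked example (values are illustrative): NUMDL = 0x03ff in
	 * cdw10[31:16] with NUMDU = 0 requests 0x400 dwords, i.e. 4096 bytes.
	 */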
1094 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1095 	logsize *= sizeof(uint32_t);
1096 
1097 	switch (logpage) {
1098 	case NVME_LOG_ERROR:
1099 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1100 		    command->prp2, (uint8_t *)&sc->err_log,
1101 		    MIN(logsize, sizeof(sc->err_log)),
1102 		    NVME_COPY_TO_PRP);
1103 		break;
1104 	case NVME_LOG_HEALTH_INFORMATION:
1105 		pthread_mutex_lock(&sc->mtx);
1106 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1107 		    sizeof(sc->health_log.data_units_read));
1108 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1109 		    sizeof(sc->health_log.data_units_written));
1110 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1111 		    sizeof(sc->health_log.host_read_commands));
1112 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1113 		    sizeof(sc->health_log.host_write_commands));
1114 		pthread_mutex_unlock(&sc->mtx);
1115 
1116 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1117 		    command->prp2, (uint8_t *)&sc->health_log,
1118 		    MIN(logsize, sizeof(sc->health_log)),
1119 		    NVME_COPY_TO_PRP);
1120 		break;
1121 	case NVME_LOG_FIRMWARE_SLOT:
1122 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1123 		    command->prp2, (uint8_t *)&sc->fw_log,
1124 		    MIN(logsize, sizeof(sc->fw_log)),
1125 		    NVME_COPY_TO_PRP);
1126 		break;
1127 	default:
1128 		DPRINTF("%s get log page %x command not supported",
1129 		        __func__, logpage);
1130 
1131 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1132 		    NVME_SC_INVALID_LOG_PAGE);
1133 	}
1134 
1135 	return (1);
1136 }
1137 
1138 static int
1139 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1140 	struct nvme_completion* compl)
1141 {
1142 	void *dest;
1143 	uint16_t status = 0;
1144 
1145 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1146 	        command->cdw10 & 0xFF, command->nsid);
1147 
1148 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1149 
1150 	switch (command->cdw10 & 0xFF) {
1151 	case 0x00: /* return Identify Namespace data structure */
1152 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1153 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1154 		    NVME_COPY_TO_PRP);
1155 		break;
1156 	case 0x01: /* return Identify Controller data structure */
1157 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1158 		    command->prp2, (uint8_t *)&sc->ctrldata,
1159 		    sizeof(sc->ctrldata),
1160 		    NVME_COPY_TO_PRP);
1161 		break;
1162 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1163 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1164 		                  sizeof(uint32_t) * 1024);
1165 		/* All unused entries shall be zero */
1166 		bzero(dest, sizeof(uint32_t) * 1024);
1167 		((uint32_t *)dest)[0] = 1;
1168 		break;
1169 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1170 		if (command->nsid != 1) {
1171 			pci_nvme_status_genc(&status,
1172 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1173 			break;
1174 		}
1175 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1176 		                  sizeof(uint32_t) * 1024);
1177 		/* All bytes after the descriptor shall be zero */
1178 		bzero(dest, sizeof(uint32_t) * 1024);
1179 
1180 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1181 		((uint8_t *)dest)[0] = 1;
1182 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1183 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1184 		break;
1185 	default:
1186 		DPRINTF("%s unsupported identify command requested 0x%x",
1187 		         __func__, command->cdw10 & 0xFF);
1188 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1189 		break;
1190 	}
1191 
1192 	compl->status = status;
1193 	return (1);
1194 }
1195 
1196 static const char *
1197 nvme_fid_to_name(uint8_t fid)
1198 {
1199 	const char *name;
1200 
1201 	switch (fid) {
1202 	case NVME_FEAT_ARBITRATION:
1203 		name = "Arbitration";
1204 		break;
1205 	case NVME_FEAT_POWER_MANAGEMENT:
1206 		name = "Power Management";
1207 		break;
1208 	case NVME_FEAT_LBA_RANGE_TYPE:
1209 		name = "LBA Range Type";
1210 		break;
1211 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1212 		name = "Temperature Threshold";
1213 		break;
1214 	case NVME_FEAT_ERROR_RECOVERY:
1215 		name = "Error Recovery";
1216 		break;
1217 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1218 		name = "Volatile Write Cache";
1219 		break;
1220 	case NVME_FEAT_NUMBER_OF_QUEUES:
1221 		name = "Number of Queues";
1222 		break;
1223 	case NVME_FEAT_INTERRUPT_COALESCING:
1224 		name = "Interrupt Coalescing";
1225 		break;
1226 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1227 		name = "Interrupt Vector Configuration";
1228 		break;
1229 	case NVME_FEAT_WRITE_ATOMICITY:
1230 		name = "Write Atomicity Normal";
1231 		break;
1232 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1233 		name = "Asynchronous Event Configuration";
1234 		break;
1235 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1236 		name = "Autonomous Power State Transition";
1237 		break;
1238 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1239 		name = "Host Memory Buffer";
1240 		break;
1241 	case NVME_FEAT_TIMESTAMP:
1242 		name = "Timestamp";
1243 		break;
1244 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1245 		name = "Keep Alive Timer";
1246 		break;
1247 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1248 		name = "Host Controlled Thermal Management";
1249 		break;
1250 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1251 		name = "Non-Operation Power State Config";
1252 		break;
1253 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1254 		name = "Read Recovery Level Config";
1255 		break;
1256 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1257 		name = "Predictable Latency Mode Config";
1258 		break;
1259 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1260 		name = "Predictable Latency Mode Window";
1261 		break;
1262 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1263 		name = "LBA Status Information Report Interval";
1264 		break;
1265 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1266 		name = "Host Behavior Support";
1267 		break;
1268 	case NVME_FEAT_SANITIZE_CONFIG:
1269 		name = "Sanitize Config";
1270 		break;
1271 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1272 		name = "Endurance Group Event Configuration";
1273 		break;
1274 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1275 		name = "Software Progress Marker";
1276 		break;
1277 	case NVME_FEAT_HOST_IDENTIFIER:
1278 		name = "Host Identifier";
1279 		break;
1280 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1281 		name = "Reservation Notification Mask";
1282 		break;
1283 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1284 		name = "Reservation Persistence";
1285 		break;
1286 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1287 		name = "Namespace Write Protection Config";
1288 		break;
1289 	default:
1290 		name = "Unknown";
1291 		break;
1292 	}
1293 
1294 	return (name);
1295 }
1296 
1297 static void
1298 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1299     struct nvme_feature_obj *feat,
1300     struct nvme_command *command,
1301     struct nvme_completion *compl)
1302 {
1303 
1304 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1305 }
1306 
1307 static void
1308 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1309     struct nvme_feature_obj *feat,
1310     struct nvme_command *command,
1311     struct nvme_completion *compl)
1312 {
1313 	uint32_t i;
1314 	uint32_t cdw11 = command->cdw11;
1315 	uint16_t iv;
1316 	bool cd;
1317 
1318 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1319 
1320 	iv = cdw11 & 0xffff;
1321 	cd = cdw11 & (1 << 16);
1322 
1323 	if (iv > (sc->max_queues + 1)) {
1324 		return;
1325 	}
1326 
1327 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1328 	if ((iv == 0) && !cd)
1329 		return;
1330 
1331 	/* Requested Interrupt Vector must be used by a CQ */
1332 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1333 		if (sc->compl_queues[i].intr_vec == iv) {
1334 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1335 		}
1336 	}
1337 
1338 }
1339 
1340 static void
1341 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1342     struct nvme_feature_obj *feat,
1343     struct nvme_command *command,
1344     struct nvme_completion *compl)
1345 {
1346 	uint16_t nqr;	/* Number of Queues Requested */
1347 
1348 	if (sc->num_q_is_set) {
1349 		WPRINTF("%s: Number of Queues already set", __func__);
1350 		pci_nvme_status_genc(&compl->status,
1351 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1352 		return;
1353 	}
1354 
1355 	nqr = command->cdw11 & 0xFFFF;
1356 	if (nqr == 0xffff) {
1357 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1358 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1359 		return;
1360 	}
1361 
1362 	sc->num_squeues = ONE_BASED(nqr);
1363 	if (sc->num_squeues > sc->max_queues) {
1364 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1365 					sc->max_queues);
1366 		sc->num_squeues = sc->max_queues;
1367 	}
1368 
1369 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1370 	if (nqr == 0xffff) {
1371 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1372 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1373 		return;
1374 	}
1375 
1376 	sc->num_cqueues = ONE_BASED(nqr);
1377 	if (sc->num_cqueues > sc->max_queues) {
1378 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1379 					sc->max_queues);
1380 		sc->num_cqueues = sc->max_queues;
1381 	}
1382 
1383 	/* Patch the command value which will be saved on callback's return */
1384 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1385 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1386 
1387 	sc->num_q_is_set = true;
1388 }
1389 
1390 static int
1391 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1392 	struct nvme_completion *compl)
1393 {
1394 	struct nvme_feature_obj *feat;
1395 	uint32_t nsid = command->nsid;
1396 	uint8_t fid = command->cdw10 & 0xFF;
1397 
1398 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1399 
1400 	if (fid >= NVME_FID_MAX) {
1401 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1402 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1403 		return (1);
1404 	}
1405 	feat = &sc->feat[fid];
1406 
1407 	if (!feat->namespace_specific &&
1408 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1409 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1410 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1411 		return (1);
1412 	}
1413 
1414 	compl->cdw0 = 0;
1415 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1416 
1417 	if (feat->set)
1418 		feat->set(sc, feat, command, compl);
1419 
1420 	if (compl->status == NVME_SC_SUCCESS)
1421 		feat->cdw11 = command->cdw11;
1422 
1423 	return (0);
1424 }
1425 
1426 static int
1427 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1428 	struct nvme_completion* compl)
1429 {
1430 	struct nvme_feature_obj *feat;
1431 	uint8_t fid = command->cdw10 & 0xFF;
1432 
1433 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1434 
1435 	if (fid >= NVME_FID_MAX) {
1436 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1437 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1438 		return (1);
1439 	}
1440 
1441 	compl->cdw0 = 0;
1442 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1443 
1444 	feat = &sc->feat[fid];
1445 	if (feat->get) {
1446 		feat->get(sc, feat, command, compl);
1447 	}
1448 
1449 	if (compl->status == NVME_SC_SUCCESS) {
1450 		compl->cdw0 = feat->cdw11;
1451 	}
1452 
1453 	return (0);
1454 }
1455 
1456 static int
1457 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1458 	struct nvme_completion* compl)
1459 {
1460 	uint8_t	ses, lbaf, pi;
1461 
1462 	/* Only supports Secure Erase Setting - User Data Erase */
1463 	ses = (command->cdw10 >> 9) & 0x7;
1464 	if (ses > 0x1) {
1465 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1466 		return (1);
1467 	}
1468 
1469 	/* Only supports a single LBA Format */
1470 	lbaf = command->cdw10 & 0xf;
1471 	if (lbaf != 0) {
1472 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1473 		    NVME_SC_INVALID_FORMAT);
1474 		return (1);
1475 	}
1476 
1477 	/* Doesn't support Protection Information */
1478 	pi = (command->cdw10 >> 5) & 0x7;
1479 	if (pi != 0) {
1480 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1481 		return (1);
1482 	}
1483 
1484 	if (sc->nvstore.type == NVME_STOR_RAM) {
1485 		if (sc->nvstore.ctx)
1486 			free(sc->nvstore.ctx);
1487 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1488 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1489 	} else {
1490 		struct pci_nvme_ioreq *req;
1491 		int err;
1492 
1493 		req = pci_nvme_get_ioreq(sc);
1494 		if (req == NULL) {
1495 			pci_nvme_status_genc(&compl->status,
1496 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1497 			WPRINTF("%s: unable to allocate IO req", __func__);
1498 			return (1);
1499 		}
1500 		req->nvme_sq = &sc->submit_queues[0];
1501 		req->sqid = 0;
1502 		req->opc = command->opc;
1503 		req->cid = command->cid;
1504 		req->nsid = command->nsid;
1505 
1506 		req->io_req.br_offset = 0;
1507 		req->io_req.br_resid = sc->nvstore.size;
1508 		req->io_req.br_callback = pci_nvme_io_done;
1509 
1510 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1511 		if (err) {
1512 			pci_nvme_status_genc(&compl->status,
1513 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1514 			pci_nvme_release_ioreq(sc, req);
1515 		}
1516 	}
1517 
1518 	return (1);
1519 }
1520 
1521 static int
1522 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1523 	struct nvme_completion* compl)
1524 {
1525 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1526 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1527 
1528 	/* TODO: search for the command ID and abort it */
1529 
1530 	compl->cdw0 = 1;
1531 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1532 	return (1);
1533 }
1534 
1535 static int
1536 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1537 	struct nvme_command* command, struct nvme_completion* compl)
1538 {
1539 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1540 
1541 	/* Don't exceed the Async Event Request Limit (AERL). */
1542 	if (pci_nvme_aer_limit_reached(sc)) {
1543 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1544 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1545 		return (1);
1546 	}
1547 
1548 	if (pci_nvme_aer_add(sc, command->cid)) {
1549 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1550 				NVME_SC_INTERNAL_DEVICE_ERROR);
1551 		return (1);
1552 	}
1553 
1554 	/*
1555 	 * Raise events when they happen based on the Set Features cmd.
1556 	 * These events happen asynchronously, so only mark the completion as
1557 	 * successful if there is an event matching the request.
1558 	 */
1559 	compl->status = NVME_NO_STATUS;
1560 
1561 	return (0);
1562 }
1563 
1564 static void
1565 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1566 {
1567 	struct nvme_completion compl;
1568 	struct nvme_command *cmd;
1569 	struct nvme_submission_queue *sq;
1570 	struct nvme_completion_queue *cq;
1571 	uint16_t sqhead;
1572 
1573 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1574 
1575 	sq = &sc->submit_queues[0];
1576 	cq = &sc->compl_queues[0];
1577 
1578 	pthread_mutex_lock(&sq->mtx);
1579 
1580 	sqhead = sq->head;
1581 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1582 
1583 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1584 		cmd = &(sq->qbase)[sqhead];
1585 		compl.cdw0 = 0;
1586 		compl.status = 0;
1587 
1588 		switch (cmd->opc) {
1589 		case NVME_OPC_DELETE_IO_SQ:
1590 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1591 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1592 			break;
1593 		case NVME_OPC_CREATE_IO_SQ:
1594 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1595 			nvme_opc_create_io_sq(sc, cmd, &compl);
1596 			break;
1597 		case NVME_OPC_DELETE_IO_CQ:
1598 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1599 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1600 			break;
1601 		case NVME_OPC_CREATE_IO_CQ:
1602 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1603 			nvme_opc_create_io_cq(sc, cmd, &compl);
1604 			break;
1605 		case NVME_OPC_GET_LOG_PAGE:
1606 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1607 			nvme_opc_get_log_page(sc, cmd, &compl);
1608 			break;
1609 		case NVME_OPC_IDENTIFY:
1610 			DPRINTF("%s command IDENTIFY", __func__);
1611 			nvme_opc_identify(sc, cmd, &compl);
1612 			break;
1613 		case NVME_OPC_ABORT:
1614 			DPRINTF("%s command ABORT", __func__);
1615 			nvme_opc_abort(sc, cmd, &compl);
1616 			break;
1617 		case NVME_OPC_SET_FEATURES:
1618 			DPRINTF("%s command SET_FEATURES", __func__);
1619 			nvme_opc_set_features(sc, cmd, &compl);
1620 			break;
1621 		case NVME_OPC_GET_FEATURES:
1622 			DPRINTF("%s command GET_FEATURES", __func__);
1623 			nvme_opc_get_features(sc, cmd, &compl);
1624 			break;
1625 		case NVME_OPC_FIRMWARE_ACTIVATE:
1626 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1627 			pci_nvme_status_tc(&compl.status,
1628 			    NVME_SCT_COMMAND_SPECIFIC,
1629 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1630 			break;
1631 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1632 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1633 			nvme_opc_async_event_req(sc, cmd, &compl);
1634 			break;
1635 		case NVME_OPC_FORMAT_NVM:
1636 			DPRINTF("%s command FORMAT_NVM", __func__);
1637 			if ((sc->ctrldata.oacs &
1638 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1639 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1640 			}
1641 			compl.status = NVME_NO_STATUS;
1642 			nvme_opc_format_nvm(sc, cmd, &compl);
1643 			break;
1644 		default:
1645 			DPRINTF("0x%x command is not implemented",
1646 			    cmd->opc);
1647 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1648 		}
1649 		sqhead = (sqhead + 1) % sq->size;
1650 
1651 		if (NVME_COMPLETION_VALID(compl)) {
1652 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1653 			    compl.cdw0,
1654 			    cmd->cid,
1655 			    0,		/* SQID */
1656 			    compl.status);
1657 		}
1658 	}
1659 
1660 	DPRINTF("setting sqhead %u", sqhead);
1661 	sq->head = sqhead;
1662 
1663 	if (cq->head != cq->tail)
1664 		pci_generate_msix(sc->nsc_pi, 0);
1665 
1666 	pthread_mutex_unlock(&sq->mtx);
1667 }
1668 
1669 /*
1670  * Update the Write and Read statistics reported in SMART data
1671  *
1672  * NVMe defines a "data unit" as thousands of 512-byte blocks, rounded up.
1673  * E.g. 1 data unit is 1 - 1,000 512-byte blocks and 3 data units are 2,001 - 3,000
1674  * 512-byte blocks. Rounding up is achieved by initializing the remainder to 999.
1675  */
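/*
 * For example (illustrative numbers): a successful 4,096-byte write adds 8 to
 * write_dunits_remainder, and once the remainder reaches 1,000 the loop below
 * counts one more data unit and subtracts 1,000.
 */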
1676 static void
1677 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1678     size_t bytes, uint16_t status)
1679 {
1680 
1681 	pthread_mutex_lock(&sc->mtx);
1682 	switch (opc) {
1683 	case NVME_OPC_WRITE:
1684 		sc->write_commands++;
1685 		if (status != NVME_SC_SUCCESS)
1686 			break;
1687 		sc->write_dunits_remainder += (bytes / 512);
1688 		while (sc->write_dunits_remainder >= 1000) {
1689 			sc->write_data_units++;
1690 			sc->write_dunits_remainder -= 1000;
1691 		}
1692 		break;
1693 	case NVME_OPC_READ:
1694 		sc->read_commands++;
1695 		if (status != NVME_SC_SUCCESS)
1696 			break;
1697 		sc->read_dunits_remainder += (bytes / 512);
1698 		while (sc->read_dunits_remainder >= 1000) {
1699 			sc->read_data_units++;
1700 			sc->read_dunits_remainder -= 1000;
1701 		}
1702 		break;
1703 	default:
1704 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1705 		break;
1706 	}
1707 	pthread_mutex_unlock(&sc->mtx);
1708 }
1709 
1710 /*
1711  * Check if the combination of Starting LBA (slba) and Number of Logical
1712  * Blocks (nlb) exceeds the range of the underlying storage.
1713  *
1714  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1715  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1716  * overflow.
1717  */
1718 static bool
1719 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1720     uint32_t nlb)
1721 {
1722 	size_t	offset, bytes;
1723 
1724 	/* Overflow check of multiplying Starting LBA by the sector size */
1725 	if (slba >> (64 - nvstore->sectsz_bits))
1726 		return (true);
1727 
1728 	offset = slba << nvstore->sectsz_bits;
1729 	bytes = nlb << nvstore->sectsz_bits;
1730 
1731 	/* Overflow check of Number of Logical Blocks */
1732 	if ((nvstore->size - offset) < bytes)
1733 		return (true);
1734 
1735 	return (false);
1736 }
1737 
1738 static int
1739 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1740 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1741 {
1742 	int iovidx;
1743 
1744 	if (req == NULL)
1745 		return (-1);
1746 
1747 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1748 		return (-1);
1749 	}
1750 
1751 	/* concatenate contig block-iovs to minimize number of iovs */
1752 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1753 		iovidx = req->io_req.br_iovcnt - 1;
1754 
1755 		req->io_req.br_iov[iovidx].iov_base =
1756 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1757 				     req->prev_gpaddr, size);
1758 
1759 		req->prev_size += size;
1760 		req->io_req.br_resid += size;
1761 
1762 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1763 	} else {
1764 		iovidx = req->io_req.br_iovcnt;
1765 		if (iovidx == 0) {
1766 			req->io_req.br_offset = lba;
1767 			req->io_req.br_resid = 0;
1768 			req->io_req.br_param = req;
1769 		}
1770 
1771 		req->io_req.br_iov[iovidx].iov_base =
1772 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1773 				     gpaddr, size);
1774 
1775 		req->io_req.br_iov[iovidx].iov_len = size;
1776 
1777 		req->prev_gpaddr = gpaddr;
1778 		req->prev_size = size;
1779 		req->io_req.br_resid += size;
1780 
1781 		req->io_req.br_iovcnt++;
1782 	}
1783 
1784 	return (0);
1785 }
1786 
1787 static void
1788 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1789 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1790 	uint32_t cdw0, uint16_t status)
1791 {
1792 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1793 
1794 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1795 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1796 		 NVME_STATUS_GET_SC(status));
1797 
1798 	pci_nvme_cq_update(sc, cq,
1799 	    cdw0,
1800 	    cid,
1801 	    sqid,
1802 	    status);
1803 
1804 	if (cq->head != cq->tail) {
1805 		if (cq->intr_en & NVME_CQ_INTEN) {
1806 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1807 		} else {
1808 			DPRINTF("%s: CQ%u interrupt disabled",
1809 						__func__, sq->cqid);
1810 		}
1811 	}
1812 }
1813 
1814 static void
1815 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1816 {
1817 	req->sc = NULL;
1818 	req->nvme_sq = NULL;
1819 	req->sqid = 0;
1820 
1821 	pthread_mutex_lock(&sc->mtx);
1822 
1823 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1824 	sc->pending_ios--;
1825 
1826 	/* Once no more IO is pending, set Ready if the device is enabled but not yet Ready */
1827 	if (sc->pending_ios == 0 &&
1828 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1829 		sc->regs.csts |= NVME_CSTS_RDY;
1830 
1831 	pthread_mutex_unlock(&sc->mtx);
1832 
1833 	sem_post(&sc->iosemlock);
1834 }
1835 
1836 static struct pci_nvme_ioreq *
1837 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1838 {
1839 	struct pci_nvme_ioreq *req = NULL;
1840 
1841 	sem_wait(&sc->iosemlock);
1842 	pthread_mutex_lock(&sc->mtx);
1843 
1844 	req = STAILQ_FIRST(&sc->ioreqs_free);
1845 	assert(req != NULL);
1846 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1847 
1848 	req->sc = sc;
1849 
1850 	sc->pending_ios++;
1851 
1852 	pthread_mutex_unlock(&sc->mtx);
1853 
1854 	req->io_req.br_iovcnt = 0;
1855 	req->io_req.br_offset = 0;
1856 	req->io_req.br_resid = 0;
1857 	req->io_req.br_param = req;
1858 	req->prev_gpaddr = 0;
1859 	req->prev_size = 0;
1860 
1861 	return req;
1862 }
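
/*
 * Concurrency note (added annotation): iosemlock is initialized to ioslots
 * in pci_nvme_init() below, so pci_nvme_get_ioreq() blocks once all slots
 * are in flight; because pci_nvme_release_ioreq() returns a request to
 * ioreqs_free before posting the semaphore, the assert on the free list
 * above cannot fire.
 */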
1863 
1864 static void
1865 pci_nvme_io_done(struct blockif_req *br, int err)
1866 {
1867 	struct pci_nvme_ioreq *req = br->br_param;
1868 	struct nvme_submission_queue *sq = req->nvme_sq;
1869 	uint16_t code, status = 0;
1870 
1871 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1872 
1873 	/* TODO return correct error */
1874 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1875 	pci_nvme_status_genc(&status, code);
1876 
1877 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1878 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1879 	    req->bytes, status);
1880 	pci_nvme_release_ioreq(req->sc, req);
1881 }
1882 
1883 /*
1884  * Implements the Flush command. The description of the Volatile Write
1885  * Cache (VWC) field in the Identify Controller data states:
1886  *    If a volatile write cache is not present, Flush commands complete
1887  *    successfully and have no effect
1888  * Therefore, set status to Success if the command is not supported, i.e.
1889  * the backing store is RAM or the blockif reports flush as unsupported.
1890  */
1891 static bool
1892 nvme_opc_flush(struct pci_nvme_softc *sc,
1893     struct nvme_command *cmd,
1894     struct pci_nvme_blockstore *nvstore,
1895     struct pci_nvme_ioreq *req,
1896     uint16_t *status)
1897 {
1898 	bool pending = false;
1899 
1900 	if (nvstore->type == NVME_STOR_RAM) {
1901 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1902 	} else {
1903 		int err;
1904 
1905 		req->io_req.br_callback = pci_nvme_io_done;
1906 
1907 		err = blockif_flush(nvstore->ctx, &req->io_req);
1908 		switch (err) {
1909 		case 0:
1910 			pending = true;
1911 			break;
1912 		case EOPNOTSUPP:
1913 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1914 			break;
1915 		default:
1916 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1917 		}
1918 	}
1919 
1920 	return (pending);
1921 }
1922 
1923 static uint16_t
1924 nvme_write_read_ram(struct pci_nvme_softc *sc,
1925     struct pci_nvme_blockstore *nvstore,
1926     uint64_t prp1, uint64_t prp2,
1927     size_t offset, uint64_t bytes,
1928     bool is_write)
1929 {
1930 	uint8_t *buf = nvstore->ctx;
1931 	enum nvme_copy_dir dir;
1932 	uint16_t status = 0;
1933 
1934 	if (is_write)
1935 		dir = NVME_COPY_TO_PRP;
1936 	else
1937 		dir = NVME_COPY_FROM_PRP;
1938 
1939 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1940 	    buf + offset, bytes, dir))
1941 		pci_nvme_status_genc(&status,
1942 		    NVME_SC_DATA_TRANSFER_ERROR);
1943 	else
1944 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1945 
1946 	return (status);
1947 }
1948 
1949 static uint16_t
1950 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1951     struct pci_nvme_blockstore *nvstore,
1952     struct pci_nvme_ioreq *req,
1953     uint64_t prp1, uint64_t prp2,
1954     size_t offset, uint64_t bytes,
1955     bool is_write)
1956 {
1957 	uint64_t size;
1958 	int err;
1959 	uint16_t status = NVME_NO_STATUS;
1960 
1961 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1962 	if (pci_nvme_append_iov_req(sc, req, prp1,
1963 	    size, is_write, offset)) {
1964 		pci_nvme_status_genc(&status,
1965 		    NVME_SC_DATA_TRANSFER_ERROR);
1966 		goto out;
1967 	}
1968 
1969 	offset += size;
1970 	bytes  -= size;
1971 
1972 	if (bytes == 0) {
1973 		;
1974 	} else if (bytes <= PAGE_SIZE) {
1975 		size = bytes;
1976 		if (pci_nvme_append_iov_req(sc, req, prp2,
1977 		    size, is_write, offset)) {
1978 			pci_nvme_status_genc(&status,
1979 			    NVME_SC_DATA_TRANSFER_ERROR);
1980 			goto out;
1981 		}
1982 	} else {
1983 		void *vmctx = sc->nsc_pi->pi_vmctx;
1984 		uint64_t *prp_list = &prp2;
1985 		uint64_t *last = prp_list;
1986 
1987 		/* PRP2 is pointer to a physical region page list */
1988 		while (bytes) {
1989 			/* Last entry in list points to the next list */
1990 			if (prp_list == last) {
1991 				uint64_t prp = *prp_list;
1992 
1993 				prp_list = paddr_guest2host(vmctx, prp,
1994 				    PAGE_SIZE - (prp % PAGE_SIZE));
1995 				last = prp_list + (NVME_PRP2_ITEMS - 1);
1996 			}
1997 
1998 			size = MIN(bytes, PAGE_SIZE);
1999 
2000 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2001 			    size, is_write, offset)) {
2002 				pci_nvme_status_genc(&status,
2003 				    NVME_SC_DATA_TRANSFER_ERROR);
2004 				goto out;
2005 			}
2006 
2007 			offset += size;
2008 			bytes  -= size;
2009 
2010 			prp_list++;
2011 		}
2012 	}
2013 	req->io_req.br_callback = pci_nvme_io_done;
2014 	if (is_write)
2015 		err = blockif_write(nvstore->ctx, &req->io_req);
2016 	else
2017 		err = blockif_read(nvstore->ctx, &req->io_req);
2018 
2019 	if (err)
2020 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2021 out:
2022 	return (status);
2023 }
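
/*
 * PRP walk example (illustrative annotation): a 12 KiB, page-aligned read
 * with prp1 == 0x200000 first appends a 4 KiB iov for PRP1. The remaining
 * 8 KiB exceeds PAGE_SIZE, so prp2 is treated as the guest-physical address
 * of a PRP list and the next two list entries supply the last two pages.
 * Had only 8 KiB been requested, the 4 KiB left after PRP1 would have been
 * mapped directly from PRP2. An unaligned prp1 such as 0x200200 shrinks the
 * first chunk to PAGE_SIZE - 0x200 bytes.
 */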
2024 
2025 static bool
2026 nvme_opc_write_read(struct pci_nvme_softc *sc,
2027     struct nvme_command *cmd,
2028     struct pci_nvme_blockstore *nvstore,
2029     struct pci_nvme_ioreq *req,
2030     uint16_t *status)
2031 {
2032 	uint64_t lba, nblocks, bytes = 0;
2033 	size_t offset;
2034 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2035 	bool pending = false;
2036 
2037 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2038 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2039 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2040 		WPRINTF("%s command would exceed LBA range", __func__);
2041 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2042 		goto out;
2043 	}
2044 
2045 	bytes  = nblocks << nvstore->sectsz_bits;
2046 	if (bytes > NVME_MAX_DATA_SIZE) {
2047 		WPRINTF("%s command would exceed MDTS", __func__);
2048 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2049 		goto out;
2050 	}
2051 
2052 	offset = lba << nvstore->sectsz_bits;
2053 
2054 	req->bytes = bytes;
2055 	req->io_req.br_offset = lba;
2056 
2057 	/* PRP bits 1:0 must be zero */
2058 	cmd->prp1 &= ~0x3UL;
2059 	cmd->prp2 &= ~0x3UL;
2060 
2061 	if (nvstore->type == NVME_STOR_RAM) {
2062 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2063 		    cmd->prp2, offset, bytes, is_write);
2064 	} else {
2065 		*status = nvme_write_read_blockif(sc, nvstore, req,
2066 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2067 
2068 		if (*status == NVME_NO_STATUS)
2069 			pending = true;
2070 	}
2071 out:
2072 	if (!pending)
2073 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2074 
2075 	return (pending);
2076 }
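
/*
 * Command decode example (illustrative annotation): for a Read with
 * cdw10 == 0x1000, cdw11 == 0 and cdw12 == 0x0007, the starting LBA is
 * 0x1000 and the zero-based NLB field yields 8 logical blocks. With
 * 512 byte sectors this gives bytes == 8 << 9 == 4096 and
 * offset == 0x1000 << 9 == 0x200000.
 */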
2077 
2078 static void
2079 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2080 {
2081 	struct pci_nvme_ioreq *req = br->br_param;
2082 	struct pci_nvme_softc *sc = req->sc;
2083 	bool done = true;
2084 #ifdef __FreeBSD__
2085 	uint16_t status;
2086 #else
2087 	uint16_t status = 0;
2088 #endif
2089 
2090 	if (err) {
2091 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2092 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2093 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2094 	} else {
2095 		struct iovec *iov = req->io_req.br_iov;
2096 
2097 		req->prev_gpaddr++;
2098 		iov += req->prev_gpaddr;
2099 
2100 		/* The iov_* values already include the sector size */
2101 		req->io_req.br_offset = (off_t)iov->iov_base;
2102 		req->io_req.br_resid = iov->iov_len;
2103 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2104 			pci_nvme_status_genc(&status,
2105 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2106 		} else
2107 			done = false;
2108 	}
2109 
2110 	if (done) {
2111 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2112 		    req->cid, 0, status);
2113 		pci_nvme_release_ioreq(sc, req);
2114 	}
2115 }
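
/*
 * Example walk of the state machine above (illustrative annotation): for
 * three non-zero ranges, nvme_opc_dataset_mgmt() issues the delete for
 * iov[0] and sets prev_size to 3. The first completion sees
 * prev_gpaddr == 0 and advances to iov[1]; the second advances to iov[2];
 * the third finds prev_gpaddr + 1 == prev_size and posts the final
 * completion.
 */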
2116 
2117 static bool
2118 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2119     struct nvme_command *cmd,
2120     struct pci_nvme_blockstore *nvstore,
2121     struct pci_nvme_ioreq *req,
2122     uint16_t *status)
2123 {
2124 	struct nvme_dsm_range *range = NULL;
2125 	uint32_t nr, r, non_zero, dr;
2126 	int err;
2127 	bool pending = false;
2128 
2129 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2130 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2131 		goto out;
2132 	}
2133 
2134 	nr = cmd->cdw10 & 0xff;
2135 
2136 	/* copy locally because a range entry could straddle PRPs */
2137 	range = calloc(1, NVME_MAX_DSM_TRIM);
2138 	if (range == NULL) {
2139 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2140 		goto out;
2141 	}
2142 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2143 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2144 
2145 	/* Check for invalid ranges and the number of non-zero lengths */
2146 	non_zero = 0;
2147 	for (r = 0; r <= nr; r++) {
2148 		if (pci_nvme_out_of_range(nvstore,
2149 		    range[r].starting_lba, range[r].length)) {
2150 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2151 			goto out;
2152 		}
2153 		if (range[r].length != 0)
2154 			non_zero++;
2155 	}
2156 
2157 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2158 		size_t offset, bytes;
2159 		int sectsz_bits = sc->nvstore.sectsz_bits;
2160 
2161 		/*
2162 		 * DSM calls are advisory only, and compliant controllers
2163 		 * may choose to take no actions (i.e. return Success).
2164 		 */
2165 		if (!nvstore->deallocate) {
2166 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2167 			goto out;
2168 		}
2169 
2170 		/* If all ranges have a zero length, return Success */
2171 		if (non_zero == 0) {
2172 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2173 			goto out;
2174 		}
2175 
2176 		if (req == NULL) {
2177 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2178 			goto out;
2179 		}
2180 
2181 		offset = range[0].starting_lba << sectsz_bits;
2182 		bytes = range[0].length << sectsz_bits;
2183 
2184 		/*
2185 		 * If the request is for more than a single range, store
2186 		 * the ranges in the br_iov. Optimize for the common case
2187 		 * of a single range.
2188 		 *
2189 		 * Note that NVMe Number of Ranges is a zero based value
2190 		 */
2191 		req->io_req.br_iovcnt = 0;
2192 		req->io_req.br_offset = offset;
2193 		req->io_req.br_resid = bytes;
2194 
2195 		if (nr == 0) {
2196 			req->io_req.br_callback = pci_nvme_io_done;
2197 		} else {
2198 			struct iovec *iov = req->io_req.br_iov;
2199 
2200 			for (r = 0, dr = 0; r <= nr; r++) {
2201 				offset = range[r].starting_lba << sectsz_bits;
2202 				bytes = range[r].length << sectsz_bits;
2203 				if (bytes == 0)
2204 					continue;
2205 
2206 				if ((nvstore->size - offset) < bytes) {
2207 					pci_nvme_status_genc(status,
2208 					    NVME_SC_LBA_OUT_OF_RANGE);
2209 					goto out;
2210 				}
2211 				iov[dr].iov_base = (void *)offset;
2212 				iov[dr].iov_len = bytes;
2213 				dr++;
2214 			}
2215 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2216 
2217 			/*
2218 			 * Use prev_gpaddr to track the current entry and
2219 			 * prev_size to track the number of entries
2220 			 */
2221 			req->prev_gpaddr = 0;
2222 			req->prev_size = dr;
2223 		}
2224 
2225 		err = blockif_delete(nvstore->ctx, &req->io_req);
2226 		if (err)
2227 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2228 		else
2229 			pending = true;
2230 	}
2231 out:
2232 	free(range);
2233 	return (pending);
2234 }
2235 
2236 static void
2237 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2238 {
2239 	struct nvme_submission_queue *sq;
2240 	uint16_t status = 0;
2241 	uint16_t sqhead;
2242 
2243 	/* handle all submissions up to sq->tail index */
2244 	sq = &sc->submit_queues[idx];
2245 
2246 	pthread_mutex_lock(&sq->mtx);
2247 
2248 	sqhead = sq->head;
2249 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2250 	         idx, sqhead, sq->tail, sq->qbase);
2251 
2252 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2253 		struct nvme_command *cmd;
2254 		struct pci_nvme_ioreq *req;
2255 		uint32_t nsid;
2256 		bool pending;
2257 
2258 		pending = false;
2259 		req = NULL;
2260 		status = 0;
2261 
2262 		cmd = &sq->qbase[sqhead];
2263 		sqhead = (sqhead + 1) % sq->size;
2264 
2265 		nsid = le32toh(cmd->nsid);
2266 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2267 			pci_nvme_status_genc(&status,
2268 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2269 			status |=
2270 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2271 			goto complete;
2272 		}
2273 
2274 		req = pci_nvme_get_ioreq(sc);
2275 		if (req == NULL) {
2276 			pci_nvme_status_genc(&status,
2277 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2278 			WPRINTF("%s: unable to allocate IO req", __func__);
2279 			goto complete;
2280 		}
2281 		req->nvme_sq = sq;
2282 		req->sqid = idx;
2283 		req->opc = cmd->opc;
2284 		req->cid = cmd->cid;
2285 		req->nsid = cmd->nsid;
2286 
2287 		switch (cmd->opc) {
2288 		case NVME_OPC_FLUSH:
2289 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2290 			    req, &status);
2291 			break;
2292 		case NVME_OPC_WRITE:
2293 		case NVME_OPC_READ:
2294 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2295 			    req, &status);
2296 			break;
2297 		case NVME_OPC_WRITE_ZEROES:
2298 			/* TODO: write zeroes
2299 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2300 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2301 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2302 			break;
2303 		case NVME_OPC_DATASET_MANAGEMENT:
2304 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2305 			    req, &status);
2306 			break;
2307 		default:
2308 			WPRINTF("%s unhandled io command 0x%x",
2309 			    __func__, cmd->opc);
2310 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2311 		}
2312 complete:
2313 		if (!pending) {
2314 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2315 			    status);
2316 			if (req != NULL)
2317 				pci_nvme_release_ioreq(sc, req);
2318 		}
2319 	}
2320 
2321 	sq->head = sqhead;
2322 
2323 	pthread_mutex_unlock(&sq->mtx);
2324 }
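
/*
 * Ring consumption example (illustrative annotation): with sq->size == 4,
 * head == 3 and a guest tail doorbell of 1, the loop handles the entries
 * at indexes 3 and 0 before head catches up with tail; the modulo keeps
 * sqhead wrapping within the queue.
 */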
2325 
2326 static void
2327 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2328 	uint64_t idx, int is_sq, uint64_t value)
2329 {
2330 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2331 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2332 
2333 	if (is_sq) {
2334 		if (idx > sc->num_squeues) {
2335 			WPRINTF("%s queue index %lu overflow from "
2336 			         "guest (max %u)",
2337 			         __func__, idx, sc->num_squeues);
2338 			return;
2339 		}
2340 
2341 		atomic_store_short(&sc->submit_queues[idx].tail,
2342 		                   (uint16_t)value);
2343 
2344 		if (idx == 0) {
2345 			pci_nvme_handle_admin_cmd(sc, value);
2346 		} else {
2347 			/* submission queue; handle new entries in SQ */
2348 			if (idx > sc->num_squeues) {
2349 				WPRINTF("%s SQ index %lu overflow from "
2350 				         "guest (max %u)",
2351 				         __func__, idx, sc->num_squeues);
2352 				return;
2353 			}
2354 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2355 		}
2356 	} else {
2357 		if (idx > sc->num_cqueues) {
2358 			WPRINTF("%s queue index %lu overflow from "
2359 			         "guest (max %u)",
2360 			         __func__, idx, sc->num_cqueues);
2361 			return;
2362 		}
2363 
2364 		atomic_store_short(&sc->compl_queues[idx].head,
2365 				(uint16_t)value);
2366 	}
2367 }
2368 
2369 static void
2370 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2371 {
2372 	const char *s = iswrite ? "WRITE" : "READ";
2373 
2374 	switch (offset) {
2375 	case NVME_CR_CAP_LOW:
2376 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2377 		break;
2378 	case NVME_CR_CAP_HI:
2379 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2380 		break;
2381 	case NVME_CR_VS:
2382 		DPRINTF("%s %s NVME_CR_VS", func, s);
2383 		break;
2384 	case NVME_CR_INTMS:
2385 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2386 		break;
2387 	case NVME_CR_INTMC:
2388 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2389 		break;
2390 	case NVME_CR_CC:
2391 		DPRINTF("%s %s NVME_CR_CC", func, s);
2392 		break;
2393 	case NVME_CR_CSTS:
2394 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2395 		break;
2396 	case NVME_CR_NSSR:
2397 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2398 		break;
2399 	case NVME_CR_AQA:
2400 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2401 		break;
2402 	case NVME_CR_ASQ_LOW:
2403 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2404 		break;
2405 	case NVME_CR_ASQ_HI:
2406 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2407 		break;
2408 	case NVME_CR_ACQ_LOW:
2409 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2410 		break;
2411 	case NVME_CR_ACQ_HI:
2412 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2413 		break;
2414 	default:
2415 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2416 	}
2417 
2418 }
2419 
2420 static void
2421 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2422 	uint64_t offset, int size, uint64_t value)
2423 {
2424 	uint32_t ccreg;
2425 
2426 	if (offset >= NVME_DOORBELL_OFFSET) {
2427 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2428 		uint64_t idx = belloffset / 8; /* doorbell size = 2*int */
2429 		int is_sq = (belloffset % 8) < 4;
2430 
2431 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2432 			WPRINTF("guest attempted an overflow write offset "
2433 			         "0x%lx, val 0x%lx in %s",
2434 			         offset, value, __func__);
2435 			return;
2436 		}
2437 
2438 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2439 		return;
2440 	}
2441 
2442 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2443 	        offset, size, value);
2444 
2445 	if (size != 4) {
2446 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2447 		         "val 0x%lx) to bar0 in %s",
2448 		         size, offset, value, __func__);
2449 		/* TODO: shutdown device */
2450 		return;
2451 	}
2452 
2453 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2454 
2455 	pthread_mutex_lock(&sc->mtx);
2456 
2457 	switch (offset) {
2458 	case NVME_CR_CAP_LOW:
2459 	case NVME_CR_CAP_HI:
2460 		/* readonly */
2461 		break;
2462 	case NVME_CR_VS:
2463 		/* readonly */
2464 		break;
2465 	case NVME_CR_INTMS:
2466 		/* MSI-X, so ignore */
2467 		break;
2468 	case NVME_CR_INTMC:
2469 		/* MSI-X, so ignore */
2470 		break;
2471 	case NVME_CR_CC:
2472 		ccreg = (uint32_t)value;
2473 
2474 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2475 		         "iocqes %u",
2476 		        __func__,
2477 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2478 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2479 			 NVME_CC_GET_IOCQES(ccreg));
2480 
2481 		if (NVME_CC_GET_SHN(ccreg)) {
2482 			/* perform shutdown - flush out data to backend */
2483 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2484 			    NVME_CSTS_REG_SHST_SHIFT);
2485 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2486 			    NVME_CSTS_REG_SHST_SHIFT;
2487 		}
2488 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2489 			if (NVME_CC_GET_EN(ccreg) == 0)
2490 				/* transition 1->0 causes controller reset */
2491 				pci_nvme_reset_locked(sc);
2492 			else
2493 				pci_nvme_init_controller(ctx, sc);
2494 		}
2495 
2496 		/* Insert the iocqes, iosqes and en bits from the write */
2497 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2498 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2499 		if (NVME_CC_GET_EN(ccreg) == 0) {
2500 			/* Insert the ams, mps and css bit fields */
2501 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2502 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2503 			sc->regs.csts &= ~NVME_CSTS_RDY;
2504 		} else if (sc->pending_ios == 0) {
2505 			sc->regs.csts |= NVME_CSTS_RDY;
2506 		}
2507 		break;
2508 	case NVME_CR_CSTS:
2509 		break;
2510 	case NVME_CR_NSSR:
2511 		/* ignore writes; don't support subsystem reset */
2512 		break;
2513 	case NVME_CR_AQA:
2514 		sc->regs.aqa = (uint32_t)value;
2515 		break;
2516 	case NVME_CR_ASQ_LOW:
2517 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2518 		               (0xFFFFF000 & value);
2519 		break;
2520 	case NVME_CR_ASQ_HI:
2521 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2522 		               (value << 32);
2523 		break;
2524 	case NVME_CR_ACQ_LOW:
2525 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2526 		               (0xFFFFF000 & value);
2527 		break;
2528 	case NVME_CR_ACQ_HI:
2529 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2530 		               (value << 32);
2531 		break;
2532 	default:
2533 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2534 		         __func__, offset, value, size);
2535 	}
2536 	pthread_mutex_unlock(&sc->mtx);
2537 }
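
/*
 * Doorbell decode example (illustrative annotation, assuming the 4 byte
 * doorbell stride implied by the divide-by-8 above): a write to offset
 * NVME_DOORBELL_OFFSET + 0x8 gives belloffset 0x8, so idx == 1 and is_sq
 * is true (SQ1 tail), while NVME_DOORBELL_OFFSET + 0xC decodes to idx == 1
 * with is_sq false (CQ1 head). NVME_DOORBELL_OFFSET itself is the admin SQ
 * tail doorbell.
 */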
2538 
2539 static void
2540 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2541                 int baridx, uint64_t offset, int size, uint64_t value)
2542 {
2543 	struct pci_nvme_softc* sc = pi->pi_arg;
2544 
2545 	if (baridx == pci_msix_table_bar(pi) ||
2546 	    baridx == pci_msix_pba_bar(pi)) {
2547 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2548 		         " value 0x%lx", baridx, offset, size, value);
2549 
2550 		pci_emul_msix_twrite(pi, offset, size, value);
2551 		return;
2552 	}
2553 
2554 	switch (baridx) {
2555 	case 0:
2556 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2557 		break;
2558 
2559 	default:
2560 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2561 		         __func__, baridx, value);
2562 	}
2563 }
2564 
2565 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2566 	uint64_t offset, int size)
2567 {
2568 	uint64_t value;
2569 
2570 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2571 
2572 	if (offset < NVME_DOORBELL_OFFSET) {
2573 		void *p = &(sc->regs);
2574 		pthread_mutex_lock(&sc->mtx);
2575 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2576 		pthread_mutex_unlock(&sc->mtx);
2577 	} else {
2578 		value = 0;
2579 		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2580 	}
2581 
2582 	switch (size) {
2583 	case 1:
2584 		value &= 0xFF;
2585 		break;
2586 	case 2:
2587 		value &= 0xFFFF;
2588 		break;
2589 	case 4:
2590 		value &= 0xFFFFFFFF;
2591 		break;
2592 	}
2593 
2594 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2595 	         offset, size, (uint32_t)value);
2596 
2597 	return (value);
2598 }
2599 
2600 
2601 
2602 static uint64_t
2603 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2604     uint64_t offset, int size)
2605 {
2606 	struct pci_nvme_softc* sc = pi->pi_arg;
2607 
2608 	if (baridx == pci_msix_table_bar(pi) ||
2609 	    baridx == pci_msix_pba_bar(pi)) {
2610 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2611 		        baridx, offset, size);
2612 
2613 		return pci_emul_msix_tread(pi, offset, size);
2614 	}
2615 
2616 	switch (baridx) {
2617 	case 0:
2618 		return pci_nvme_read_bar_0(sc, offset, size);
2619 
2620 	default:
2621 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2622 	}
2623 
2624 	return (0);
2625 }
2626 
2627 
2628 static int
2629 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2630 {
2631 	char bident[sizeof("XX:X:X")];
2632 	char	*uopt, *xopts, *config;
2633 	uint32_t sectsz;
2634 	int optidx;
2635 
2636 	sc->max_queues = NVME_QUEUES;
2637 	sc->max_qentries = NVME_MAX_QENTRIES;
2638 	sc->ioslots = NVME_IOSLOTS;
2639 	sc->num_squeues = sc->max_queues;
2640 	sc->num_cqueues = sc->max_queues;
2641 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2642 	sectsz = 0;
2643 
2644 	uopt = strdup(opts);
2645 	optidx = 0;
2646 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2647 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2648 	for (xopts = strtok(uopt, ",");
2649 	     xopts != NULL;
2650 	     xopts = strtok(NULL, ",")) {
2651 
2652 		if ((config = strchr(xopts, '=')) != NULL)
2653 			*config++ = '\0';
2654 
2655 		if (!strcmp("maxq", xopts)) {
2656 			sc->max_queues = atoi(config);
2657 		} else if (!strcmp("qsz", xopts)) {
2658 			sc->max_qentries = atoi(config);
2659 		} else if (!strcmp("ioslots", xopts)) {
2660 			sc->ioslots = atoi(config);
2661 		} else if (!strcmp("sectsz", xopts)) {
2662 			sectsz = atoi(config);
2663 		} else if (!strcmp("ser", xopts)) {
2664 			/*
2665 			 * This field indicates the Product Serial Number in
2666 			 * 7-bit ASCII, unused bytes should be space characters.
2667 			 * Ref: NVMe v1.3c.
2668 			 */
2669 			cpywithpad((char *)sc->ctrldata.sn,
2670 			           sizeof(sc->ctrldata.sn), config, ' ');
2671 		} else if (!strcmp("ram", xopts)) {
2672 			uint64_t sz = strtoull(config, NULL, 10);
2673 
2674 			sc->nvstore.type = NVME_STOR_RAM;
2675 			sc->nvstore.size = sz * 1024 * 1024;
2676 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2677 			sc->nvstore.sectsz = 4096;
2678 			sc->nvstore.sectsz_bits = 12;
2679 			if (sc->nvstore.ctx == NULL) {
2680 				perror("Unable to allocate RAM");
2681 				free(uopt);
2682 				return (-1);
2683 			}
2684 		} else if (!strcmp("eui64", xopts)) {
2685 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2686 		} else if (!strcmp("dsm", xopts)) {
2687 			if (!strcmp("auto", config))
2688 				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2689 			else if (!strcmp("enable", config))
2690 				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2691 			else if (!strcmp("disable", config))
2692 				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2693 		} else if (optidx == 0) {
2694 			snprintf(bident, sizeof(bident), "%d:%d",
2695 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2696 			sc->nvstore.ctx = blockif_open(xopts, bident);
2697 			if (sc->nvstore.ctx == NULL) {
2698 				perror("Could not open backing file");
2699 				free(uopt);
2700 				return (-1);
2701 			}
2702 			sc->nvstore.type = NVME_STOR_BLOCKIF;
2703 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2704 		} else {
2705 			EPRINTLN("Invalid option %s", xopts);
2706 			free(uopt);
2707 			return (-1);
2708 		}
2709 
2710 		optidx++;
2711 	}
2712 	free(uopt);
2713 
2714 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2715 		EPRINTLN("backing store not specified");
2716 		return (-1);
2717 	}
2718 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2719 		sc->nvstore.sectsz = sectsz;
2720 	else if (sc->nvstore.type != NVME_STOR_RAM)
2721 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2722 	for (sc->nvstore.sectsz_bits = 9;
2723 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2724 	     sc->nvstore.sectsz_bits++);
2725 
2726 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2727 		sc->max_queues = NVME_QUEUES;
2728 
2729 	if (sc->max_qentries <= 0) {
2730 		EPRINTLN("Invalid qsz option");
2731 		return (-1);
2732 	}
2733 	if (sc->ioslots <= 0) {
2734 		EPRINTLN("Invalid ioslots option");
2735 		return (-1);
2736 	}
2737 
2738 	return (0);
2739 }
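
/*
 * Example of the sector size handling above (illustrative annotation):
 * sectsz=4096 on the command line leaves the loop at sectsz_bits == 12,
 * since 1 << 12 is no longer less than 4096; a 512 byte sector size stops
 * at 9.
 */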
2740 
2741 static int
2742 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2743 {
2744 	struct pci_nvme_softc *sc;
2745 	uint32_t pci_membar_sz;
2746 	int	error;
2747 
2748 	error = 0;
2749 
2750 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2751 	pi->pi_arg = sc;
2752 	sc->nsc_pi = pi;
2753 
2754 	error = pci_nvme_parse_opts(sc, opts);
2755 	if (error < 0)
2756 		goto done;
2757 	else
2758 		error = 0;
2759 
2760 	STAILQ_INIT(&sc->ioreqs_free);
2761 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2762 	for (int i = 0; i < sc->ioslots; i++) {
2763 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2764 	}
2765 
2766 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2767 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2768 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2769 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2770 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2771 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2772 
2773 	/*
2774 	 * Allocate size of NVMe registers + doorbell space for all queues.
2775 	 *
2776 	 * The specification requires a minimum memory I/O window size of 16K.
2777 	 * The Windows driver will refuse to start a device with a smaller
2778 	 * window.
2779 	 */
2780 	pci_membar_sz = sizeof(struct nvme_registers) +
2781 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2782 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2783 
2784 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2785 
2786 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2787 	if (error) {
2788 		WPRINTF("%s pci alloc mem bar failed", __func__);
2789 		goto done;
2790 	}
2791 
2792 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2793 	if (error) {
2794 		WPRINTF("%s pci add msixcap failed", __func__);
2795 		goto done;
2796 	}
2797 
2798 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2799 	if (error) {
2800 		WPRINTF("%s pci add Express capability failed", __func__);
2801 		goto done;
2802 	}
2803 
2804 	pthread_mutex_init(&sc->mtx, NULL);
2805 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2806 
2807 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2808 	/*
2809 	 * Controller data depends on Namespace data so initialize Namespace
2810 	 * data first.
2811 	 */
2812 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2813 	pci_nvme_init_ctrldata(sc);
2814 	pci_nvme_init_logpages(sc);
2815 	pci_nvme_init_features(sc);
2816 
2817 	pci_nvme_aer_init(sc);
2818 
2819 	pci_nvme_reset(sc);
2820 
2821 	pci_lintr_request(pi);
2822 
2823 done:
2824 	return (error);
2825 }
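
/*
 * BAR sizing example (illustrative annotation): with the default
 * max_queues of 16, the doorbell area adds 2 * sizeof(uint32_t) * 17 ==
 * 136 bytes to the register block, and the MAX() against
 * NVME_MMIO_SPACE_MIN then rounds the BAR up to the 16 KiB minimum noted
 * in the comment above.
 */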
2826 
2827 
2828 struct pci_devemu pci_de_nvme = {
2829 	.pe_emu =	"nvme",
2830 	.pe_init =	pci_nvme_init,
2831 	.pe_barwrite =	pci_nvme_write,
2832 	.pe_barread =	pci_nvme_read
2833 };
2834 PCI_EMUL_SET(pci_de_nvme);
2835