xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 963f5dc7a30624e95d72fb7f87b8892651164e46)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
/* Set to a non-zero value to enable the DPRINTF() debug output. */
static int nvme_debug = 0;
/*
 * Debug print, emitted only when nvme_debug is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: an "else" that follows an unbraced
 * "if (x) DPRINTF(...);" then binds to the caller's "if" instead of the
 * "if" hidden inside the macro.
 */
#define	DPRINTF(fmt, args...) do {					\
	if (nvme_debug)							\
		PRINTLN(fmt, ##args);					\
} while (0)
/* Warning print, always emitted. */
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4
#define	NVME_IOSLOTS		8
#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes: 2^(12 + MPSMIN) */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

/* Number of 64-bit entries that fit in one page */
#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))

/* Exponent of the maximum data transfer size, in units of MPSMIN pages */
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
/* Largest transfer in bytes: 2^MDTS pages of NVME_MPSMIN_BYTES each */
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
/* True when completion (c) carries a status other than NVME_NO_STATUS */
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/*
 * Encode number of SQ's and CQ's for Set/Get Features
 *
 * The zero-based SQ count occupies bits 15:0 and the zero-based CQ count
 * bits 31:16. The expansion is fully parenthesised and carries no trailing
 * semicolon, so it can be used inside a larger expression.
 */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

/* Byte offset of the doorbell area within struct nvme_registers */
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
132 
/*
 * Byte offsets of the NVMe controller registers. The *_LOW / *_HI pairs
 * are the two 32-bit halves of a register at consecutive offsets.
 */
enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	/* note: nothing is defined at offset 0x18 */
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
148 
/* Bit masks applied to command dword 11 */
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,		/* bit 0 */
	NVME_CMD_CDW11_IEN = 0x0002,		/* bit 1 */
	NVME_CMD_CDW11_IV  = 0xFFFF0000,	/* bits 31:16 */
};
154 
/* Direction of a copy between a host buffer and guest PRP memory */
enum nvme_copy_dir {
	NVME_COPY_TO_PRP = 0,	/* host buffer -> guest memory */
	NVME_COPY_FROM_PRP = 1,	/* guest memory -> host buffer */
};
159 
/* Flag values for nvme_completion_queue.intr_en */
#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

/* Per-queue state of one completion queue */
struct nvme_completion_queue {
	struct nvme_completion *qbase;	/* host mapping of the queue entries */
	pthread_mutex_t	mtx;
	uint32_t	size;		/* number of entries */
	uint16_t	tail;		/* nvme progress */
	uint16_t	head;		/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;	/* NVME_CQ_* flags */
};
172 
/* Per-queue state of one submission queue */
struct nvme_submission_queue {
	struct nvme_command *qbase;	/* host mapping of the queue entries */
	pthread_mutex_t	mtx;
	uint32_t	size;		/* number of entries */
	uint16_t	head;		/* nvme progress */
	uint16_t	tail;		/* guest progress */
	uint16_t	cqid;		/* completion queue id */
	int		qpriority;
};
182 
/* Kind of backing storage behind the namespace */
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

/* Description of the backing store */
struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;		/* backend handle */
	uint64_t	size;		/* capacity; divided by sectsz to get LBAs */
	uint32_t	sectsz;		/* sector size */
	uint32_t	sectsz_bits;	/* log2 of the sector size (used as LBADS) */
	uint64_t	eui64;		/* 0 means "generate one" */
	uint32_t	deallocate:1;	/* backing store can delete/deallocate */
};
197 
198 /*
199  * Calculate the number of additional page descriptors for guest IO requests
200  * based on the advertised Max Data Transfer (MDTS) and given the number of
201  * default iovec's in a struct blockif_req.
202  */
203 #define MDTS_PAD_SIZE \
204 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
205 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
206 	  0 )
207 
208 struct pci_nvme_ioreq {
209 	struct pci_nvme_softc *sc;
210 	STAILQ_ENTRY(pci_nvme_ioreq) link;
211 	struct nvme_submission_queue *nvme_sq;
212 	uint16_t	sqid;
213 
214 	/* command information */
215 	uint16_t	opc;
216 	uint16_t	cid;
217 	uint32_t	nsid;
218 
219 	uint64_t	prev_gpaddr;
220 	size_t		prev_size;
221 	size_t		bytes;
222 
223 	struct blockif_req io_req;
224 
225 	struct iovec	iovpadding[MDTS_PAD_SIZE];
226 };
227 
/* How the Dataset Management bit in ONCS is derived */
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO = 0,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE = 1,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE = 2,
};
236 
struct pci_nvme_softc;
struct nvme_feature_obj;

/* Signature shared by the Set Features and Get Features handlers */
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);

/* Per-feature state and handlers; one entry per Feature Identifier */
struct nvme_feature_obj {
	uint32_t	cdw11;			/* stored feature value */
	nvme_feature_cb	set;			/* Set Features handler */
	nvme_feature_cb	get;			/* Get Features handler */
	bool		namespace_specific;
};

/* Number of entries in the feature table (highest handled FID + 1) */
#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
253 
/* Asynchronous event types; also the index into pci_nvme_softc.aen[] */
typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART = 1,
	PCI_NVME_AE_TYPE_NOTICE = 2,
	/* values 3 through 5 are not defined here */
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;
262 
/* Asynchronous Event Request: one queued entry per outstanding AER */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;	/* linkage on the AER list */
	uint16_t	cid;	/* Command ID of the submitted AER */
};
268 
/* Event information values carried by a NOTICE type asynchronous event */
typedef enum {
	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
	PCI_NVME_AE_INFO_FW_ACTIVATION = 1,
	PCI_NVME_AE_INFO_TELEMETRY_CHANGE = 2,
	PCI_NVME_AE_INFO_ANA_CHANGE = 3,
	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE = 4,
	PCI_NVME_AE_INFO_LBA_STATUS_ALERT = 5,
	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE = 6,
	PCI_NVME_AE_INFO_MAX,	/* count of the values above */
} pci_nvme_async_info;
279 
280 /* Asynchronous Event Notifications */
281 struct pci_nvme_aen {
282 	pci_nvme_async_type atype;
283 	uint32_t	event_data;
284 	bool		posted;
285 };
286 
287 struct pci_nvme_softc {
288 	struct pci_devinst *nsc_pi;
289 
290 	pthread_mutex_t	mtx;
291 
292 	struct nvme_registers regs;
293 
294 	struct nvme_namespace_data  nsdata;
295 	struct nvme_controller_data ctrldata;
296 	struct nvme_error_information_entry err_log;
297 	struct nvme_health_information_page health_log;
298 	struct nvme_firmware_page fw_log;
299 	struct nvme_ns_list ns_log;
300 
301 	struct pci_nvme_blockstore nvstore;
302 
303 	uint16_t	max_qentries;	/* max entries per queue */
304 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
305 	uint32_t	num_cqueues;
306 	uint32_t	num_squeues;
307 	bool		num_q_is_set; /* Has host set Number of Queues */
308 
309 	struct pci_nvme_ioreq *ioreqs;
310 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
311 	uint32_t	pending_ios;
312 	uint32_t	ioslots;
313 	sem_t		iosemlock;
314 
315 	/*
316 	 * Memory mapped Submission and Completion queues
317 	 * Each array includes both Admin and IO queues
318 	 */
319 	struct nvme_completion_queue *compl_queues;
320 	struct nvme_submission_queue *submit_queues;
321 
322 	struct nvme_feature_obj feat[NVME_FID_MAX];
323 
324 	enum nvme_dsm_type dataset_management;
325 
326 	/* Accounting for SMART data */
327 	__uint128_t	read_data_units;
328 	__uint128_t	write_data_units;
329 	__uint128_t	read_commands;
330 	__uint128_t	write_commands;
331 	uint32_t	read_dunits_remainder;
332 	uint32_t	write_dunits_remainder;
333 
334 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
335 	pthread_mutex_t	aer_mtx;
336 	uint32_t	aer_count;
337 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
338 	pthread_t	aen_tid;
339 	pthread_mutex_t	aen_mtx;
340 	pthread_cond_t	aen_cond;
341 };
342 
343 
/* Forward declarations */
static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid,
    uint16_t sqid, uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *,
    struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
353 
/*
 * Controller Configuration utils
 *
 * Each getter shifts the field down to bit 0 and then applies the field's
 * (unshifted) mask.
 */
#define	NVME_CC_GET_EN(cc) \
	(((cc) >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	(((cc) >> NVME_CC_REG_CSS_SHIFT) & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	(((cc) >> NVME_CC_REG_SHN_SHIFT) & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	(((cc) >> NVME_CC_REG_IOSQES_SHIFT) & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	(((cc) >> NVME_CC_REG_IOCQES_SHIFT) & NVME_CC_REG_IOCQES_MASK)

/* In-place mask of the EN, IOSQES and IOCQES fields */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/* In-place mask of the CSS, MPS and AMS fields */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	(((sts) >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)

/* CSTS.RDY as an in-place bit */
#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
/* In-place mask covering the status code type and status code fields */
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/* In-place Dataset Management bit of the ONCS field */
#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
390 
/* Feature handlers; each matches the nvme_feature_cb signature */
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *, struct nvme_command *,
    struct nvme_completion *);

/* Entry point of the Asynchronous Event Notification thread */
static void *aen_thr(void *arg);
405 
/*
 * Fill dst (dst_size bytes) with the pad character, then copy in at most
 * dst_size bytes of the string src. The result is not NUL terminated when
 * src is dst_size characters or longer.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	const char *nul;
	size_t ncopy;

	/* Length of src, bounded by the size of the destination */
	nul = memchr(src, '\0', dst_size);
	ncopy = (nul != NULL) ? (size_t)(nul - src) : dst_size;

	memset(dst, pad, dst_size);
	memcpy(dst, src, ncopy);
}
415 
416 static __inline void
417 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
418 {
419 
420 	*status &= ~NVME_STATUS_MASK;
421 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
422 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
423 }
424 
425 static __inline void
426 pci_nvme_status_genc(uint16_t *status, uint16_t code)
427 {
428 
429 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
430 }
431 
432 /*
433  * Initialize the requested number or IO Submission and Completion Queues.
434  * Admin queues are allocated implicitly.
435  */
436 static void
437 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
438 {
439 	uint32_t i;
440 
441 	/*
442 	 * Allocate and initialize the Submission Queues
443 	 */
444 	if (nsq > NVME_QUEUES) {
445 		WPRINTF("%s: clamping number of SQ from %u to %u",
446 					__func__, nsq, NVME_QUEUES);
447 		nsq = NVME_QUEUES;
448 	}
449 
450 	sc->num_squeues = nsq;
451 
452 	sc->submit_queues = calloc(sc->num_squeues + 1,
453 				sizeof(struct nvme_submission_queue));
454 	if (sc->submit_queues == NULL) {
455 		WPRINTF("%s: SQ allocation failed", __func__);
456 		sc->num_squeues = 0;
457 	} else {
458 		struct nvme_submission_queue *sq = sc->submit_queues;
459 
460 		for (i = 0; i < sc->num_squeues; i++)
461 			pthread_mutex_init(&sq[i].mtx, NULL);
462 	}
463 
464 	/*
465 	 * Allocate and initialize the Completion Queues
466 	 */
467 	if (ncq > NVME_QUEUES) {
468 		WPRINTF("%s: clamping number of CQ from %u to %u",
469 					__func__, ncq, NVME_QUEUES);
470 		ncq = NVME_QUEUES;
471 	}
472 
473 	sc->num_cqueues = ncq;
474 
475 	sc->compl_queues = calloc(sc->num_cqueues + 1,
476 				sizeof(struct nvme_completion_queue));
477 	if (sc->compl_queues == NULL) {
478 		WPRINTF("%s: CQ allocation failed", __func__);
479 		sc->num_cqueues = 0;
480 	} else {
481 		struct nvme_completion_queue *cq = sc->compl_queues;
482 
483 		for (i = 0; i < sc->num_cqueues; i++)
484 			pthread_mutex_init(&cq[i].mtx, NULL);
485 	}
486 }
487 
488 static void
489 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
490 {
491 	struct nvme_controller_data *cd = &sc->ctrldata;
492 
493 	cd->vid = 0xFB5D;
494 	cd->ssvid = 0x0000;
495 
496 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
497 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
498 
499 	/* Num of submission commands that we can handle at a time (2^rab) */
500 	cd->rab   = 4;
501 
502 	/* FreeBSD OUI */
503 	cd->ieee[0] = 0x58;
504 	cd->ieee[1] = 0x9c;
505 	cd->ieee[2] = 0xfc;
506 
507 	cd->mic = 0;
508 
509 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
510 
511 	cd->ver = 0x00010300;
512 
513 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
514 	cd->acl = 2;
515 	cd->aerl = 4;
516 
517 	/* Advertise 1, Read-only firmware slot */
518 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
519 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
520 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
521 	cd->elpe = 0;	/* max error log page entries */
522 	cd->npss = 1;	/* number of power states support */
523 
524 	/* Warning Composite Temperature Threshold */
525 	cd->wctemp = 0x0157;
526 
527 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
528 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
529 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
530 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
531 	cd->nn = 1;	/* number of namespaces */
532 
533 	cd->oncs = 0;
534 	switch (sc->dataset_management) {
535 	case NVME_DATASET_MANAGEMENT_AUTO:
536 		if (sc->nvstore.deallocate)
537 			cd->oncs |= NVME_ONCS_DSM;
538 		break;
539 	case NVME_DATASET_MANAGEMENT_ENABLE:
540 		cd->oncs |= NVME_ONCS_DSM;
541 		break;
542 	default:
543 		break;
544 	}
545 
546 	cd->fna = 0x03;
547 
548 	cd->power_state[0].mp = 10;
549 }
550 
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 *
 * crc is the running value (pass 0 to start); the updated value is
 * returned, so a buffer can be processed in several calls.
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};
	const unsigned char *bytes = buffer;
	unsigned int pos;

	for (pos = 0; pos < len; pos++) {
		/* Low byte of the running CRC mixed with the next input byte */
		unsigned int slot = (crc ^ bytes[pos]) & 0xffU;

		crc = (uint16_t)(((crc >> 8) & 0xffU) ^ crc16_table[slot]);
	}
	return (crc);
}
600 
601 static void
602 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
603     struct nvme_namespace_data *nd)
604 {
605 
606 	/* Get capacity and block size information from backing store */
607 	nd->nsze = nvstore->size / nvstore->sectsz;
608 	nd->ncap = nd->nsze;
609 	nd->nuse = nd->nsze;
610 }
611 
612 static void
613 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
614     struct nvme_namespace_data *nd, uint32_t nsid,
615     struct pci_nvme_blockstore *nvstore)
616 {
617 
618 	pci_nvme_init_nsdata_size(nvstore, nd);
619 
620 	if (nvstore->type == NVME_STOR_BLOCKIF)
621 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
622 
623 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
624 	nd->flbas = 0;
625 
626 	/* Create an EUI-64 if user did not provide one */
627 	if (nvstore->eui64 == 0) {
628 		char *data = NULL;
629 		uint64_t eui64 = nvstore->eui64;
630 
631 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
632 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
633 		    sc->nsc_pi->pi_func);
634 
635 		if (data != NULL) {
636 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
637 			free(data);
638 		}
639 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
640 	}
641 	be64enc(nd->eui64, nvstore->eui64);
642 
643 	/* LBA data-sz = 2^lbads */
644 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
645 }
646 
647 static void
648 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
649 {
650 
651 	memset(&sc->err_log, 0, sizeof(sc->err_log));
652 	memset(&sc->health_log, 0, sizeof(sc->health_log));
653 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
654 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
655 
656 	/* Set read/write remainder to round up according to spec */
657 	sc->read_dunits_remainder = 999;
658 	sc->write_dunits_remainder = 999;
659 
660 	/* Set nominal Health values checked by implementations */
661 	sc->health_log.temperature = 310;
662 	sc->health_log.available_spare = 100;
663 	sc->health_log.available_spare_threshold = 10;
664 }
665 
666 static void
667 pci_nvme_init_features(struct pci_nvme_softc *sc)
668 {
669 
670 	sc->feat[0].set = nvme_feature_invalid_cb;
671 	sc->feat[0].get = nvme_feature_invalid_cb;
672 
673 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
674 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
675 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
676 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
677 	    nvme_feature_iv_config;
678 	/* Enable all AENs by default */
679 	sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
680 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
681 	    nvme_feature_invalid_cb;
682 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
683 	    nvme_feature_invalid_cb;
684 }
685 
686 static void
687 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
688 {
689 
690 	STAILQ_INIT(&sc->aer_list);
691 	sc->aer_count = 0;
692 }
693 
694 static void
695 pci_nvme_aer_init(struct pci_nvme_softc *sc)
696 {
697 
698 	pthread_mutex_init(&sc->aer_mtx, NULL);
699 	pci_nvme_aer_reset(sc);
700 }
701 
702 static void
703 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
704 {
705 	struct pci_nvme_aer *aer = NULL;
706 
707 	pthread_mutex_lock(&sc->aer_mtx);
708 	while (!STAILQ_EMPTY(&sc->aer_list)) {
709 		aer = STAILQ_FIRST(&sc->aer_list);
710 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
711 		free(aer);
712 	}
713 	pthread_mutex_unlock(&sc->aer_mtx);
714 
715 	pci_nvme_aer_reset(sc);
716 }
717 
718 static bool
719 pci_nvme_aer_available(struct pci_nvme_softc *sc)
720 {
721 
722 	return (sc->aer_count != 0);
723 }
724 
725 static bool
726 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
727 {
728 	struct nvme_controller_data *cd = &sc->ctrldata;
729 
730 	/* AERL is a zero based value while aer_count is one's based */
731 	return (sc->aer_count == (cd->aerl + 1));
732 }
733 
734 /*
735  * Add an Async Event Request
736  *
737  * Stores an AER to be returned later if the Controller needs to notify the
738  * host of an event.
739  * Note that while the NVMe spec doesn't require Controllers to return AER's
740  * in order, this implementation does preserve the order.
741  */
742 static int
743 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
744 {
745 	struct pci_nvme_aer *aer = NULL;
746 
747 	if (pci_nvme_aer_limit_reached(sc))
748 		return (-1);
749 
750 	aer = calloc(1, sizeof(struct pci_nvme_aer));
751 	if (aer == NULL)
752 		return (-1);
753 
754 	/* Save the Command ID for use in the completion message */
755 	aer->cid = cid;
756 
757 	pthread_mutex_lock(&sc->aer_mtx);
758 	sc->aer_count++;
759 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
760 	pthread_mutex_unlock(&sc->aer_mtx);
761 
762 	return (0);
763 }
764 
765 /*
766  * Get an Async Event Request structure
767  *
768  * Returns a pointer to an AER previously submitted by the host or NULL if
769  * no AER's exist. Caller is responsible for freeing the returned struct.
770  */
771 static struct pci_nvme_aer *
772 pci_nvme_aer_get(struct pci_nvme_softc *sc)
773 {
774 	struct pci_nvme_aer *aer = NULL;
775 
776 	pthread_mutex_lock(&sc->aer_mtx);
777 	aer = STAILQ_FIRST(&sc->aer_list);
778 	if (aer != NULL) {
779 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
780 		sc->aer_count--;
781 	}
782 	pthread_mutex_unlock(&sc->aer_mtx);
783 
784 	return (aer);
785 }
786 
787 static void
788 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
789 {
790 	uint32_t	atype;
791 
792 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
793 
794 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
795 		sc->aen[atype].atype = atype;
796 	}
797 }
798 
799 static void
800 pci_nvme_aen_init(struct pci_nvme_softc *sc)
801 {
802 	char nstr[80];
803 
804 	pci_nvme_aen_reset(sc);
805 
806 	pthread_mutex_init(&sc->aen_mtx, NULL);
807 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
808 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
809 	    sc->nsc_pi->pi_func);
810 	pthread_set_name_np(sc->aen_tid, nstr);
811 }
812 
/*
 * Discard all posted notifications by resetting the AEN slots. The AEN
 * thread and its synchronization objects are left untouched.
 */
static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{
	pci_nvme_aen_reset(sc);
}
819 
820 /* Notify the AEN thread of pending work */
821 static void
822 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
823 {
824 
825 	pthread_cond_signal(&sc->aen_cond);
826 }
827 
828 /*
829  * Post an Asynchronous Event Notification
830  */
831 static int32_t
832 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
833 		uint32_t event_data)
834 {
835 	struct pci_nvme_aen *aen;
836 
837 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
838 		return(EINVAL);
839 	}
840 
841 	pthread_mutex_lock(&sc->aen_mtx);
842 	aen = &sc->aen[atype];
843 
844 	/* Has the controller already posted an event of this type? */
845 	if (aen->posted) {
846 		pthread_mutex_unlock(&sc->aen_mtx);
847 		return(EALREADY);
848 	}
849 
850 	aen->event_data = event_data;
851 	aen->posted = true;
852 	pthread_mutex_unlock(&sc->aen_mtx);
853 
854 	pci_nvme_aen_notify(sc);
855 
856 	return(0);
857 }
858 
859 static void
860 pci_nvme_aen_process(struct pci_nvme_softc *sc)
861 {
862 	struct pci_nvme_aer *aer;
863 	struct pci_nvme_aen *aen;
864 	pci_nvme_async_type atype;
865 	uint32_t mask;
866 	uint16_t status;
867 	uint8_t lid;
868 
869 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
870 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
871 		aen = &sc->aen[atype];
872 		/* Previous iterations may have depleted the available AER's */
873 		if (!pci_nvme_aer_available(sc)) {
874 			DPRINTF("%s: no AER", __func__);
875 			break;
876 		}
877 
878 		if (!aen->posted) {
879 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
880 			continue;
881 		}
882 
883 		status = NVME_SC_SUCCESS;
884 
885 		/* Is the event masked? */
886 		mask =
887 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
888 
889 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
890 		switch (atype) {
891 		case PCI_NVME_AE_TYPE_ERROR:
892 			lid = NVME_LOG_ERROR;
893 			break;
894 		case PCI_NVME_AE_TYPE_SMART:
895 			mask &= 0xff;
896 			if ((mask & aen->event_data) == 0)
897 				continue;
898 			lid = NVME_LOG_HEALTH_INFORMATION;
899 			break;
900 		case PCI_NVME_AE_TYPE_NOTICE:
901 			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
902 				EPRINTLN("%s unknown AEN notice type %u",
903 				    __func__, aen->event_data);
904 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
905 				break;
906 			}
907 			mask >>= 8;
908 			if (((1 << aen->event_data) & mask) == 0)
909 				continue;
910 			switch (aen->event_data) {
911 			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
912 				lid = NVME_LOG_CHANGED_NAMESPACE;
913 				break;
914 			case PCI_NVME_AE_INFO_FW_ACTIVATION:
915 				lid = NVME_LOG_FIRMWARE_SLOT;
916 				break;
917 			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
918 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
919 				break;
920 			case PCI_NVME_AE_INFO_ANA_CHANGE:
921 				lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
922 				break;
923 			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
924 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
925 				break;
926 			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
927 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
928 				break;
929 			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
930 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
931 				break;
932 			default:
933 				lid = 0;
934 			}
935 			break;
936 		default:
937 			/* bad type?!? */
938 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
939 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
940 			break;
941 		}
942 
943 		aer = pci_nvme_aer_get(sc);
944 		assert(aer != NULL);
945 
946 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
947 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
948 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
949 		    aer->cid,
950 		    0,		/* SQID */
951 		    status);
952 
953 		aen->event_data = 0;
954 		aen->posted = false;
955 
956 		pci_generate_msix(sc->nsc_pi, 0);
957 	}
958 }
959 
960 static void *
961 aen_thr(void *arg)
962 {
963 	struct pci_nvme_softc *sc;
964 
965 	sc = arg;
966 
967 	pthread_mutex_lock(&sc->aen_mtx);
968 	for (;;) {
969 		pci_nvme_aen_process(sc);
970 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
971 	}
972 	pthread_mutex_unlock(&sc->aen_mtx);
973 
974 	pthread_exit(NULL);
975 	return (NULL);
976 }
977 
978 static void
979 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
980 {
981 	uint32_t i;
982 
983 	DPRINTF("%s", __func__);
984 
985 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
986 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
987 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
988 
989 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
990 
991 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
992 
993 	sc->regs.cc = 0;
994 	sc->regs.csts = 0;
995 
996 	assert(sc->submit_queues != NULL);
997 
998 	for (i = 0; i < sc->num_squeues + 1; i++) {
999 		sc->submit_queues[i].qbase = NULL;
1000 		sc->submit_queues[i].size = 0;
1001 		sc->submit_queues[i].cqid = 0;
1002 		sc->submit_queues[i].tail = 0;
1003 		sc->submit_queues[i].head = 0;
1004 	}
1005 
1006 	assert(sc->compl_queues != NULL);
1007 
1008 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1009 		sc->compl_queues[i].qbase = NULL;
1010 		sc->compl_queues[i].size = 0;
1011 		sc->compl_queues[i].tail = 0;
1012 		sc->compl_queues[i].head = 0;
1013 	}
1014 
1015 	sc->num_q_is_set = false;
1016 
1017 	pci_nvme_aer_destroy(sc);
1018 	pci_nvme_aen_destroy(sc);
1019 }
1020 
1021 static void
1022 pci_nvme_reset(struct pci_nvme_softc *sc)
1023 {
1024 	pthread_mutex_lock(&sc->mtx);
1025 	pci_nvme_reset_locked(sc);
1026 	pthread_mutex_unlock(&sc->mtx);
1027 }
1028 
1029 static void
1030 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1031 {
1032 	uint16_t acqs, asqs;
1033 
1034 	DPRINTF("%s", __func__);
1035 
1036 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1037 	sc->submit_queues[0].size = asqs;
1038 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1039 	            sizeof(struct nvme_command) * asqs);
1040 
1041 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1042 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1043 
1044 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1045 	    NVME_AQA_REG_ACQS_MASK) + 1;
1046 	sc->compl_queues[0].size = acqs;
1047 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1048 	         sizeof(struct nvme_completion) * acqs);
1049 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1050 
1051 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1052 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1053 }
1054 
1055 static int
1056 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1057 	size_t len, enum nvme_copy_dir dir)
1058 {
1059 	uint8_t *p;
1060 	size_t bytes;
1061 
1062 	if (len > (8 * 1024)) {
1063 		return (-1);
1064 	}
1065 
1066 	/* Copy from the start of prp1 to the end of the physical page */
1067 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1068 	bytes = MIN(bytes, len);
1069 
1070 	p = vm_map_gpa(ctx, prp1, bytes);
1071 	if (p == NULL) {
1072 		return (-1);
1073 	}
1074 
1075 	if (dir == NVME_COPY_TO_PRP)
1076 		memcpy(p, b, bytes);
1077 	else
1078 		memcpy(b, p, bytes);
1079 
1080 	b += bytes;
1081 
1082 	len -= bytes;
1083 	if (len == 0) {
1084 		return (0);
1085 	}
1086 
1087 	len = MIN(len, PAGE_SIZE);
1088 
1089 	p = vm_map_gpa(ctx, prp2, len);
1090 	if (p == NULL) {
1091 		return (-1);
1092 	}
1093 
1094 	if (dir == NVME_COPY_TO_PRP)
1095 		memcpy(p, b, len);
1096 	else
1097 		memcpy(b, p, len);
1098 
1099 	return (0);
1100 }
1101 
1102 /*
1103  * Write a Completion Queue Entry update
1104  *
1105  * Write the completion and update the doorbell value
1106  */
1107 static void
1108 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1109 		struct nvme_completion_queue *cq,
1110 		uint32_t cdw0,
1111 		uint16_t cid,
1112 		uint16_t sqid,
1113 		uint16_t status)
1114 {
1115 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1116 	struct nvme_completion *cqe;
1117 
1118 	assert(cq->qbase != NULL);
1119 
1120 	pthread_mutex_lock(&cq->mtx);
1121 
1122 	cqe = &cq->qbase[cq->tail];
1123 
1124 	/* Flip the phase bit */
1125 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1126 
1127 	cqe->cdw0 = cdw0;
1128 	cqe->sqhd = sq->head;
1129 	cqe->sqid = sqid;
1130 	cqe->cid = cid;
1131 	cqe->status = status;
1132 
1133 	cq->tail++;
1134 	if (cq->tail >= cq->size) {
1135 		cq->tail = 0;
1136 	}
1137 
1138 	pthread_mutex_unlock(&cq->mtx);
1139 }
1140 
1141 static int
1142 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1143 	struct nvme_completion* compl)
1144 {
1145 	uint16_t qid = command->cdw10 & 0xffff;
1146 
1147 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1148 	if (qid == 0 || qid > sc->num_squeues ||
1149 	    (sc->submit_queues[qid].qbase == NULL)) {
1150 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1151 		        __func__, qid, sc->num_squeues);
1152 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1153 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1154 		return (1);
1155 	}
1156 
1157 	sc->submit_queues[qid].qbase = NULL;
1158 	sc->submit_queues[qid].cqid = 0;
1159 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1160 	return (1);
1161 }
1162 
1163 static int
1164 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1165 	struct nvme_completion* compl)
1166 {
1167 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1168 		uint16_t qid = command->cdw10 & 0xffff;
1169 		struct nvme_submission_queue *nsq;
1170 
1171 		if ((qid == 0) || (qid > sc->num_squeues) ||
1172 		    (sc->submit_queues[qid].qbase != NULL)) {
1173 			WPRINTF("%s queue index %u > num_squeues %u",
1174 			        __func__, qid, sc->num_squeues);
1175 			pci_nvme_status_tc(&compl->status,
1176 			    NVME_SCT_COMMAND_SPECIFIC,
1177 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1178 			return (1);
1179 		}
1180 
1181 		nsq = &sc->submit_queues[qid];
1182 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1183 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1184 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1185 			/*
1186 			 * Queues must specify at least two entries
1187 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1188 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1189 			 */
1190 			pci_nvme_status_tc(&compl->status,
1191 			    NVME_SCT_COMMAND_SPECIFIC,
1192 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1193 			return (1);
1194 		}
1195 		nsq->head = nsq->tail = 0;
1196 
1197 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1198 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1199 			pci_nvme_status_tc(&compl->status,
1200 			    NVME_SCT_COMMAND_SPECIFIC,
1201 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1202 			return (1);
1203 		}
1204 
1205 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1206 			pci_nvme_status_tc(&compl->status,
1207 			    NVME_SCT_COMMAND_SPECIFIC,
1208 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1209 			return (1);
1210 		}
1211 
1212 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1213 
1214 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1215 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1216 
1217 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1218 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1219 
1220 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1221 
1222 		DPRINTF("%s completed creating IOSQ qid %u",
1223 		         __func__, qid);
1224 	} else {
1225 		/*
1226 		 * Guest sent non-cont submission queue request.
1227 		 * This setting is unsupported by this emulation.
1228 		 */
1229 		WPRINTF("%s unsupported non-contig (list-based) "
1230 		         "create i/o submission queue", __func__);
1231 
1232 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1233 	}
1234 	return (1);
1235 }
1236 
1237 static int
1238 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1239 	struct nvme_completion* compl)
1240 {
1241 	uint16_t qid = command->cdw10 & 0xffff;
1242 	uint16_t sqid;
1243 
1244 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1245 	if (qid == 0 || qid > sc->num_cqueues ||
1246 	    (sc->compl_queues[qid].qbase == NULL)) {
1247 		WPRINTF("%s queue index %u / num_cqueues %u",
1248 		        __func__, qid, sc->num_cqueues);
1249 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1250 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1251 		return (1);
1252 	}
1253 
1254 	/* Deleting an Active CQ is an error */
1255 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1256 		if (sc->submit_queues[sqid].cqid == qid) {
1257 			pci_nvme_status_tc(&compl->status,
1258 			    NVME_SCT_COMMAND_SPECIFIC,
1259 			    NVME_SC_INVALID_QUEUE_DELETION);
1260 			return (1);
1261 		}
1262 
1263 	sc->compl_queues[qid].qbase = NULL;
1264 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1265 	return (1);
1266 }
1267 
1268 static int
1269 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1270 	struct nvme_completion* compl)
1271 {
1272 	struct nvme_completion_queue *ncq;
1273 	uint16_t qid = command->cdw10 & 0xffff;
1274 
1275 	/* Only support Physically Contiguous queues */
1276 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1277 		WPRINTF("%s unsupported non-contig (list-based) "
1278 		         "create i/o completion queue",
1279 		         __func__);
1280 
1281 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1282 		return (1);
1283 	}
1284 
1285 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1286 	    (sc->compl_queues[qid].qbase != NULL)) {
1287 		WPRINTF("%s queue index %u > num_cqueues %u",
1288 			__func__, qid, sc->num_cqueues);
1289 		pci_nvme_status_tc(&compl->status,
1290 		    NVME_SCT_COMMAND_SPECIFIC,
1291 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1292 		return (1);
1293  	}
1294 
1295 	ncq = &sc->compl_queues[qid];
1296 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1297 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1298 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1299 		pci_nvme_status_tc(&compl->status,
1300 		    NVME_SCT_COMMAND_SPECIFIC,
1301 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1302 		return (1);
1303 	}
1304 
1305 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1306 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1307 		/*
1308 		 * Queues must specify at least two entries
1309 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1310 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1311 		 */
1312 		pci_nvme_status_tc(&compl->status,
1313 		    NVME_SCT_COMMAND_SPECIFIC,
1314 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1315 		return (1);
1316 	}
1317 	ncq->head = ncq->tail = 0;
1318 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1319 		     command->prp1,
1320 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1321 
1322 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1323 
1324 
1325 	return (1);
1326 }
1327 
1328 static int
1329 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1330 	struct nvme_completion* compl)
1331 {
1332 	uint32_t logsize;
1333 	uint8_t logpage = command->cdw10 & 0xFF;
1334 
1335 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1336 
1337 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1338 
1339 	/*
1340 	 * Command specifies the number of dwords to return in fields NUMDU
1341 	 * and NUMDL. This is a zero-based value.
1342 	 */
1343 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1344 	logsize *= sizeof(uint32_t);
1345 
1346 	switch (logpage) {
1347 	case NVME_LOG_ERROR:
1348 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1349 		    command->prp2, (uint8_t *)&sc->err_log,
1350 		    MIN(logsize, sizeof(sc->err_log)),
1351 		    NVME_COPY_TO_PRP);
1352 		break;
1353 	case NVME_LOG_HEALTH_INFORMATION:
1354 		pthread_mutex_lock(&sc->mtx);
1355 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1356 		    sizeof(sc->health_log.data_units_read));
1357 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1358 		    sizeof(sc->health_log.data_units_written));
1359 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1360 		    sizeof(sc->health_log.host_read_commands));
1361 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1362 		    sizeof(sc->health_log.host_write_commands));
1363 		pthread_mutex_unlock(&sc->mtx);
1364 
1365 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1366 		    command->prp2, (uint8_t *)&sc->health_log,
1367 		    MIN(logsize, sizeof(sc->health_log)),
1368 		    NVME_COPY_TO_PRP);
1369 		break;
1370 	case NVME_LOG_FIRMWARE_SLOT:
1371 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1372 		    command->prp2, (uint8_t *)&sc->fw_log,
1373 		    MIN(logsize, sizeof(sc->fw_log)),
1374 		    NVME_COPY_TO_PRP);
1375 		break;
1376 	case NVME_LOG_CHANGED_NAMESPACE:
1377 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1378 		    command->prp2, (uint8_t *)&sc->ns_log,
1379 		    MIN(logsize, sizeof(sc->ns_log)),
1380 		    NVME_COPY_TO_PRP);
1381 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1382 		break;
1383 	default:
1384 		DPRINTF("%s get log page %x command not supported",
1385 		        __func__, logpage);
1386 
1387 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1388 		    NVME_SC_INVALID_LOG_PAGE);
1389 	}
1390 
1391 	return (1);
1392 }
1393 
1394 static int
1395 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1396 	struct nvme_completion* compl)
1397 {
1398 	void *dest;
1399 	uint16_t status;
1400 
1401 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1402 	        command->cdw10 & 0xFF, command->nsid);
1403 
1404 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1405 
1406 	switch (command->cdw10 & 0xFF) {
1407 	case 0x00: /* return Identify Namespace data structure */
1408 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1409 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1410 		    NVME_COPY_TO_PRP);
1411 		break;
1412 	case 0x01: /* return Identify Controller data structure */
1413 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1414 		    command->prp2, (uint8_t *)&sc->ctrldata,
1415 		    sizeof(sc->ctrldata),
1416 		    NVME_COPY_TO_PRP);
1417 		break;
1418 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1419 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1420 		                  sizeof(uint32_t) * 1024);
1421 		/* All unused entries shall be zero */
1422 		bzero(dest, sizeof(uint32_t) * 1024);
1423 		((uint32_t *)dest)[0] = 1;
1424 		break;
1425 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1426 		if (command->nsid != 1) {
1427 			pci_nvme_status_genc(&status,
1428 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1429 			break;
1430 		}
1431 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1432 		                  sizeof(uint32_t) * 1024);
1433 		/* All bytes after the descriptor shall be zero */
1434 		bzero(dest, sizeof(uint32_t) * 1024);
1435 
1436 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1437 		((uint8_t *)dest)[0] = 1;
1438 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1439 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1440 		break;
1441 	default:
1442 		DPRINTF("%s unsupported identify command requested 0x%x",
1443 		         __func__, command->cdw10 & 0xFF);
1444 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1445 		break;
1446 	}
1447 
1448 	compl->status = status;
1449 	return (1);
1450 }
1451 
1452 static const char *
1453 nvme_fid_to_name(uint8_t fid)
1454 {
1455 	const char *name;
1456 
1457 	switch (fid) {
1458 	case NVME_FEAT_ARBITRATION:
1459 		name = "Arbitration";
1460 		break;
1461 	case NVME_FEAT_POWER_MANAGEMENT:
1462 		name = "Power Management";
1463 		break;
1464 	case NVME_FEAT_LBA_RANGE_TYPE:
1465 		name = "LBA Range Type";
1466 		break;
1467 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1468 		name = "Temperature Threshold";
1469 		break;
1470 	case NVME_FEAT_ERROR_RECOVERY:
1471 		name = "Error Recovery";
1472 		break;
1473 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1474 		name = "Volatile Write Cache";
1475 		break;
1476 	case NVME_FEAT_NUMBER_OF_QUEUES:
1477 		name = "Number of Queues";
1478 		break;
1479 	case NVME_FEAT_INTERRUPT_COALESCING:
1480 		name = "Interrupt Coalescing";
1481 		break;
1482 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1483 		name = "Interrupt Vector Configuration";
1484 		break;
1485 	case NVME_FEAT_WRITE_ATOMICITY:
1486 		name = "Write Atomicity Normal";
1487 		break;
1488 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1489 		name = "Asynchronous Event Configuration";
1490 		break;
1491 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1492 		name = "Autonomous Power State Transition";
1493 		break;
1494 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1495 		name = "Host Memory Buffer";
1496 		break;
1497 	case NVME_FEAT_TIMESTAMP:
1498 		name = "Timestamp";
1499 		break;
1500 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1501 		name = "Keep Alive Timer";
1502 		break;
1503 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1504 		name = "Host Controlled Thermal Management";
1505 		break;
1506 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1507 		name = "Non-Operation Power State Config";
1508 		break;
1509 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1510 		name = "Read Recovery Level Config";
1511 		break;
1512 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1513 		name = "Predictable Latency Mode Config";
1514 		break;
1515 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1516 		name = "Predictable Latency Mode Window";
1517 		break;
1518 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1519 		name = "LBA Status Information Report Interval";
1520 		break;
1521 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1522 		name = "Host Behavior Support";
1523 		break;
1524 	case NVME_FEAT_SANITIZE_CONFIG:
1525 		name = "Sanitize Config";
1526 		break;
1527 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1528 		name = "Endurance Group Event Configuration";
1529 		break;
1530 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1531 		name = "Software Progress Marker";
1532 		break;
1533 	case NVME_FEAT_HOST_IDENTIFIER:
1534 		name = "Host Identifier";
1535 		break;
1536 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1537 		name = "Reservation Notification Mask";
1538 		break;
1539 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1540 		name = "Reservation Persistence";
1541 		break;
1542 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1543 		name = "Namespace Write Protection Config";
1544 		break;
1545 	default:
1546 		name = "Unknown";
1547 		break;
1548 	}
1549 
1550 	return (name);
1551 }
1552 
1553 static void
1554 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1555     struct nvme_feature_obj *feat,
1556     struct nvme_command *command,
1557     struct nvme_completion *compl)
1558 {
1559 
1560 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1561 }
1562 
1563 static void
1564 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1565     struct nvme_feature_obj *feat,
1566     struct nvme_command *command,
1567     struct nvme_completion *compl)
1568 {
1569 	uint32_t i;
1570 	uint32_t cdw11 = command->cdw11;
1571 	uint16_t iv;
1572 	bool cd;
1573 
1574 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1575 
1576 	iv = cdw11 & 0xffff;
1577 	cd = cdw11 & (1 << 16);
1578 
1579 	if (iv > (sc->max_queues + 1)) {
1580 		return;
1581 	}
1582 
1583 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1584 	if ((iv == 0) && !cd)
1585 		return;
1586 
1587 	/* Requested Interrupt Vector must be used by a CQ */
1588 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1589 		if (sc->compl_queues[i].intr_vec == iv) {
1590 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1591 		}
1592 	}
1593 
1594 }
1595 
1596 static void
1597 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1598     struct nvme_feature_obj *feat,
1599     struct nvme_command *command,
1600     struct nvme_completion *compl)
1601 {
1602 	uint16_t nqr;	/* Number of Queues Requested */
1603 
1604 	if (sc->num_q_is_set) {
1605 		WPRINTF("%s: Number of Queues already set", __func__);
1606 		pci_nvme_status_genc(&compl->status,
1607 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1608 		return;
1609 	}
1610 
1611 	nqr = command->cdw11 & 0xFFFF;
1612 	if (nqr == 0xffff) {
1613 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1614 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1615 		return;
1616 	}
1617 
1618 	sc->num_squeues = ONE_BASED(nqr);
1619 	if (sc->num_squeues > sc->max_queues) {
1620 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1621 					sc->max_queues);
1622 		sc->num_squeues = sc->max_queues;
1623 	}
1624 
1625 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1626 	if (nqr == 0xffff) {
1627 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1628 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1629 		return;
1630 	}
1631 
1632 	sc->num_cqueues = ONE_BASED(nqr);
1633 	if (sc->num_cqueues > sc->max_queues) {
1634 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1635 					sc->max_queues);
1636 		sc->num_cqueues = sc->max_queues;
1637 	}
1638 
1639 	/* Patch the command value which will be saved on callback's return */
1640 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1641 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1642 
1643 	sc->num_q_is_set = true;
1644 }
1645 
1646 static int
1647 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1648 	struct nvme_completion *compl)
1649 {
1650 	struct nvme_feature_obj *feat;
1651 	uint32_t nsid = command->nsid;
1652 	uint8_t fid = command->cdw10 & 0xFF;
1653 
1654 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1655 
1656 	if (fid >= NVME_FID_MAX) {
1657 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1658 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1659 		return (1);
1660 	}
1661 	feat = &sc->feat[fid];
1662 
1663 	if (!feat->namespace_specific &&
1664 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1665 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1666 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1667 		return (1);
1668 	}
1669 
1670 	compl->cdw0 = 0;
1671 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1672 
1673 	if (feat->set)
1674 		feat->set(sc, feat, command, compl);
1675 
1676 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1677 	if (compl->status == NVME_SC_SUCCESS) {
1678 		feat->cdw11 = command->cdw11;
1679 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1680 		    (command->cdw11 != 0))
1681 			pci_nvme_aen_notify(sc);
1682 	}
1683 
1684 	return (0);
1685 }
1686 
1687 static int
1688 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1689 	struct nvme_completion* compl)
1690 {
1691 	struct nvme_feature_obj *feat;
1692 	uint8_t fid = command->cdw10 & 0xFF;
1693 
1694 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1695 
1696 	if (fid >= NVME_FID_MAX) {
1697 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1698 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1699 		return (1);
1700 	}
1701 
1702 	compl->cdw0 = 0;
1703 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1704 
1705 	feat = &sc->feat[fid];
1706 	if (feat->get) {
1707 		feat->get(sc, feat, command, compl);
1708 	}
1709 
1710 	if (compl->status == NVME_SC_SUCCESS) {
1711 		compl->cdw0 = feat->cdw11;
1712 	}
1713 
1714 	return (0);
1715 }
1716 
1717 static int
1718 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1719 	struct nvme_completion* compl)
1720 {
1721 	uint8_t	ses, lbaf, pi;
1722 
1723 	/* Only supports Secure Erase Setting - User Data Erase */
1724 	ses = (command->cdw10 >> 9) & 0x7;
1725 	if (ses > 0x1) {
1726 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1727 		return (1);
1728 	}
1729 
1730 	/* Only supports a single LBA Format */
1731 	lbaf = command->cdw10 & 0xf;
1732 	if (lbaf != 0) {
1733 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1734 		    NVME_SC_INVALID_FORMAT);
1735 		return (1);
1736 	}
1737 
1738 	/* Doesn't support Protection Infomation */
1739 	pi = (command->cdw10 >> 5) & 0x7;
1740 	if (pi != 0) {
1741 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1742 		return (1);
1743 	}
1744 
1745 	if (sc->nvstore.type == NVME_STOR_RAM) {
1746 		if (sc->nvstore.ctx)
1747 			free(sc->nvstore.ctx);
1748 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1749 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1750 	} else {
1751 		struct pci_nvme_ioreq *req;
1752 		int err;
1753 
1754 		req = pci_nvme_get_ioreq(sc);
1755 		if (req == NULL) {
1756 			pci_nvme_status_genc(&compl->status,
1757 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1758 			WPRINTF("%s: unable to allocate IO req", __func__);
1759 			return (1);
1760 		}
1761 		req->nvme_sq = &sc->submit_queues[0];
1762 		req->sqid = 0;
1763 		req->opc = command->opc;
1764 		req->cid = command->cid;
1765 		req->nsid = command->nsid;
1766 
1767 		req->io_req.br_offset = 0;
1768 		req->io_req.br_resid = sc->nvstore.size;
1769 		req->io_req.br_callback = pci_nvme_io_done;
1770 
1771 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1772 		if (err) {
1773 			pci_nvme_status_genc(&compl->status,
1774 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1775 			pci_nvme_release_ioreq(sc, req);
1776 		}
1777 	}
1778 
1779 	return (1);
1780 }
1781 
1782 static int
1783 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1784 	struct nvme_completion* compl)
1785 {
1786 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1787 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1788 
1789 	/* TODO: search for the command ID and abort it */
1790 
1791 	compl->cdw0 = 1;
1792 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1793 	return (1);
1794 }
1795 
1796 static int
1797 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1798 	struct nvme_command* command, struct nvme_completion* compl)
1799 {
1800 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1801 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1802 
1803 	/* Don't exceed the Async Event Request Limit (AERL). */
1804 	if (pci_nvme_aer_limit_reached(sc)) {
1805 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1806 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1807 		return (1);
1808 	}
1809 
1810 	if (pci_nvme_aer_add(sc, command->cid)) {
1811 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1812 				NVME_SC_INTERNAL_DEVICE_ERROR);
1813 		return (1);
1814 	}
1815 
1816 	/*
1817 	 * Raise events when they happen based on the Set Features cmd.
1818 	 * These events happen async, so only set completion successful if
1819 	 * there is an event reflective of the request to get event.
1820 	 */
1821 	compl->status = NVME_NO_STATUS;
1822 	pci_nvme_aen_notify(sc);
1823 
1824 	return (0);
1825 }
1826 
1827 static void
1828 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1829 {
1830 	struct nvme_completion compl;
1831 	struct nvme_command *cmd;
1832 	struct nvme_submission_queue *sq;
1833 	struct nvme_completion_queue *cq;
1834 	uint16_t sqhead;
1835 
1836 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1837 
1838 	sq = &sc->submit_queues[0];
1839 	cq = &sc->compl_queues[0];
1840 
1841 	pthread_mutex_lock(&sq->mtx);
1842 
1843 	sqhead = sq->head;
1844 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1845 
1846 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1847 		cmd = &(sq->qbase)[sqhead];
1848 		compl.cdw0 = 0;
1849 		compl.status = 0;
1850 
1851 		switch (cmd->opc) {
1852 		case NVME_OPC_DELETE_IO_SQ:
1853 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1854 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1855 			break;
1856 		case NVME_OPC_CREATE_IO_SQ:
1857 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1858 			nvme_opc_create_io_sq(sc, cmd, &compl);
1859 			break;
1860 		case NVME_OPC_DELETE_IO_CQ:
1861 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1862 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1863 			break;
1864 		case NVME_OPC_CREATE_IO_CQ:
1865 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1866 			nvme_opc_create_io_cq(sc, cmd, &compl);
1867 			break;
1868 		case NVME_OPC_GET_LOG_PAGE:
1869 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1870 			nvme_opc_get_log_page(sc, cmd, &compl);
1871 			break;
1872 		case NVME_OPC_IDENTIFY:
1873 			DPRINTF("%s command IDENTIFY", __func__);
1874 			nvme_opc_identify(sc, cmd, &compl);
1875 			break;
1876 		case NVME_OPC_ABORT:
1877 			DPRINTF("%s command ABORT", __func__);
1878 			nvme_opc_abort(sc, cmd, &compl);
1879 			break;
1880 		case NVME_OPC_SET_FEATURES:
1881 			DPRINTF("%s command SET_FEATURES", __func__);
1882 			nvme_opc_set_features(sc, cmd, &compl);
1883 			break;
1884 		case NVME_OPC_GET_FEATURES:
1885 			DPRINTF("%s command GET_FEATURES", __func__);
1886 			nvme_opc_get_features(sc, cmd, &compl);
1887 			break;
1888 		case NVME_OPC_FIRMWARE_ACTIVATE:
1889 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1890 			pci_nvme_status_tc(&compl.status,
1891 			    NVME_SCT_COMMAND_SPECIFIC,
1892 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1893 			break;
1894 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1895 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1896 			nvme_opc_async_event_req(sc, cmd, &compl);
1897 			break;
1898 		case NVME_OPC_FORMAT_NVM:
1899 			DPRINTF("%s command FORMAT_NVM", __func__);
1900 			if ((sc->ctrldata.oacs &
1901 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1902 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1903 			}
1904 			compl.status = NVME_NO_STATUS;
1905 			nvme_opc_format_nvm(sc, cmd, &compl);
1906 			break;
1907 		default:
1908 			DPRINTF("0x%x command is not implemented",
1909 			    cmd->opc);
1910 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1911 		}
1912 		sqhead = (sqhead + 1) % sq->size;
1913 
1914 		if (NVME_COMPLETION_VALID(compl)) {
1915 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1916 			    compl.cdw0,
1917 			    cmd->cid,
1918 			    0,		/* SQID */
1919 			    compl.status);
1920 		}
1921 	}
1922 
1923 	DPRINTF("setting sqhead %u", sqhead);
1924 	sq->head = sqhead;
1925 
1926 	if (cq->head != cq->tail)
1927 		pci_generate_msix(sc->nsc_pi, 0);
1928 
1929 	pthread_mutex_unlock(&sq->mtx);
1930 }
1931 
1932 /*
1933  * Update the Write and Read statistics reported in SMART data
1934  *
1935  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1936  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1937  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
1938  */
1939 static void
1940 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1941     size_t bytes, uint16_t status)
1942 {
1943 
1944 	pthread_mutex_lock(&sc->mtx);
1945 	switch (opc) {
1946 	case NVME_OPC_WRITE:
1947 		sc->write_commands++;
1948 		if (status != NVME_SC_SUCCESS)
1949 			break;
1950 		sc->write_dunits_remainder += (bytes / 512);
1951 		while (sc->write_dunits_remainder >= 1000) {
1952 			sc->write_data_units++;
1953 			sc->write_dunits_remainder -= 1000;
1954 		}
1955 		break;
1956 	case NVME_OPC_READ:
1957 		sc->read_commands++;
1958 		if (status != NVME_SC_SUCCESS)
1959 			break;
1960 		sc->read_dunits_remainder += (bytes / 512);
1961 		while (sc->read_dunits_remainder >= 1000) {
1962 			sc->read_data_units++;
1963 			sc->read_dunits_remainder -= 1000;
1964 		}
1965 		break;
1966 	default:
1967 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1968 		break;
1969 	}
1970 	pthread_mutex_unlock(&sc->mtx);
1971 }
1972 
1973 /*
1974  * Check if the combination of Starting LBA (slba) and Number of Logical
1975  * Blocks (nlb) exceeds the range of the underlying storage.
1976  *
1977  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1978  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1979  * overflow.
1980  */
1981 static bool
1982 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1983     uint32_t nlb)
1984 {
1985 	size_t	offset, bytes;
1986 
1987 	/* Overflow check of multiplying Starting LBA by the sector size */
1988 	if (slba >> (64 - nvstore->sectsz_bits))
1989 		return (true);
1990 
1991 	offset = slba << nvstore->sectsz_bits;
1992 	bytes = nlb << nvstore->sectsz_bits;
1993 
1994 	/* Overflow check of Number of Logical Blocks */
1995 	if ((nvstore->size - offset) < bytes)
1996 		return (true);
1997 
1998 	return (false);
1999 }
2000 
2001 static int
2002 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2003 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2004 {
2005 	int iovidx;
2006 
2007 	if (req == NULL)
2008 		return (-1);
2009 
2010 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2011 		return (-1);
2012 	}
2013 
2014 	/* concatenate contig block-iovs to minimize number of iovs */
2015 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2016 		iovidx = req->io_req.br_iovcnt - 1;
2017 
2018 		req->io_req.br_iov[iovidx].iov_base =
2019 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2020 				     req->prev_gpaddr, size);
2021 
2022 		req->prev_size += size;
2023 		req->io_req.br_resid += size;
2024 
2025 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2026 	} else {
2027 		iovidx = req->io_req.br_iovcnt;
2028 		if (iovidx == 0) {
2029 			req->io_req.br_offset = lba;
2030 			req->io_req.br_resid = 0;
2031 			req->io_req.br_param = req;
2032 		}
2033 
2034 		req->io_req.br_iov[iovidx].iov_base =
2035 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2036 				     gpaddr, size);
2037 
2038 		req->io_req.br_iov[iovidx].iov_len = size;
2039 
2040 		req->prev_gpaddr = gpaddr;
2041 		req->prev_size = size;
2042 		req->io_req.br_resid += size;
2043 
2044 		req->io_req.br_iovcnt++;
2045 	}
2046 
2047 	return (0);
2048 }
2049 
2050 static void
2051 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2052 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2053 	uint32_t cdw0, uint16_t status)
2054 {
2055 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2056 
2057 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2058 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2059 		 NVME_STATUS_GET_SC(status));
2060 
2061 	pci_nvme_cq_update(sc, cq,
2062 	    0,		/* CDW0 */
2063 	    cid,
2064 	    sqid,
2065 	    status);
2066 
2067 	if (cq->head != cq->tail) {
2068 		if (cq->intr_en & NVME_CQ_INTEN) {
2069 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2070 		} else {
2071 			DPRINTF("%s: CQ%u interrupt disabled",
2072 						__func__, sq->cqid);
2073 		}
2074 	}
2075 }
2076 
2077 static void
2078 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2079 {
2080 	req->sc = NULL;
2081 	req->nvme_sq = NULL;
2082 	req->sqid = 0;
2083 
2084 	pthread_mutex_lock(&sc->mtx);
2085 
2086 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2087 	sc->pending_ios--;
2088 
2089 	/* when no more IO pending, can set to ready if device reset/enabled */
2090 	if (sc->pending_ios == 0 &&
2091 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2092 		sc->regs.csts |= NVME_CSTS_RDY;
2093 
2094 	pthread_mutex_unlock(&sc->mtx);
2095 
2096 	sem_post(&sc->iosemlock);
2097 }
2098 
2099 static struct pci_nvme_ioreq *
2100 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2101 {
2102 	struct pci_nvme_ioreq *req = NULL;
2103 
2104 	sem_wait(&sc->iosemlock);
2105 	pthread_mutex_lock(&sc->mtx);
2106 
2107 	req = STAILQ_FIRST(&sc->ioreqs_free);
2108 	assert(req != NULL);
2109 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2110 
2111 	req->sc = sc;
2112 
2113 	sc->pending_ios++;
2114 
2115 	pthread_mutex_unlock(&sc->mtx);
2116 
2117 	req->io_req.br_iovcnt = 0;
2118 	req->io_req.br_offset = 0;
2119 	req->io_req.br_resid = 0;
2120 	req->io_req.br_param = req;
2121 	req->prev_gpaddr = 0;
2122 	req->prev_size = 0;
2123 
2124 	return req;
2125 }
2126 
2127 static void
2128 pci_nvme_io_done(struct blockif_req *br, int err)
2129 {
2130 	struct pci_nvme_ioreq *req = br->br_param;
2131 	struct nvme_submission_queue *sq = req->nvme_sq;
2132 	uint16_t code, status;
2133 
2134 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2135 
2136 	/* TODO return correct error */
2137 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2138 	pci_nvme_status_genc(&status, code);
2139 
2140 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2141 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2142 	    req->bytes, status);
2143 	pci_nvme_release_ioreq(req->sc, req);
2144 }
2145 
2146 /*
2147  * Implements the Flush command. The specification states:
2148  *    If a volatile write cache is not present, Flush commands complete
2149  *    successfully and have no effect
2150  * in the description of the Volatile Write Cache (VWC) field of the Identify
2151  * Controller data. Therefore, set status to Success if the command is
2152  * not supported (i.e. RAM or as indicated by the blockif).
2153  */
2154 static bool
2155 nvme_opc_flush(struct pci_nvme_softc *sc,
2156     struct nvme_command *cmd,
2157     struct pci_nvme_blockstore *nvstore,
2158     struct pci_nvme_ioreq *req,
2159     uint16_t *status)
2160 {
2161 	bool pending = false;
2162 
2163 	if (nvstore->type == NVME_STOR_RAM) {
2164 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2165 	} else {
2166 		int err;
2167 
2168 		req->io_req.br_callback = pci_nvme_io_done;
2169 
2170 		err = blockif_flush(nvstore->ctx, &req->io_req);
2171 		switch (err) {
2172 		case 0:
2173 			pending = true;
2174 			break;
2175 		case EOPNOTSUPP:
2176 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2177 			break;
2178 		default:
2179 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2180 		}
2181 	}
2182 
2183 	return (pending);
2184 }
2185 
2186 static uint16_t
2187 nvme_write_read_ram(struct pci_nvme_softc *sc,
2188     struct pci_nvme_blockstore *nvstore,
2189     uint64_t prp1, uint64_t prp2,
2190     size_t offset, uint64_t bytes,
2191     bool is_write)
2192 {
2193 	uint8_t *buf = nvstore->ctx;
2194 	enum nvme_copy_dir dir;
2195 	uint16_t status;
2196 
2197 	if (is_write)
2198 		dir = NVME_COPY_TO_PRP;
2199 	else
2200 		dir = NVME_COPY_FROM_PRP;
2201 
2202 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2203 	    buf + offset, bytes, dir))
2204 		pci_nvme_status_genc(&status,
2205 		    NVME_SC_DATA_TRANSFER_ERROR);
2206 	else
2207 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2208 
2209 	return (status);
2210 }
2211 
2212 static uint16_t
2213 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2214     struct pci_nvme_blockstore *nvstore,
2215     struct pci_nvme_ioreq *req,
2216     uint64_t prp1, uint64_t prp2,
2217     size_t offset, uint64_t bytes,
2218     bool is_write)
2219 {
2220 	uint64_t size;
2221 	int err;
2222 	uint16_t status = NVME_NO_STATUS;
2223 
2224 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2225 	if (pci_nvme_append_iov_req(sc, req, prp1,
2226 	    size, is_write, offset)) {
2227 		pci_nvme_status_genc(&status,
2228 		    NVME_SC_DATA_TRANSFER_ERROR);
2229 		goto out;
2230 	}
2231 
2232 	offset += size;
2233 	bytes  -= size;
2234 
2235 	if (bytes == 0) {
2236 		;
2237 	} else if (bytes <= PAGE_SIZE) {
2238 		size = bytes;
2239 		if (pci_nvme_append_iov_req(sc, req, prp2,
2240 		    size, is_write, offset)) {
2241 			pci_nvme_status_genc(&status,
2242 			    NVME_SC_DATA_TRANSFER_ERROR);
2243 			goto out;
2244 		}
2245 	} else {
2246 		void *vmctx = sc->nsc_pi->pi_vmctx;
2247 		uint64_t *prp_list = &prp2;
2248 		uint64_t *last = prp_list;
2249 
2250 		/* PRP2 is pointer to a physical region page list */
2251 		while (bytes) {
2252 			/* Last entry in list points to the next list */
2253 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2254 				uint64_t prp = *prp_list;
2255 
2256 				prp_list = paddr_guest2host(vmctx, prp,
2257 				    PAGE_SIZE - (prp % PAGE_SIZE));
2258 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2259 			}
2260 
2261 			size = MIN(bytes, PAGE_SIZE);
2262 
2263 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2264 			    size, is_write, offset)) {
2265 				pci_nvme_status_genc(&status,
2266 				    NVME_SC_DATA_TRANSFER_ERROR);
2267 				goto out;
2268 			}
2269 
2270 			offset += size;
2271 			bytes  -= size;
2272 
2273 			prp_list++;
2274 		}
2275 	}
2276 	req->io_req.br_callback = pci_nvme_io_done;
2277 	if (is_write)
2278 		err = blockif_write(nvstore->ctx, &req->io_req);
2279 	else
2280 		err = blockif_read(nvstore->ctx, &req->io_req);
2281 
2282 	if (err)
2283 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2284 out:
2285 	return (status);
2286 }
2287 
2288 static bool
2289 nvme_opc_write_read(struct pci_nvme_softc *sc,
2290     struct nvme_command *cmd,
2291     struct pci_nvme_blockstore *nvstore,
2292     struct pci_nvme_ioreq *req,
2293     uint16_t *status)
2294 {
2295 	uint64_t lba, nblocks, bytes;
2296 	size_t offset;
2297 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2298 	bool pending = false;
2299 
2300 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2301 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2302 
2303 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2304 		WPRINTF("%s command would exceed LBA range", __func__);
2305 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2306 		goto out;
2307 	}
2308 
2309 	bytes  = nblocks << nvstore->sectsz_bits;
2310 	if (bytes > NVME_MAX_DATA_SIZE) {
2311 		WPRINTF("%s command would exceed MDTS", __func__);
2312 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2313 		goto out;
2314 	}
2315 
2316 	offset = lba << nvstore->sectsz_bits;
2317 
2318 	req->bytes = bytes;
2319 	req->io_req.br_offset = lba;
2320 
2321 	/* PRP bits 1:0 must be zero */
2322 	cmd->prp1 &= ~0x3UL;
2323 	cmd->prp2 &= ~0x3UL;
2324 
2325 	if (nvstore->type == NVME_STOR_RAM) {
2326 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2327 		    cmd->prp2, offset, bytes, is_write);
2328 	} else {
2329 		*status = nvme_write_read_blockif(sc, nvstore, req,
2330 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2331 
2332 		if (*status == NVME_NO_STATUS)
2333 			pending = true;
2334 	}
2335 out:
2336 	if (!pending)
2337 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2338 
2339 	return (pending);
2340 }
2341 
2342 static void
2343 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2344 {
2345 	struct pci_nvme_ioreq *req = br->br_param;
2346 	struct pci_nvme_softc *sc = req->sc;
2347 	bool done = true;
2348 	uint16_t status;
2349 
2350 	if (err) {
2351 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2352 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2353 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2354 	} else {
2355 		struct iovec *iov = req->io_req.br_iov;
2356 
2357 		req->prev_gpaddr++;
2358 		iov += req->prev_gpaddr;
2359 
2360 		/* The iov_* values already include the sector size */
2361 		req->io_req.br_offset = (off_t)iov->iov_base;
2362 		req->io_req.br_resid = iov->iov_len;
2363 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2364 			pci_nvme_status_genc(&status,
2365 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2366 		} else
2367 			done = false;
2368 	}
2369 
2370 	if (done) {
2371 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2372 		    req->cid, 0, status);
2373 		pci_nvme_release_ioreq(sc, req);
2374 	}
2375 }
2376 
2377 static bool
2378 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2379     struct nvme_command *cmd,
2380     struct pci_nvme_blockstore *nvstore,
2381     struct pci_nvme_ioreq *req,
2382     uint16_t *status)
2383 {
2384 	struct nvme_dsm_range *range;
2385 	uint32_t nr, r, non_zero, dr;
2386 	int err;
2387 	bool pending = false;
2388 
2389 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2390 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2391 		goto out;
2392 	}
2393 
2394 	nr = cmd->cdw10 & 0xff;
2395 
2396 	/* copy locally because a range entry could straddle PRPs */
2397 	range = calloc(1, NVME_MAX_DSM_TRIM);
2398 	if (range == NULL) {
2399 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2400 		goto out;
2401 	}
2402 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2403 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2404 
2405 	/* Check for invalid ranges and the number of non-zero lengths */
2406 	non_zero = 0;
2407 	for (r = 0; r <= nr; r++) {
2408 		if (pci_nvme_out_of_range(nvstore,
2409 		    range[r].starting_lba, range[r].length)) {
2410 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2411 			goto out;
2412 		}
2413 		if (range[r].length != 0)
2414 			non_zero++;
2415 	}
2416 
2417 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2418 		size_t offset, bytes;
2419 		int sectsz_bits = sc->nvstore.sectsz_bits;
2420 
2421 		/*
2422 		 * DSM calls are advisory only, and compliant controllers
2423 		 * may choose to take no actions (i.e. return Success).
2424 		 */
2425 		if (!nvstore->deallocate) {
2426 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2427 			goto out;
2428 		}
2429 
2430 		/* If all ranges have a zero length, return Success */
2431 		if (non_zero == 0) {
2432 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2433 			goto out;
2434 		}
2435 
2436 		if (req == NULL) {
2437 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2438 			goto out;
2439 		}
2440 
2441 		offset = range[0].starting_lba << sectsz_bits;
2442 		bytes = range[0].length << sectsz_bits;
2443 
2444 		/*
2445 		 * If the request is for more than a single range, store
2446 		 * the ranges in the br_iov. Optimize for the common case
2447 		 * of a single range.
2448 		 *
2449 		 * Note that NVMe Number of Ranges is a zero based value
2450 		 */
2451 		req->io_req.br_iovcnt = 0;
2452 		req->io_req.br_offset = offset;
2453 		req->io_req.br_resid = bytes;
2454 
2455 		if (nr == 0) {
2456 			req->io_req.br_callback = pci_nvme_io_done;
2457 		} else {
2458 			struct iovec *iov = req->io_req.br_iov;
2459 
2460 			for (r = 0, dr = 0; r <= nr; r++) {
2461 				offset = range[r].starting_lba << sectsz_bits;
2462 				bytes = range[r].length << sectsz_bits;
2463 				if (bytes == 0)
2464 					continue;
2465 
2466 				if ((nvstore->size - offset) < bytes) {
2467 					pci_nvme_status_genc(status,
2468 					    NVME_SC_LBA_OUT_OF_RANGE);
2469 					goto out;
2470 				}
2471 				iov[dr].iov_base = (void *)offset;
2472 				iov[dr].iov_len = bytes;
2473 				dr++;
2474 			}
2475 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2476 
2477 			/*
2478 			 * Use prev_gpaddr to track the current entry and
2479 			 * prev_size to track the number of entries
2480 			 */
2481 			req->prev_gpaddr = 0;
2482 			req->prev_size = dr;
2483 		}
2484 
2485 		err = blockif_delete(nvstore->ctx, &req->io_req);
2486 		if (err)
2487 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2488 		else
2489 			pending = true;
2490 	}
2491 out:
2492 	free(range);
2493 	return (pending);
2494 }
2495 
2496 static void
2497 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2498 {
2499 	struct nvme_submission_queue *sq;
2500 	uint16_t status;
2501 	uint16_t sqhead;
2502 
2503 	/* handle all submissions up to sq->tail index */
2504 	sq = &sc->submit_queues[idx];
2505 
2506 	pthread_mutex_lock(&sq->mtx);
2507 
2508 	sqhead = sq->head;
2509 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2510 	         idx, sqhead, sq->tail, sq->qbase);
2511 
2512 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2513 		struct nvme_command *cmd;
2514 		struct pci_nvme_ioreq *req;
2515 		uint32_t nsid;
2516 		bool pending;
2517 
2518 		pending = false;
2519 		req = NULL;
2520 		status = 0;
2521 
2522 		cmd = &sq->qbase[sqhead];
2523 		sqhead = (sqhead + 1) % sq->size;
2524 
2525 		nsid = le32toh(cmd->nsid);
2526 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2527 			pci_nvme_status_genc(&status,
2528 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2529 			status |=
2530 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2531 			goto complete;
2532  		}
2533 
2534 		req = pci_nvme_get_ioreq(sc);
2535 		if (req == NULL) {
2536 			pci_nvme_status_genc(&status,
2537 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2538 			WPRINTF("%s: unable to allocate IO req", __func__);
2539 			goto complete;
2540 		}
2541 		req->nvme_sq = sq;
2542 		req->sqid = idx;
2543 		req->opc = cmd->opc;
2544 		req->cid = cmd->cid;
2545 		req->nsid = cmd->nsid;
2546 
2547 		switch (cmd->opc) {
2548 		case NVME_OPC_FLUSH:
2549 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2550 			    req, &status);
2551  			break;
2552 		case NVME_OPC_WRITE:
2553 		case NVME_OPC_READ:
2554 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2555 			    req, &status);
2556 			break;
2557 		case NVME_OPC_WRITE_ZEROES:
2558 			/* TODO: write zeroes
2559 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2560 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2561 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2562 			break;
2563 		case NVME_OPC_DATASET_MANAGEMENT:
2564  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2565 			    req, &status);
2566 			break;
2567  		default:
2568  			WPRINTF("%s unhandled io command 0x%x",
2569 			    __func__, cmd->opc);
2570 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2571 		}
2572 complete:
2573 		if (!pending) {
2574 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2575 			    status);
2576 			if (req != NULL)
2577 				pci_nvme_release_ioreq(sc, req);
2578 		}
2579 	}
2580 
2581 	sq->head = sqhead;
2582 
2583 	pthread_mutex_unlock(&sq->mtx);
2584 }
2585 
2586 static void
2587 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2588 	uint64_t idx, int is_sq, uint64_t value)
2589 {
2590 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2591 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2592 
2593 	if (is_sq) {
2594 		if (idx > sc->num_squeues) {
2595 			WPRINTF("%s queue index %lu overflow from "
2596 			         "guest (max %u)",
2597 			         __func__, idx, sc->num_squeues);
2598 			return;
2599 		}
2600 
2601 		atomic_store_short(&sc->submit_queues[idx].tail,
2602 		                   (uint16_t)value);
2603 
2604 		if (idx == 0) {
2605 			pci_nvme_handle_admin_cmd(sc, value);
2606 		} else {
2607 			/* submission queue; handle new entries in SQ */
2608 			if (idx > sc->num_squeues) {
2609 				WPRINTF("%s SQ index %lu overflow from "
2610 				         "guest (max %u)",
2611 				         __func__, idx, sc->num_squeues);
2612 				return;
2613 			}
2614 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2615 		}
2616 	} else {
2617 		if (idx > sc->num_cqueues) {
2618 			WPRINTF("%s queue index %lu overflow from "
2619 			         "guest (max %u)",
2620 			         __func__, idx, sc->num_cqueues);
2621 			return;
2622 		}
2623 
2624 		atomic_store_short(&sc->compl_queues[idx].head,
2625 				(uint16_t)value);
2626 	}
2627 }
2628 
2629 static void
2630 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2631 {
2632 	const char *s = iswrite ? "WRITE" : "READ";
2633 
2634 	switch (offset) {
2635 	case NVME_CR_CAP_LOW:
2636 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2637 		break;
2638 	case NVME_CR_CAP_HI:
2639 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2640 		break;
2641 	case NVME_CR_VS:
2642 		DPRINTF("%s %s NVME_CR_VS", func, s);
2643 		break;
2644 	case NVME_CR_INTMS:
2645 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2646 		break;
2647 	case NVME_CR_INTMC:
2648 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2649 		break;
2650 	case NVME_CR_CC:
2651 		DPRINTF("%s %s NVME_CR_CC", func, s);
2652 		break;
2653 	case NVME_CR_CSTS:
2654 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2655 		break;
2656 	case NVME_CR_NSSR:
2657 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2658 		break;
2659 	case NVME_CR_AQA:
2660 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2661 		break;
2662 	case NVME_CR_ASQ_LOW:
2663 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2664 		break;
2665 	case NVME_CR_ASQ_HI:
2666 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2667 		break;
2668 	case NVME_CR_ACQ_LOW:
2669 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2670 		break;
2671 	case NVME_CR_ACQ_HI:
2672 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2673 		break;
2674 	default:
2675 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2676 	}
2677 
2678 }
2679 
2680 static void
2681 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2682 	uint64_t offset, int size, uint64_t value)
2683 {
2684 	uint32_t ccreg;
2685 
2686 	if (offset >= NVME_DOORBELL_OFFSET) {
2687 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2688 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2689 		int is_sq = (belloffset % 8) < 4;
2690 
2691 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2692 			WPRINTF("guest attempted an overflow write offset "
2693 			         "0x%lx, val 0x%lx in %s",
2694 			         offset, value, __func__);
2695 			return;
2696 		}
2697 
2698 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2699 		return;
2700 	}
2701 
2702 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2703 	        offset, size, value);
2704 
2705 	if (size != 4) {
2706 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2707 		         "val 0x%lx) to bar0 in %s",
2708 		         size, offset, value, __func__);
2709 		/* TODO: shutdown device */
2710 		return;
2711 	}
2712 
2713 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2714 
2715 	pthread_mutex_lock(&sc->mtx);
2716 
2717 	switch (offset) {
2718 	case NVME_CR_CAP_LOW:
2719 	case NVME_CR_CAP_HI:
2720 		/* readonly */
2721 		break;
2722 	case NVME_CR_VS:
2723 		/* readonly */
2724 		break;
2725 	case NVME_CR_INTMS:
2726 		/* MSI-X, so ignore */
2727 		break;
2728 	case NVME_CR_INTMC:
2729 		/* MSI-X, so ignore */
2730 		break;
2731 	case NVME_CR_CC:
2732 		ccreg = (uint32_t)value;
2733 
2734 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2735 		         "iocqes %u",
2736 		        __func__,
2737 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2738 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2739 			 NVME_CC_GET_IOCQES(ccreg));
2740 
2741 		if (NVME_CC_GET_SHN(ccreg)) {
2742 			/* perform shutdown - flush out data to backend */
2743 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2744 			    NVME_CSTS_REG_SHST_SHIFT);
2745 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2746 			    NVME_CSTS_REG_SHST_SHIFT;
2747 		}
2748 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2749 			if (NVME_CC_GET_EN(ccreg) == 0)
2750 				/* transition 1-> causes controller reset */
2751 				pci_nvme_reset_locked(sc);
2752 			else
2753 				pci_nvme_init_controller(ctx, sc);
2754 		}
2755 
2756 		/* Insert the iocqes, iosqes and en bits from the write */
2757 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2758 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2759 		if (NVME_CC_GET_EN(ccreg) == 0) {
2760 			/* Insert the ams, mps and css bit fields */
2761 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2762 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2763 			sc->regs.csts &= ~NVME_CSTS_RDY;
2764 		} else if (sc->pending_ios == 0) {
2765 			sc->regs.csts |= NVME_CSTS_RDY;
2766 		}
2767 		break;
2768 	case NVME_CR_CSTS:
2769 		break;
2770 	case NVME_CR_NSSR:
2771 		/* ignore writes; don't support subsystem reset */
2772 		break;
2773 	case NVME_CR_AQA:
2774 		sc->regs.aqa = (uint32_t)value;
2775 		break;
2776 	case NVME_CR_ASQ_LOW:
2777 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2778 		               (0xFFFFF000 & value);
2779 		break;
2780 	case NVME_CR_ASQ_HI:
2781 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2782 		               (value << 32);
2783 		break;
2784 	case NVME_CR_ACQ_LOW:
2785 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2786 		               (0xFFFFF000 & value);
2787 		break;
2788 	case NVME_CR_ACQ_HI:
2789 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2790 		               (value << 32);
2791 		break;
2792 	default:
2793 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2794 		         __func__, offset, value, size);
2795 	}
2796 	pthread_mutex_unlock(&sc->mtx);
2797 }
2798 
2799 static void
2800 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2801                 int baridx, uint64_t offset, int size, uint64_t value)
2802 {
2803 	struct pci_nvme_softc* sc = pi->pi_arg;
2804 
2805 	if (baridx == pci_msix_table_bar(pi) ||
2806 	    baridx == pci_msix_pba_bar(pi)) {
2807 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2808 		         " value 0x%lx", baridx, offset, size, value);
2809 
2810 		pci_emul_msix_twrite(pi, offset, size, value);
2811 		return;
2812 	}
2813 
2814 	switch (baridx) {
2815 	case 0:
2816 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2817 		break;
2818 
2819 	default:
2820 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2821 		         __func__, baridx, value);
2822 	}
2823 }
2824 
2825 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2826 	uint64_t offset, int size)
2827 {
2828 	uint64_t value;
2829 
2830 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2831 
2832 	if (offset < NVME_DOORBELL_OFFSET) {
2833 		void *p = &(sc->regs);
2834 		pthread_mutex_lock(&sc->mtx);
2835 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2836 		pthread_mutex_unlock(&sc->mtx);
2837 	} else {
2838 		value = 0;
2839                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
2840 	}
2841 
2842 	switch (size) {
2843 	case 1:
2844 		value &= 0xFF;
2845 		break;
2846 	case 2:
2847 		value &= 0xFFFF;
2848 		break;
2849 	case 4:
2850 		value &= 0xFFFFFFFF;
2851 		break;
2852 	}
2853 
2854 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2855 	         offset, size, (uint32_t)value);
2856 
2857 	return (value);
2858 }
2859 
2860 
2861 
2862 static uint64_t
2863 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2864     uint64_t offset, int size)
2865 {
2866 	struct pci_nvme_softc* sc = pi->pi_arg;
2867 
2868 	if (baridx == pci_msix_table_bar(pi) ||
2869 	    baridx == pci_msix_pba_bar(pi)) {
2870 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2871 		        baridx, offset, size);
2872 
2873 		return pci_emul_msix_tread(pi, offset, size);
2874 	}
2875 
2876 	switch (baridx) {
2877 	case 0:
2878        		return pci_nvme_read_bar_0(sc, offset, size);
2879 
2880 	default:
2881 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2882 	}
2883 
2884 	return (0);
2885 }
2886 
2887 static int
2888 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2889 {
2890 	char bident[sizeof("XX:X:X")];
2891 	const char *value;
2892 	uint32_t sectsz;
2893 
2894 	sc->max_queues = NVME_QUEUES;
2895 	sc->max_qentries = NVME_MAX_QENTRIES;
2896 	sc->ioslots = NVME_IOSLOTS;
2897 	sc->num_squeues = sc->max_queues;
2898 	sc->num_cqueues = sc->max_queues;
2899 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2900 	sectsz = 0;
2901 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2902 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2903 
2904 	value = get_config_value_node(nvl, "maxq");
2905 	if (value != NULL)
2906 		sc->max_queues = atoi(value);
2907 	value = get_config_value_node(nvl, "qsz");
2908 	if (value != NULL) {
2909 		sc->max_qentries = atoi(value);
2910 		if (sc->max_qentries <= 0) {
2911 			EPRINTLN("nvme: Invalid qsz option %d",
2912 			    sc->max_qentries);
2913 			return (-1);
2914 		}
2915 	}
2916 	value = get_config_value_node(nvl, "ioslots");
2917 	if (value != NULL) {
2918 		sc->ioslots = atoi(value);
2919 		if (sc->ioslots <= 0) {
2920 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2921 			return (-1);
2922 		}
2923 	}
2924 	value = get_config_value_node(nvl, "sectsz");
2925 	if (value != NULL)
2926 		sectsz = atoi(value);
2927 	value = get_config_value_node(nvl, "ser");
2928 	if (value != NULL) {
2929 		/*
2930 		 * This field indicates the Product Serial Number in
2931 		 * 7-bit ASCII, unused bytes should be space characters.
2932 		 * Ref: NVMe v1.3c.
2933 		 */
2934 		cpywithpad((char *)sc->ctrldata.sn,
2935 		    sizeof(sc->ctrldata.sn), value, ' ');
2936 	}
2937 	value = get_config_value_node(nvl, "eui64");
2938 	if (value != NULL)
2939 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2940 	value = get_config_value_node(nvl, "dsm");
2941 	if (value != NULL) {
2942 		if (strcmp(value, "auto") == 0)
2943 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2944 		else if (strcmp(value, "enable") == 0)
2945 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2946 		else if (strcmp(value, "disable") == 0)
2947 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2948 	}
2949 
2950 	value = get_config_value_node(nvl, "ram");
2951 	if (value != NULL) {
2952 		uint64_t sz = strtoull(value, NULL, 10);
2953 
2954 		sc->nvstore.type = NVME_STOR_RAM;
2955 		sc->nvstore.size = sz * 1024 * 1024;
2956 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2957 		sc->nvstore.sectsz = 4096;
2958 		sc->nvstore.sectsz_bits = 12;
2959 		if (sc->nvstore.ctx == NULL) {
2960 			EPRINTLN("nvme: Unable to allocate RAM");
2961 			return (-1);
2962 		}
2963 	} else {
2964 		snprintf(bident, sizeof(bident), "%d:%d",
2965 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2966 		sc->nvstore.ctx = blockif_open(nvl, bident);
2967 		if (sc->nvstore.ctx == NULL) {
2968 			EPRINTLN("nvme: Could not open backing file: %s",
2969 			    strerror(errno));
2970 			return (-1);
2971 		}
2972 		sc->nvstore.type = NVME_STOR_BLOCKIF;
2973 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2974 	}
2975 
2976 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2977 		sc->nvstore.sectsz = sectsz;
2978 	else if (sc->nvstore.type != NVME_STOR_RAM)
2979 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2980 	for (sc->nvstore.sectsz_bits = 9;
2981 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2982 	     sc->nvstore.sectsz_bits++);
2983 
2984 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2985 		sc->max_queues = NVME_QUEUES;
2986 
2987 	return (0);
2988 }
2989 
2990 static void
2991 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
2992 {
2993 	struct pci_nvme_softc *sc;
2994 	struct pci_nvme_blockstore *nvstore;
2995 	struct nvme_namespace_data *nd;
2996 
2997 	sc = arg;
2998 	nvstore = &sc->nvstore;
2999 	nd = &sc->nsdata;
3000 
3001 	nvstore->size = new_size;
3002 	pci_nvme_init_nsdata_size(nvstore, nd);
3003 
3004 	/* Add changed NSID to list */
3005 	sc->ns_log.ns[0] = 1;
3006 	sc->ns_log.ns[1] = 0;
3007 
3008 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3009 	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
3010 }
3011 
3012 static int
3013 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3014 {
3015 	struct pci_nvme_softc *sc;
3016 	uint32_t pci_membar_sz;
3017 	int	error;
3018 
3019 	error = 0;
3020 
3021 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3022 	pi->pi_arg = sc;
3023 	sc->nsc_pi = pi;
3024 
3025 	error = pci_nvme_parse_config(sc, nvl);
3026 	if (error < 0)
3027 		goto done;
3028 	else
3029 		error = 0;
3030 
3031 	STAILQ_INIT(&sc->ioreqs_free);
3032 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3033 	for (int i = 0; i < sc->ioslots; i++) {
3034 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3035 	}
3036 
3037 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3038 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3039 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3040 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3041 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3042 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3043 
3044 	/*
3045 	 * Allocate size of NVMe registers + doorbell space for all queues.
3046 	 *
3047 	 * The specification requires a minimum memory I/O window size of 16K.
3048 	 * The Windows driver will refuse to start a device with a smaller
3049 	 * window.
3050 	 */
3051 	pci_membar_sz = sizeof(struct nvme_registers) +
3052 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3053 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3054 
3055 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3056 
3057 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3058 	if (error) {
3059 		WPRINTF("%s pci alloc mem bar failed", __func__);
3060 		goto done;
3061 	}
3062 
3063 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3064 	if (error) {
3065 		WPRINTF("%s pci add msixcap failed", __func__);
3066 		goto done;
3067 	}
3068 
3069 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3070 	if (error) {
3071 		WPRINTF("%s pci add Express capability failed", __func__);
3072 		goto done;
3073 	}
3074 
3075 	pthread_mutex_init(&sc->mtx, NULL);
3076 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3077 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3078 
3079 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3080 	/*
3081 	 * Controller data depends on Namespace data so initialize Namespace
3082 	 * data first.
3083 	 */
3084 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3085 	pci_nvme_init_ctrldata(sc);
3086 	pci_nvme_init_logpages(sc);
3087 	pci_nvme_init_features(sc);
3088 
3089 	pci_nvme_aer_init(sc);
3090 	pci_nvme_aen_init(sc);
3091 
3092 	pci_nvme_reset(sc);
3093 
3094 	pci_lintr_request(pi);
3095 
3096 done:
3097 	return (error);
3098 }
3099 
3100 static int
3101 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3102 {
3103 	char *cp, *ram;
3104 
3105 	if (opts == NULL)
3106 		return (0);
3107 
3108 	if (strncmp(opts, "ram=", 4) == 0) {
3109 		cp = strchr(opts, ',');
3110 		if (cp == NULL) {
3111 			set_config_value_node(nvl, "ram", opts + 4);
3112 			return (0);
3113 		}
3114 		ram = strndup(opts + 4, cp - opts - 4);
3115 		set_config_value_node(nvl, "ram", ram);
3116 		free(ram);
3117 		return (pci_parse_legacy_config(nvl, cp + 1));
3118 	} else
3119 		return (blockif_legacy_config(nvl, opts));
3120 }
3121 
3122 struct pci_devemu pci_de_nvme = {
3123 	.pe_emu =	"nvme",
3124 	.pe_init =	pci_nvme_init,
3125 	.pe_legacy_config = pci_nvme_legacy_config,
3126 	.pe_barwrite =	pci_nvme_write,
3127 	.pe_barread =	pci_nvme_read
3128 };
3129 PCI_EMUL_SET(pci_de_nvme);
3130