xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 7c20397b724a55001c2054fa133a768e9d06eb1c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable,disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281 	PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283 
284 #define PCI_NVME_AEI_NOTICE_SHIFT		8
285 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
286 
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289 	pci_nvme_async_type atype;
290 	uint32_t	event_data;
291 	bool		posted;
292 };
293 
294 /*
295  * By default, enable all Asynchrnous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
300 
301 typedef enum {
302 	NVME_CNTRLTYPE_IO = 1,
303 	NVME_CNTRLTYPE_DISCOVERY = 2,
304 	NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306 
307 struct pci_nvme_softc {
308 	struct pci_devinst *nsc_pi;
309 
310 	pthread_mutex_t	mtx;
311 
312 	struct nvme_registers regs;
313 
314 	struct nvme_namespace_data  nsdata;
315 	struct nvme_controller_data ctrldata;
316 	struct nvme_error_information_entry err_log;
317 	struct nvme_health_information_page health_log;
318 	struct nvme_firmware_page fw_log;
319 	struct nvme_ns_list ns_log;
320 
321 	struct pci_nvme_blockstore nvstore;
322 
323 	uint16_t	max_qentries;	/* max entries per queue */
324 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
325 	uint32_t	num_cqueues;
326 	uint32_t	num_squeues;
327 	bool		num_q_is_set; /* Has host set Number of Queues */
328 
329 	struct pci_nvme_ioreq *ioreqs;
330 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331 	uint32_t	pending_ios;
332 	uint32_t	ioslots;
333 	sem_t		iosemlock;
334 
335 	/*
336 	 * Memory mapped Submission and Completion queues
337 	 * Each array includes both Admin and IO queues
338 	 */
339 	struct nvme_completion_queue *compl_queues;
340 	struct nvme_submission_queue *submit_queues;
341 
342 	struct nvme_feature_obj feat[NVME_FID_MAX];
343 
344 	enum nvme_dsm_type dataset_management;
345 
346 	/* Accounting for SMART data */
347 	__uint128_t	read_data_units;
348 	__uint128_t	write_data_units;
349 	__uint128_t	read_commands;
350 	__uint128_t	write_commands;
351 	uint32_t	read_dunits_remainder;
352 	uint32_t	write_dunits_remainder;
353 
354 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
355 	pthread_mutex_t	aer_mtx;
356 	uint32_t	aer_count;
357 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358 	pthread_t	aen_tid;
359 	pthread_mutex_t	aen_mtx;
360 	pthread_cond_t	aen_cond;
361 };
362 
363 
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373 
374 /* Controller Configuration utils */
375 #define	NVME_CC_GET_EN(cc) \
376 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define	NVME_CC_GET_CSS(cc) \
378 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define	NVME_CC_GET_SHN(cc) \
380 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define	NVME_CC_GET_IOSQES(cc) \
382 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define	NVME_CC_GET_IOCQES(cc) \
384 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385 
386 #define	NVME_CC_WRITE_MASK \
387 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390 
391 #define	NVME_CC_NEN_WRITE_MASK \
392 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395 
396 /* Controller Status utils */
397 #define	NVME_CSTS_GET_RDY(sts) \
398 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399 
400 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
401 
402 /* Completion Queue status word utils */
403 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
404 #define	NVME_STATUS_MASK \
405 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
406 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
407 
408 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
409 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
410 
411 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
412     struct nvme_feature_obj *,
413     struct nvme_command *,
414     struct nvme_completion *);
415 static void nvme_feature_temperature(struct pci_nvme_softc *,
416     struct nvme_feature_obj *,
417     struct nvme_command *,
418     struct nvme_completion *);
419 static void nvme_feature_num_queues(struct pci_nvme_softc *,
420     struct nvme_feature_obj *,
421     struct nvme_command *,
422     struct nvme_completion *);
423 static void nvme_feature_iv_config(struct pci_nvme_softc *,
424     struct nvme_feature_obj *,
425     struct nvme_command *,
426     struct nvme_completion *);
427 static void nvme_feature_async_event(struct pci_nvme_softc *,
428     struct nvme_feature_obj *,
429     struct nvme_command *,
430     struct nvme_completion *);
431 
432 static void *aen_thr(void *arg);
433 
434 static __inline void
435 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
436 {
437 	size_t len;
438 
439 	len = strnlen(src, dst_size);
440 	memset(dst, pad, dst_size);
441 	memcpy(dst, src, len);
442 }
443 
444 static __inline void
445 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
446 {
447 
448 	*status &= ~NVME_STATUS_MASK;
449 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
450 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
451 }
452 
453 static __inline void
454 pci_nvme_status_genc(uint16_t *status, uint16_t code)
455 {
456 
457 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
458 }
459 
460 /*
461  * Initialize the requested number or IO Submission and Completion Queues.
462  * Admin queues are allocated implicitly.
463  */
464 static void
465 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
466 {
467 	uint32_t i;
468 
469 	/*
470 	 * Allocate and initialize the Submission Queues
471 	 */
472 	if (nsq > NVME_QUEUES) {
473 		WPRINTF("%s: clamping number of SQ from %u to %u",
474 					__func__, nsq, NVME_QUEUES);
475 		nsq = NVME_QUEUES;
476 	}
477 
478 	sc->num_squeues = nsq;
479 
480 	sc->submit_queues = calloc(sc->num_squeues + 1,
481 				sizeof(struct nvme_submission_queue));
482 	if (sc->submit_queues == NULL) {
483 		WPRINTF("%s: SQ allocation failed", __func__);
484 		sc->num_squeues = 0;
485 	} else {
486 		struct nvme_submission_queue *sq = sc->submit_queues;
487 
488 		for (i = 0; i < sc->num_squeues + 1; i++)
489 			pthread_mutex_init(&sq[i].mtx, NULL);
490 	}
491 
492 	/*
493 	 * Allocate and initialize the Completion Queues
494 	 */
495 	if (ncq > NVME_QUEUES) {
496 		WPRINTF("%s: clamping number of CQ from %u to %u",
497 					__func__, ncq, NVME_QUEUES);
498 		ncq = NVME_QUEUES;
499 	}
500 
501 	sc->num_cqueues = ncq;
502 
503 	sc->compl_queues = calloc(sc->num_cqueues + 1,
504 				sizeof(struct nvme_completion_queue));
505 	if (sc->compl_queues == NULL) {
506 		WPRINTF("%s: CQ allocation failed", __func__);
507 		sc->num_cqueues = 0;
508 	} else {
509 		struct nvme_completion_queue *cq = sc->compl_queues;
510 
511 		for (i = 0; i < sc->num_cqueues + 1; i++)
512 			pthread_mutex_init(&cq[i].mtx, NULL);
513 	}
514 }
515 
516 static void
517 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
518 {
519 	struct nvme_controller_data *cd = &sc->ctrldata;
520 
521 	cd->vid = 0xFB5D;
522 	cd->ssvid = 0x0000;
523 
524 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
525 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
526 
527 	/* Num of submission commands that we can handle at a time (2^rab) */
528 	cd->rab   = 4;
529 
530 	/* FreeBSD OUI */
531 	cd->ieee[0] = 0x58;
532 	cd->ieee[1] = 0x9c;
533 	cd->ieee[2] = 0xfc;
534 
535 	cd->mic = 0;
536 
537 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
538 
539 	cd->ver = NVME_REV(1,4);
540 
541 	cd->cntrltype = NVME_CNTRLTYPE_IO;
542 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
543 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
544 	cd->acl = 2;
545 	cd->aerl = 4;
546 
547 	/* Advertise 1, Read-only firmware slot */
548 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
549 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
550 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
551 	cd->elpe = 0;	/* max error log page entries */
552 	cd->npss = 1;	/* number of power states support */
553 
554 	/* Warning Composite Temperature Threshold */
555 	cd->wctemp = 0x0157;
556 	cd->cctemp = 0x0157;
557 
558 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
559 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
560 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
561 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
562 	cd->nn = 1;	/* number of namespaces */
563 
564 	cd->oncs = 0;
565 	switch (sc->dataset_management) {
566 	case NVME_DATASET_MANAGEMENT_AUTO:
567 		if (sc->nvstore.deallocate)
568 			cd->oncs |= NVME_ONCS_DSM;
569 		break;
570 	case NVME_DATASET_MANAGEMENT_ENABLE:
571 		cd->oncs |= NVME_ONCS_DSM;
572 		break;
573 	default:
574 		break;
575 	}
576 
577 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
578 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
579 
580 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
581 
582 	cd->power_state[0].mp = 10;
583 }
584 
585 /*
586  * Calculate the CRC-16 of the given buffer
587  * See copyright attribution at top of file
588  */
589 static uint16_t
590 crc16(uint16_t crc, const void *buffer, unsigned int len)
591 {
592 	const unsigned char *cp = buffer;
593 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
594 	static uint16_t const crc16_table[256] = {
595 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
596 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
597 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
598 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
599 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
600 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
601 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
602 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
603 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
604 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
605 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
606 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
607 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
608 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
609 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
610 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
611 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
612 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
613 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
614 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
615 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
616 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
617 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
618 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
619 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
620 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
621 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
622 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
623 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
624 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
625 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
626 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
627 	};
628 
629 	while (len--)
630 		crc = (((crc >> 8) & 0xffU) ^
631 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
632 	return crc;
633 }
634 
635 static void
636 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
637     struct nvme_namespace_data *nd)
638 {
639 
640 	/* Get capacity and block size information from backing store */
641 	nd->nsze = nvstore->size / nvstore->sectsz;
642 	nd->ncap = nd->nsze;
643 	nd->nuse = nd->nsze;
644 }
645 
646 static void
647 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
648     struct nvme_namespace_data *nd, uint32_t nsid,
649     struct pci_nvme_blockstore *nvstore)
650 {
651 
652 	pci_nvme_init_nsdata_size(nvstore, nd);
653 
654 	if (nvstore->type == NVME_STOR_BLOCKIF)
655 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
656 
657 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
658 	nd->flbas = 0;
659 
660 	/* Create an EUI-64 if user did not provide one */
661 	if (nvstore->eui64 == 0) {
662 		char *data = NULL;
663 		uint64_t eui64 = nvstore->eui64;
664 
665 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
666 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
667 		    sc->nsc_pi->pi_func);
668 
669 		if (data != NULL) {
670 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
671 			free(data);
672 		}
673 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
674 	}
675 	be64enc(nd->eui64, nvstore->eui64);
676 
677 	/* LBA data-sz = 2^lbads */
678 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
679 }
680 
681 static void
682 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
683 {
684 
685 	memset(&sc->err_log, 0, sizeof(sc->err_log));
686 	memset(&sc->health_log, 0, sizeof(sc->health_log));
687 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
688 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
689 
690 	/* Set read/write remainder to round up according to spec */
691 	sc->read_dunits_remainder = 999;
692 	sc->write_dunits_remainder = 999;
693 
694 	/* Set nominal Health values checked by implementations */
695 	sc->health_log.temperature = NVME_TEMPERATURE;
696 	sc->health_log.available_spare = 100;
697 	sc->health_log.available_spare_threshold = 10;
698 }
699 
700 static void
701 pci_nvme_init_features(struct pci_nvme_softc *sc)
702 {
703 	enum nvme_feature	fid;
704 
705 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
706 		switch (fid) {
707 		case NVME_FEAT_ARBITRATION:
708 		case NVME_FEAT_POWER_MANAGEMENT:
709 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
710 		case NVME_FEAT_WRITE_ATOMICITY:
711 			/* Mandatory but no special handling required */
712 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
713 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
714 		//		  this returns a data buffer
715 			break;
716 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
717 			sc->feat[fid].set = nvme_feature_temperature;
718 			break;
719 		case NVME_FEAT_ERROR_RECOVERY:
720 			sc->feat[fid].namespace_specific = true;
721 			break;
722 		case NVME_FEAT_NUMBER_OF_QUEUES:
723 			sc->feat[fid].set = nvme_feature_num_queues;
724 			break;
725 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
726 			sc->feat[fid].set = nvme_feature_iv_config;
727 			break;
728 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
729 			sc->feat[fid].set = nvme_feature_async_event;
730 			/* Enable all AENs by default */
731 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
732 			break;
733 		default:
734 			sc->feat[fid].set = nvme_feature_invalid_cb;
735 			sc->feat[fid].get = nvme_feature_invalid_cb;
736 		}
737 	}
738 }
739 
740 static void
741 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
742 {
743 
744 	STAILQ_INIT(&sc->aer_list);
745 	sc->aer_count = 0;
746 }
747 
748 static void
749 pci_nvme_aer_init(struct pci_nvme_softc *sc)
750 {
751 
752 	pthread_mutex_init(&sc->aer_mtx, NULL);
753 	pci_nvme_aer_reset(sc);
754 }
755 
756 static void
757 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
758 {
759 	struct pci_nvme_aer *aer = NULL;
760 
761 	pthread_mutex_lock(&sc->aer_mtx);
762 	while (!STAILQ_EMPTY(&sc->aer_list)) {
763 		aer = STAILQ_FIRST(&sc->aer_list);
764 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
765 		free(aer);
766 	}
767 	pthread_mutex_unlock(&sc->aer_mtx);
768 
769 	pci_nvme_aer_reset(sc);
770 }
771 
772 static bool
773 pci_nvme_aer_available(struct pci_nvme_softc *sc)
774 {
775 
776 	return (sc->aer_count != 0);
777 }
778 
779 static bool
780 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
781 {
782 	struct nvme_controller_data *cd = &sc->ctrldata;
783 
784 	/* AERL is a zero based value while aer_count is one's based */
785 	return (sc->aer_count == (cd->aerl + 1));
786 }
787 
788 /*
789  * Add an Async Event Request
790  *
791  * Stores an AER to be returned later if the Controller needs to notify the
792  * host of an event.
793  * Note that while the NVMe spec doesn't require Controllers to return AER's
794  * in order, this implementation does preserve the order.
795  */
796 static int
797 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
798 {
799 	struct pci_nvme_aer *aer = NULL;
800 
801 	aer = calloc(1, sizeof(struct pci_nvme_aer));
802 	if (aer == NULL)
803 		return (-1);
804 
805 	/* Save the Command ID for use in the completion message */
806 	aer->cid = cid;
807 
808 	pthread_mutex_lock(&sc->aer_mtx);
809 	sc->aer_count++;
810 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
811 	pthread_mutex_unlock(&sc->aer_mtx);
812 
813 	return (0);
814 }
815 
816 /*
817  * Get an Async Event Request structure
818  *
819  * Returns a pointer to an AER previously submitted by the host or NULL if
820  * no AER's exist. Caller is responsible for freeing the returned struct.
821  */
822 static struct pci_nvme_aer *
823 pci_nvme_aer_get(struct pci_nvme_softc *sc)
824 {
825 	struct pci_nvme_aer *aer = NULL;
826 
827 	pthread_mutex_lock(&sc->aer_mtx);
828 	aer = STAILQ_FIRST(&sc->aer_list);
829 	if (aer != NULL) {
830 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
831 		sc->aer_count--;
832 	}
833 	pthread_mutex_unlock(&sc->aer_mtx);
834 
835 	return (aer);
836 }
837 
838 static void
839 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
840 {
841 	uint32_t	atype;
842 
843 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
844 
845 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
846 		sc->aen[atype].atype = atype;
847 	}
848 }
849 
850 static void
851 pci_nvme_aen_init(struct pci_nvme_softc *sc)
852 {
853 	char nstr[80];
854 
855 	pci_nvme_aen_reset(sc);
856 
857 	pthread_mutex_init(&sc->aen_mtx, NULL);
858 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
859 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
860 	    sc->nsc_pi->pi_func);
861 	pthread_set_name_np(sc->aen_tid, nstr);
862 }
863 
864 static void
865 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
866 {
867 
868 	pci_nvme_aen_reset(sc);
869 }
870 
871 /* Notify the AEN thread of pending work */
872 static void
873 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
874 {
875 
876 	pthread_cond_signal(&sc->aen_cond);
877 }
878 
879 /*
880  * Post an Asynchronous Event Notification
881  */
882 static int32_t
883 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
884 		uint32_t event_data)
885 {
886 	struct pci_nvme_aen *aen;
887 
888 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
889 		return(EINVAL);
890 	}
891 
892 	pthread_mutex_lock(&sc->aen_mtx);
893 	aen = &sc->aen[atype];
894 
895 	/* Has the controller already posted an event of this type? */
896 	if (aen->posted) {
897 		pthread_mutex_unlock(&sc->aen_mtx);
898 		return(EALREADY);
899 	}
900 
901 	aen->event_data = event_data;
902 	aen->posted = true;
903 	pthread_mutex_unlock(&sc->aen_mtx);
904 
905 	pci_nvme_aen_notify(sc);
906 
907 	return(0);
908 }
909 
910 static void
911 pci_nvme_aen_process(struct pci_nvme_softc *sc)
912 {
913 	struct pci_nvme_aer *aer;
914 	struct pci_nvme_aen *aen;
915 	pci_nvme_async_type atype;
916 	uint32_t mask;
917 	uint16_t status;
918 	uint8_t lid;
919 
920 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
921 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
922 		aen = &sc->aen[atype];
923 		/* Previous iterations may have depleted the available AER's */
924 		if (!pci_nvme_aer_available(sc)) {
925 			DPRINTF("%s: no AER", __func__);
926 			break;
927 		}
928 
929 		if (!aen->posted) {
930 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
931 			continue;
932 		}
933 
934 		status = NVME_SC_SUCCESS;
935 
936 		/* Is the event masked? */
937 		mask =
938 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
939 
940 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
941 		switch (atype) {
942 		case PCI_NVME_AE_TYPE_ERROR:
943 			lid = NVME_LOG_ERROR;
944 			break;
945 		case PCI_NVME_AE_TYPE_SMART:
946 			mask &= 0xff;
947 			if ((mask & aen->event_data) == 0)
948 				continue;
949 			lid = NVME_LOG_HEALTH_INFORMATION;
950 			break;
951 		case PCI_NVME_AE_TYPE_NOTICE:
952 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
953 				EPRINTLN("%s unknown AEN notice type %u",
954 				    __func__, aen->event_data);
955 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
956 				break;
957 			}
958 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
959 				continue;
960 			switch (aen->event_data) {
961 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
962 				lid = NVME_LOG_CHANGED_NAMESPACE;
963 				break;
964 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
965 				lid = NVME_LOG_FIRMWARE_SLOT;
966 				break;
967 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
968 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
969 				break;
970 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
971 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
972 				break;
973 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
974 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
975 				break;
976 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
977 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
978 				break;
979 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
980 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
981 				break;
982 			default:
983 				lid = 0;
984 			}
985 			break;
986 		default:
987 			/* bad type?!? */
988 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
989 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
990 			break;
991 		}
992 
993 		aer = pci_nvme_aer_get(sc);
994 		assert(aer != NULL);
995 
996 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
997 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
998 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
999 		    aer->cid,
1000 		    0,		/* SQID */
1001 		    status);
1002 
1003 		aen->event_data = 0;
1004 		aen->posted = false;
1005 
1006 		pci_generate_msix(sc->nsc_pi, 0);
1007 	}
1008 }
1009 
1010 static void *
1011 aen_thr(void *arg)
1012 {
1013 	struct pci_nvme_softc *sc;
1014 
1015 	sc = arg;
1016 
1017 	pthread_mutex_lock(&sc->aen_mtx);
1018 	for (;;) {
1019 		pci_nvme_aen_process(sc);
1020 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1021 	}
1022 	pthread_mutex_unlock(&sc->aen_mtx);
1023 
1024 	pthread_exit(NULL);
1025 	return (NULL);
1026 }
1027 
1028 static void
1029 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1030 {
1031 	uint32_t i;
1032 
1033 	DPRINTF("%s", __func__);
1034 
1035 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1036 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1037 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1038 
1039 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1040 
1041 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1042 
1043 	sc->regs.cc = 0;
1044 
1045 	assert(sc->submit_queues != NULL);
1046 
1047 	for (i = 0; i < sc->num_squeues + 1; i++) {
1048 		sc->submit_queues[i].qbase = NULL;
1049 		sc->submit_queues[i].size = 0;
1050 		sc->submit_queues[i].cqid = 0;
1051 		sc->submit_queues[i].tail = 0;
1052 		sc->submit_queues[i].head = 0;
1053 	}
1054 
1055 	assert(sc->compl_queues != NULL);
1056 
1057 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1058 		sc->compl_queues[i].qbase = NULL;
1059 		sc->compl_queues[i].size = 0;
1060 		sc->compl_queues[i].tail = 0;
1061 		sc->compl_queues[i].head = 0;
1062 	}
1063 
1064 	sc->num_q_is_set = false;
1065 
1066 	pci_nvme_aer_destroy(sc);
1067 	pci_nvme_aen_destroy(sc);
1068 
1069 	/*
1070 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1071 	 * before cleanup completes
1072 	 */
1073 	sc->regs.csts = 0;
1074 }
1075 
1076 static void
1077 pci_nvme_reset(struct pci_nvme_softc *sc)
1078 {
1079 	pthread_mutex_lock(&sc->mtx);
1080 	pci_nvme_reset_locked(sc);
1081 	pthread_mutex_unlock(&sc->mtx);
1082 }
1083 
1084 static void
1085 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1086 {
1087 	uint16_t acqs, asqs;
1088 
1089 	DPRINTF("%s", __func__);
1090 
1091 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1092 	sc->submit_queues[0].size = asqs;
1093 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1094 	            sizeof(struct nvme_command) * asqs);
1095 
1096 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1097 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1098 
1099 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1100 	    NVME_AQA_REG_ACQS_MASK) + 1;
1101 	sc->compl_queues[0].size = acqs;
1102 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1103 	         sizeof(struct nvme_completion) * acqs);
1104 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1105 
1106 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1107 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1108 }
1109 
1110 static int
1111 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1112 	size_t len, enum nvme_copy_dir dir)
1113 {
1114 	uint8_t *p;
1115 	size_t bytes;
1116 
1117 	if (len > (8 * 1024)) {
1118 		return (-1);
1119 	}
1120 
1121 	/* Copy from the start of prp1 to the end of the physical page */
1122 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1123 	bytes = MIN(bytes, len);
1124 
1125 	p = vm_map_gpa(ctx, prp1, bytes);
1126 	if (p == NULL) {
1127 		return (-1);
1128 	}
1129 
1130 	if (dir == NVME_COPY_TO_PRP)
1131 		memcpy(p, b, bytes);
1132 	else
1133 		memcpy(b, p, bytes);
1134 
1135 	b += bytes;
1136 
1137 	len -= bytes;
1138 	if (len == 0) {
1139 		return (0);
1140 	}
1141 
1142 	len = MIN(len, PAGE_SIZE);
1143 
1144 	p = vm_map_gpa(ctx, prp2, len);
1145 	if (p == NULL) {
1146 		return (-1);
1147 	}
1148 
1149 	if (dir == NVME_COPY_TO_PRP)
1150 		memcpy(p, b, len);
1151 	else
1152 		memcpy(b, p, len);
1153 
1154 	return (0);
1155 }
1156 
1157 /*
1158  * Write a Completion Queue Entry update
1159  *
1160  * Write the completion and update the doorbell value
1161  */
1162 static void
1163 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1164 		struct nvme_completion_queue *cq,
1165 		uint32_t cdw0,
1166 		uint16_t cid,
1167 		uint16_t sqid,
1168 		uint16_t status)
1169 {
1170 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1171 	struct nvme_completion *cqe;
1172 
1173 	assert(cq->qbase != NULL);
1174 
1175 	pthread_mutex_lock(&cq->mtx);
1176 
1177 	cqe = &cq->qbase[cq->tail];
1178 
1179 	/* Flip the phase bit */
1180 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1181 
1182 	cqe->cdw0 = cdw0;
1183 	cqe->sqhd = sq->head;
1184 	cqe->sqid = sqid;
1185 	cqe->cid = cid;
1186 	cqe->status = status;
1187 
1188 	cq->tail++;
1189 	if (cq->tail >= cq->size) {
1190 		cq->tail = 0;
1191 	}
1192 
1193 	pthread_mutex_unlock(&cq->mtx);
1194 }
1195 
1196 static int
1197 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1198 	struct nvme_completion* compl)
1199 {
1200 	uint16_t qid = command->cdw10 & 0xffff;
1201 
1202 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1203 	if (qid == 0 || qid > sc->num_squeues ||
1204 	    (sc->submit_queues[qid].qbase == NULL)) {
1205 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1206 		        __func__, qid, sc->num_squeues);
1207 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1208 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1209 		return (1);
1210 	}
1211 
1212 	sc->submit_queues[qid].qbase = NULL;
1213 	sc->submit_queues[qid].cqid = 0;
1214 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1215 	return (1);
1216 }
1217 
1218 static int
1219 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1220 	struct nvme_completion* compl)
1221 {
1222 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1223 		uint16_t qid = command->cdw10 & 0xffff;
1224 		struct nvme_submission_queue *nsq;
1225 
1226 		if ((qid == 0) || (qid > sc->num_squeues) ||
1227 		    (sc->submit_queues[qid].qbase != NULL)) {
1228 			WPRINTF("%s queue index %u > num_squeues %u",
1229 			        __func__, qid, sc->num_squeues);
1230 			pci_nvme_status_tc(&compl->status,
1231 			    NVME_SCT_COMMAND_SPECIFIC,
1232 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1233 			return (1);
1234 		}
1235 
1236 		nsq = &sc->submit_queues[qid];
1237 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1238 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1239 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1240 			/*
1241 			 * Queues must specify at least two entries
1242 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1243 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1244 			 */
1245 			pci_nvme_status_tc(&compl->status,
1246 			    NVME_SCT_COMMAND_SPECIFIC,
1247 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1248 			return (1);
1249 		}
1250 		nsq->head = nsq->tail = 0;
1251 
1252 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1253 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1254 			pci_nvme_status_tc(&compl->status,
1255 			    NVME_SCT_COMMAND_SPECIFIC,
1256 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1257 			return (1);
1258 		}
1259 
1260 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1261 			pci_nvme_status_tc(&compl->status,
1262 			    NVME_SCT_COMMAND_SPECIFIC,
1263 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1264 			return (1);
1265 		}
1266 
1267 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1268 
1269 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1270 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1271 
1272 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1273 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1274 
1275 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1276 
1277 		DPRINTF("%s completed creating IOSQ qid %u",
1278 		         __func__, qid);
1279 	} else {
1280 		/*
1281 		 * Guest sent non-cont submission queue request.
1282 		 * This setting is unsupported by this emulation.
1283 		 */
1284 		WPRINTF("%s unsupported non-contig (list-based) "
1285 		         "create i/o submission queue", __func__);
1286 
1287 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1288 	}
1289 	return (1);
1290 }
1291 
1292 static int
1293 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1294 	struct nvme_completion* compl)
1295 {
1296 	uint16_t qid = command->cdw10 & 0xffff;
1297 	uint16_t sqid;
1298 
1299 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1300 	if (qid == 0 || qid > sc->num_cqueues ||
1301 	    (sc->compl_queues[qid].qbase == NULL)) {
1302 		WPRINTF("%s queue index %u / num_cqueues %u",
1303 		        __func__, qid, sc->num_cqueues);
1304 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1305 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1306 		return (1);
1307 	}
1308 
1309 	/* Deleting an Active CQ is an error */
1310 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1311 		if (sc->submit_queues[sqid].cqid == qid) {
1312 			pci_nvme_status_tc(&compl->status,
1313 			    NVME_SCT_COMMAND_SPECIFIC,
1314 			    NVME_SC_INVALID_QUEUE_DELETION);
1315 			return (1);
1316 		}
1317 
1318 	sc->compl_queues[qid].qbase = NULL;
1319 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1320 	return (1);
1321 }
1322 
1323 static int
1324 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1325 	struct nvme_completion* compl)
1326 {
1327 	struct nvme_completion_queue *ncq;
1328 	uint16_t qid = command->cdw10 & 0xffff;
1329 
1330 	/* Only support Physically Contiguous queues */
1331 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1332 		WPRINTF("%s unsupported non-contig (list-based) "
1333 		         "create i/o completion queue",
1334 		         __func__);
1335 
1336 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1337 		return (1);
1338 	}
1339 
1340 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1341 	    (sc->compl_queues[qid].qbase != NULL)) {
1342 		WPRINTF("%s queue index %u > num_cqueues %u",
1343 			__func__, qid, sc->num_cqueues);
1344 		pci_nvme_status_tc(&compl->status,
1345 		    NVME_SCT_COMMAND_SPECIFIC,
1346 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1347 		return (1);
1348  	}
1349 
1350 	ncq = &sc->compl_queues[qid];
1351 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1352 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1353 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1354 		pci_nvme_status_tc(&compl->status,
1355 		    NVME_SCT_COMMAND_SPECIFIC,
1356 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1357 		return (1);
1358 	}
1359 
1360 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1361 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1362 		/*
1363 		 * Queues must specify at least two entries
1364 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1365 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1366 		 */
1367 		pci_nvme_status_tc(&compl->status,
1368 		    NVME_SCT_COMMAND_SPECIFIC,
1369 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1370 		return (1);
1371 	}
1372 	ncq->head = ncq->tail = 0;
1373 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1374 		     command->prp1,
1375 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1376 
1377 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1378 
1379 
1380 	return (1);
1381 }
1382 
1383 static int
1384 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1385 	struct nvme_completion* compl)
1386 {
1387 	uint64_t logoff;
1388 	uint32_t logsize;
1389 	uint8_t logpage = command->cdw10 & 0xFF;
1390 
1391 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1392 
1393 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1394 
1395 	/*
1396 	 * Command specifies the number of dwords to return in fields NUMDU
1397 	 * and NUMDL. This is a zero-based value.
1398 	 */
1399 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1400 	logsize *= sizeof(uint32_t);
1401 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1402 
1403 	switch (logpage) {
1404 	case NVME_LOG_ERROR:
1405 		if (logoff >= sizeof(sc->err_log)) {
1406 			pci_nvme_status_genc(&compl->status,
1407 			    NVME_SC_INVALID_FIELD);
1408 			break;
1409 		}
1410 
1411 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1412 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1413 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1414 		    NVME_COPY_TO_PRP);
1415 		break;
1416 	case NVME_LOG_HEALTH_INFORMATION:
1417 		if (logoff >= sizeof(sc->health_log)) {
1418 			pci_nvme_status_genc(&compl->status,
1419 			    NVME_SC_INVALID_FIELD);
1420 			break;
1421 		}
1422 
1423 		pthread_mutex_lock(&sc->mtx);
1424 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1425 		    sizeof(sc->health_log.data_units_read));
1426 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1427 		    sizeof(sc->health_log.data_units_written));
1428 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1429 		    sizeof(sc->health_log.host_read_commands));
1430 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1431 		    sizeof(sc->health_log.host_write_commands));
1432 		pthread_mutex_unlock(&sc->mtx);
1433 
1434 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1435 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1436 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1437 		    NVME_COPY_TO_PRP);
1438 		break;
1439 	case NVME_LOG_FIRMWARE_SLOT:
1440 		if (logoff >= sizeof(sc->fw_log)) {
1441 			pci_nvme_status_genc(&compl->status,
1442 			    NVME_SC_INVALID_FIELD);
1443 			break;
1444 		}
1445 
1446 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1447 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1448 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1449 		    NVME_COPY_TO_PRP);
1450 		break;
1451 	case NVME_LOG_CHANGED_NAMESPACE:
1452 		if (logoff >= sizeof(sc->ns_log)) {
1453 			pci_nvme_status_genc(&compl->status,
1454 			    NVME_SC_INVALID_FIELD);
1455 			break;
1456 		}
1457 
1458 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1459 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1460 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1461 		    NVME_COPY_TO_PRP);
1462 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1463 		break;
1464 	default:
1465 		DPRINTF("%s get log page %x command not supported",
1466 		        __func__, logpage);
1467 
1468 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1469 		    NVME_SC_INVALID_LOG_PAGE);
1470 	}
1471 
1472 	return (1);
1473 }
1474 
1475 static int
1476 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1477 	struct nvme_completion* compl)
1478 {
1479 	void *dest;
1480 	uint16_t status;
1481 
1482 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1483 	        command->cdw10 & 0xFF, command->nsid);
1484 
1485 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1486 
1487 	switch (command->cdw10 & 0xFF) {
1488 	case 0x00: /* return Identify Namespace data structure */
1489 		/* Global NS only valid with NS Management */
1490 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1491 			pci_nvme_status_genc(&status,
1492 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1493 			break;
1494 		}
1495 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1496 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1497 		    NVME_COPY_TO_PRP);
1498 		break;
1499 	case 0x01: /* return Identify Controller data structure */
1500 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1501 		    command->prp2, (uint8_t *)&sc->ctrldata,
1502 		    sizeof(sc->ctrldata),
1503 		    NVME_COPY_TO_PRP);
1504 		break;
1505 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1506 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1507 		                  sizeof(uint32_t) * 1024);
1508 		/* All unused entries shall be zero */
1509 		bzero(dest, sizeof(uint32_t) * 1024);
1510 		((uint32_t *)dest)[0] = 1;
1511 		break;
1512 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1513 		if (command->nsid != 1) {
1514 			pci_nvme_status_genc(&status,
1515 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1516 			break;
1517 		}
1518 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1519 		                  sizeof(uint32_t) * 1024);
1520 		/* All bytes after the descriptor shall be zero */
1521 		bzero(dest, sizeof(uint32_t) * 1024);
1522 
1523 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1524 		((uint8_t *)dest)[0] = 1;
1525 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1526 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1527 		break;
1528 	default:
1529 		DPRINTF("%s unsupported identify command requested 0x%x",
1530 		         __func__, command->cdw10 & 0xFF);
1531 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1532 		break;
1533 	}
1534 
1535 	compl->status = status;
1536 	return (1);
1537 }
1538 
1539 static const char *
1540 nvme_fid_to_name(uint8_t fid)
1541 {
1542 	const char *name;
1543 
1544 	switch (fid) {
1545 	case NVME_FEAT_ARBITRATION:
1546 		name = "Arbitration";
1547 		break;
1548 	case NVME_FEAT_POWER_MANAGEMENT:
1549 		name = "Power Management";
1550 		break;
1551 	case NVME_FEAT_LBA_RANGE_TYPE:
1552 		name = "LBA Range Type";
1553 		break;
1554 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1555 		name = "Temperature Threshold";
1556 		break;
1557 	case NVME_FEAT_ERROR_RECOVERY:
1558 		name = "Error Recovery";
1559 		break;
1560 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1561 		name = "Volatile Write Cache";
1562 		break;
1563 	case NVME_FEAT_NUMBER_OF_QUEUES:
1564 		name = "Number of Queues";
1565 		break;
1566 	case NVME_FEAT_INTERRUPT_COALESCING:
1567 		name = "Interrupt Coalescing";
1568 		break;
1569 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1570 		name = "Interrupt Vector Configuration";
1571 		break;
1572 	case NVME_FEAT_WRITE_ATOMICITY:
1573 		name = "Write Atomicity Normal";
1574 		break;
1575 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1576 		name = "Asynchronous Event Configuration";
1577 		break;
1578 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1579 		name = "Autonomous Power State Transition";
1580 		break;
1581 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1582 		name = "Host Memory Buffer";
1583 		break;
1584 	case NVME_FEAT_TIMESTAMP:
1585 		name = "Timestamp";
1586 		break;
1587 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1588 		name = "Keep Alive Timer";
1589 		break;
1590 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1591 		name = "Host Controlled Thermal Management";
1592 		break;
1593 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1594 		name = "Non-Operation Power State Config";
1595 		break;
1596 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1597 		name = "Read Recovery Level Config";
1598 		break;
1599 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1600 		name = "Predictable Latency Mode Config";
1601 		break;
1602 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1603 		name = "Predictable Latency Mode Window";
1604 		break;
1605 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1606 		name = "LBA Status Information Report Interval";
1607 		break;
1608 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1609 		name = "Host Behavior Support";
1610 		break;
1611 	case NVME_FEAT_SANITIZE_CONFIG:
1612 		name = "Sanitize Config";
1613 		break;
1614 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1615 		name = "Endurance Group Event Configuration";
1616 		break;
1617 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1618 		name = "Software Progress Marker";
1619 		break;
1620 	case NVME_FEAT_HOST_IDENTIFIER:
1621 		name = "Host Identifier";
1622 		break;
1623 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1624 		name = "Reservation Notification Mask";
1625 		break;
1626 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1627 		name = "Reservation Persistence";
1628 		break;
1629 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1630 		name = "Namespace Write Protection Config";
1631 		break;
1632 	default:
1633 		name = "Unknown";
1634 		break;
1635 	}
1636 
1637 	return (name);
1638 }
1639 
1640 static void
1641 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1642     struct nvme_feature_obj *feat,
1643     struct nvme_command *command,
1644     struct nvme_completion *compl)
1645 {
1646 
1647 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1648 }
1649 
1650 static void
1651 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1652     struct nvme_feature_obj *feat,
1653     struct nvme_command *command,
1654     struct nvme_completion *compl)
1655 {
1656 	uint32_t i;
1657 	uint32_t cdw11 = command->cdw11;
1658 	uint16_t iv;
1659 	bool cd;
1660 
1661 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1662 
1663 	iv = cdw11 & 0xffff;
1664 	cd = cdw11 & (1 << 16);
1665 
1666 	if (iv > (sc->max_queues + 1)) {
1667 		return;
1668 	}
1669 
1670 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1671 	if ((iv == 0) && !cd)
1672 		return;
1673 
1674 	/* Requested Interrupt Vector must be used by a CQ */
1675 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1676 		if (sc->compl_queues[i].intr_vec == iv) {
1677 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1678 		}
1679 	}
1680 }
1681 
1682 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1683 static void
1684 nvme_feature_async_event(struct pci_nvme_softc *sc,
1685     struct nvme_feature_obj *feat,
1686     struct nvme_command *command,
1687     struct nvme_completion *compl)
1688 {
1689 
1690 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1691 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1692 }
1693 
1694 #define NVME_TEMP_THRESH_OVER	0
1695 #define NVME_TEMP_THRESH_UNDER	1
1696 static void
1697 nvme_feature_temperature(struct pci_nvme_softc *sc,
1698     struct nvme_feature_obj *feat,
1699     struct nvme_command *command,
1700     struct nvme_completion *compl)
1701 {
1702 	uint16_t	tmpth;	/* Temperature Threshold */
1703 	uint8_t		tmpsel; /* Threshold Temperature Select */
1704 	uint8_t		thsel;  /* Threshold Type Select */
1705 	bool		set_crit = false;
1706 
1707 	tmpth  = command->cdw11 & 0xffff;
1708 	tmpsel = (command->cdw11 >> 16) & 0xf;
1709 	thsel  = (command->cdw11 >> 20) & 0x3;
1710 
1711 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1712 
1713 	/* Check for unsupported values */
1714 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1715 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1716 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1717 		return;
1718 	}
1719 
1720 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1721 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1722 		set_crit = true;
1723 
1724 	pthread_mutex_lock(&sc->mtx);
1725 	if (set_crit)
1726 		sc->health_log.critical_warning |=
1727 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1728 	else
1729 		sc->health_log.critical_warning &=
1730 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1731 	pthread_mutex_unlock(&sc->mtx);
1732 
1733 	if (set_crit)
1734 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1735 		    sc->health_log.critical_warning);
1736 
1737 
1738 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1739 }
1740 
1741 static void
1742 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1743     struct nvme_feature_obj *feat,
1744     struct nvme_command *command,
1745     struct nvme_completion *compl)
1746 {
1747 	uint16_t nqr;	/* Number of Queues Requested */
1748 
1749 	if (sc->num_q_is_set) {
1750 		WPRINTF("%s: Number of Queues already set", __func__);
1751 		pci_nvme_status_genc(&compl->status,
1752 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1753 		return;
1754 	}
1755 
1756 	nqr = command->cdw11 & 0xFFFF;
1757 	if (nqr == 0xffff) {
1758 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1759 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1760 		return;
1761 	}
1762 
1763 	sc->num_squeues = ONE_BASED(nqr);
1764 	if (sc->num_squeues > sc->max_queues) {
1765 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1766 					sc->max_queues);
1767 		sc->num_squeues = sc->max_queues;
1768 	}
1769 
1770 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1771 	if (nqr == 0xffff) {
1772 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1773 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1774 		return;
1775 	}
1776 
1777 	sc->num_cqueues = ONE_BASED(nqr);
1778 	if (sc->num_cqueues > sc->max_queues) {
1779 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1780 					sc->max_queues);
1781 		sc->num_cqueues = sc->max_queues;
1782 	}
1783 
1784 	/* Patch the command value which will be saved on callback's return */
1785 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1786 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1787 
1788 	sc->num_q_is_set = true;
1789 }
1790 
1791 static int
1792 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1793 	struct nvme_completion *compl)
1794 {
1795 	struct nvme_feature_obj *feat;
1796 	uint32_t nsid = command->nsid;
1797 	uint8_t fid = command->cdw10 & 0xFF;
1798 
1799 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1800 
1801 	if (fid >= NVME_FID_MAX) {
1802 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1803 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1804 		return (1);
1805 	}
1806 	feat = &sc->feat[fid];
1807 
1808 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1809 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1810 		return (1);
1811 	}
1812 
1813 	if (!feat->namespace_specific &&
1814 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1815 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1816 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1817 		return (1);
1818 	}
1819 
1820 	compl->cdw0 = 0;
1821 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1822 
1823 	if (feat->set)
1824 		feat->set(sc, feat, command, compl);
1825 
1826 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1827 	if (compl->status == NVME_SC_SUCCESS) {
1828 		feat->cdw11 = command->cdw11;
1829 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1830 		    (command->cdw11 != 0))
1831 			pci_nvme_aen_notify(sc);
1832 	}
1833 
1834 	return (0);
1835 }
1836 
1837 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1838 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1839 
1840 static int
1841 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1842 	struct nvme_completion* compl)
1843 {
1844 	struct nvme_feature_obj *feat;
1845 	uint8_t fid = command->cdw10 & 0xFF;
1846 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1847 
1848 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1849 
1850 	if (fid >= NVME_FID_MAX) {
1851 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1852 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1853 		return (1);
1854 	}
1855 
1856 	compl->cdw0 = 0;
1857 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1858 
1859 	feat = &sc->feat[fid];
1860 	if (feat->get) {
1861 		feat->get(sc, feat, command, compl);
1862 	}
1863 
1864 	if (compl->status == NVME_SC_SUCCESS) {
1865 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1866 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1867 		else
1868 			compl->cdw0 = feat->cdw11;
1869 	}
1870 
1871 	return (0);
1872 }
1873 
1874 static int
1875 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1876 	struct nvme_completion* compl)
1877 {
1878 	uint8_t	ses, lbaf, pi;
1879 
1880 	/* Only supports Secure Erase Setting - User Data Erase */
1881 	ses = (command->cdw10 >> 9) & 0x7;
1882 	if (ses > 0x1) {
1883 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1884 		return (1);
1885 	}
1886 
1887 	/* Only supports a single LBA Format */
1888 	lbaf = command->cdw10 & 0xf;
1889 	if (lbaf != 0) {
1890 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1891 		    NVME_SC_INVALID_FORMAT);
1892 		return (1);
1893 	}
1894 
1895 	/* Doesn't support Protection Infomation */
1896 	pi = (command->cdw10 >> 5) & 0x7;
1897 	if (pi != 0) {
1898 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1899 		return (1);
1900 	}
1901 
1902 	if (sc->nvstore.type == NVME_STOR_RAM) {
1903 		if (sc->nvstore.ctx)
1904 			free(sc->nvstore.ctx);
1905 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1906 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1907 	} else {
1908 		struct pci_nvme_ioreq *req;
1909 		int err;
1910 
1911 		req = pci_nvme_get_ioreq(sc);
1912 		if (req == NULL) {
1913 			pci_nvme_status_genc(&compl->status,
1914 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1915 			WPRINTF("%s: unable to allocate IO req", __func__);
1916 			return (1);
1917 		}
1918 		req->nvme_sq = &sc->submit_queues[0];
1919 		req->sqid = 0;
1920 		req->opc = command->opc;
1921 		req->cid = command->cid;
1922 		req->nsid = command->nsid;
1923 
1924 		req->io_req.br_offset = 0;
1925 		req->io_req.br_resid = sc->nvstore.size;
1926 		req->io_req.br_callback = pci_nvme_io_done;
1927 
1928 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1929 		if (err) {
1930 			pci_nvme_status_genc(&compl->status,
1931 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1932 			pci_nvme_release_ioreq(sc, req);
1933 		} else
1934 			compl->status = NVME_NO_STATUS;
1935 	}
1936 
1937 	return (1);
1938 }
1939 
1940 static int
1941 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1942 	struct nvme_completion* compl)
1943 {
1944 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1945 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1946 
1947 	/* TODO: search for the command ID and abort it */
1948 
1949 	compl->cdw0 = 1;
1950 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1951 	return (1);
1952 }
1953 
1954 static int
1955 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1956 	struct nvme_command* command, struct nvme_completion* compl)
1957 {
1958 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1959 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1960 
1961 	/* Don't exceed the Async Event Request Limit (AERL). */
1962 	if (pci_nvme_aer_limit_reached(sc)) {
1963 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1964 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1965 		return (1);
1966 	}
1967 
1968 	if (pci_nvme_aer_add(sc, command->cid)) {
1969 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1970 				NVME_SC_INTERNAL_DEVICE_ERROR);
1971 		return (1);
1972 	}
1973 
1974 	/*
1975 	 * Raise events when they happen based on the Set Features cmd.
1976 	 * These events happen async, so only set completion successful if
1977 	 * there is an event reflective of the request to get event.
1978 	 */
1979 	compl->status = NVME_NO_STATUS;
1980 	pci_nvme_aen_notify(sc);
1981 
1982 	return (0);
1983 }
1984 
1985 static void
1986 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1987 {
1988 	struct nvme_completion compl;
1989 	struct nvme_command *cmd;
1990 	struct nvme_submission_queue *sq;
1991 	struct nvme_completion_queue *cq;
1992 	uint16_t sqhead;
1993 
1994 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1995 
1996 	sq = &sc->submit_queues[0];
1997 	cq = &sc->compl_queues[0];
1998 
1999 	pthread_mutex_lock(&sq->mtx);
2000 
2001 	sqhead = sq->head;
2002 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2003 
2004 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2005 		cmd = &(sq->qbase)[sqhead];
2006 		compl.cdw0 = 0;
2007 		compl.status = 0;
2008 
2009 		switch (cmd->opc) {
2010 		case NVME_OPC_DELETE_IO_SQ:
2011 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2012 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2013 			break;
2014 		case NVME_OPC_CREATE_IO_SQ:
2015 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2016 			nvme_opc_create_io_sq(sc, cmd, &compl);
2017 			break;
2018 		case NVME_OPC_DELETE_IO_CQ:
2019 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2020 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2021 			break;
2022 		case NVME_OPC_CREATE_IO_CQ:
2023 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2024 			nvme_opc_create_io_cq(sc, cmd, &compl);
2025 			break;
2026 		case NVME_OPC_GET_LOG_PAGE:
2027 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2028 			nvme_opc_get_log_page(sc, cmd, &compl);
2029 			break;
2030 		case NVME_OPC_IDENTIFY:
2031 			DPRINTF("%s command IDENTIFY", __func__);
2032 			nvme_opc_identify(sc, cmd, &compl);
2033 			break;
2034 		case NVME_OPC_ABORT:
2035 			DPRINTF("%s command ABORT", __func__);
2036 			nvme_opc_abort(sc, cmd, &compl);
2037 			break;
2038 		case NVME_OPC_SET_FEATURES:
2039 			DPRINTF("%s command SET_FEATURES", __func__);
2040 			nvme_opc_set_features(sc, cmd, &compl);
2041 			break;
2042 		case NVME_OPC_GET_FEATURES:
2043 			DPRINTF("%s command GET_FEATURES", __func__);
2044 			nvme_opc_get_features(sc, cmd, &compl);
2045 			break;
2046 		case NVME_OPC_FIRMWARE_ACTIVATE:
2047 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2048 			pci_nvme_status_tc(&compl.status,
2049 			    NVME_SCT_COMMAND_SPECIFIC,
2050 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2051 			break;
2052 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2053 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2054 			nvme_opc_async_event_req(sc, cmd, &compl);
2055 			break;
2056 		case NVME_OPC_FORMAT_NVM:
2057 			DPRINTF("%s command FORMAT_NVM", __func__);
2058 			if ((sc->ctrldata.oacs &
2059 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2060 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2061 				break;
2062 			}
2063 			nvme_opc_format_nvm(sc, cmd, &compl);
2064 			break;
2065 		case NVME_OPC_SECURITY_SEND:
2066 		case NVME_OPC_SECURITY_RECEIVE:
2067 		case NVME_OPC_SANITIZE:
2068 		case NVME_OPC_GET_LBA_STATUS:
2069 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2070 			    cmd->opc);
2071 			/* Valid but unsupported opcodes */
2072 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2073 			break;
2074 		default:
2075 			DPRINTF("%s command OPC=%#X (not implemented)",
2076 			    __func__,
2077 			    cmd->opc);
2078 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2079 		}
2080 		sqhead = (sqhead + 1) % sq->size;
2081 
2082 		if (NVME_COMPLETION_VALID(compl)) {
2083 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2084 			    compl.cdw0,
2085 			    cmd->cid,
2086 			    0,		/* SQID */
2087 			    compl.status);
2088 		}
2089 	}
2090 
2091 	DPRINTF("setting sqhead %u", sqhead);
2092 	sq->head = sqhead;
2093 
2094 	if (cq->head != cq->tail)
2095 		pci_generate_msix(sc->nsc_pi, 0);
2096 
2097 	pthread_mutex_unlock(&sq->mtx);
2098 }
2099 
2100 /*
2101  * Update the Write and Read statistics reported in SMART data
2102  *
2103  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
2104  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2105  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
2106  */
2107 static void
2108 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2109     size_t bytes, uint16_t status)
2110 {
2111 
2112 	pthread_mutex_lock(&sc->mtx);
2113 	switch (opc) {
2114 	case NVME_OPC_WRITE:
2115 		sc->write_commands++;
2116 		if (status != NVME_SC_SUCCESS)
2117 			break;
2118 		sc->write_dunits_remainder += (bytes / 512);
2119 		while (sc->write_dunits_remainder >= 1000) {
2120 			sc->write_data_units++;
2121 			sc->write_dunits_remainder -= 1000;
2122 		}
2123 		break;
2124 	case NVME_OPC_READ:
2125 		sc->read_commands++;
2126 		if (status != NVME_SC_SUCCESS)
2127 			break;
2128 		sc->read_dunits_remainder += (bytes / 512);
2129 		while (sc->read_dunits_remainder >= 1000) {
2130 			sc->read_data_units++;
2131 			sc->read_dunits_remainder -= 1000;
2132 		}
2133 		break;
2134 	default:
2135 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2136 		break;
2137 	}
2138 	pthread_mutex_unlock(&sc->mtx);
2139 }
2140 
2141 /*
2142  * Check if the combination of Starting LBA (slba) and number of blocks
2143  * exceeds the range of the underlying storage.
2144  *
2145  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2146  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2147  * overflow.
2148  */
2149 static bool
2150 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2151     uint32_t nblocks)
2152 {
2153 	size_t	offset, bytes;
2154 
2155 	/* Overflow check of multiplying Starting LBA by the sector size */
2156 	if (slba >> (64 - nvstore->sectsz_bits))
2157 		return (true);
2158 
2159 	offset = slba << nvstore->sectsz_bits;
2160 	bytes = nblocks << nvstore->sectsz_bits;
2161 
2162 	/* Overflow check of Number of Logical Blocks */
2163 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2164 		return (true);
2165 
2166 	return (false);
2167 }
2168 
2169 static int
2170 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2171 	uint64_t gpaddr, size_t size, int do_write, uint64_t offset)
2172 {
2173 	int iovidx;
2174 	bool range_is_contiguous;
2175 
2176 	if (req == NULL)
2177 		return (-1);
2178 
2179 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2180 		return (-1);
2181 	}
2182 
2183 	/*
2184 	 * Minimize the number of IOVs by concatenating contiguous address
2185 	 * ranges. If the IOV count is zero, there is no previous range to
2186 	 * concatenate.
2187 	 */
2188 	if (req->io_req.br_iovcnt == 0)
2189 		range_is_contiguous = false;
2190 	else
2191 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2192 
2193 	if (range_is_contiguous) {
2194 		iovidx = req->io_req.br_iovcnt - 1;
2195 
2196 		req->io_req.br_iov[iovidx].iov_base =
2197 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2198 				     req->prev_gpaddr, size);
2199 
2200 		req->prev_size += size;
2201 		req->io_req.br_resid += size;
2202 
2203 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2204 	} else {
2205 		iovidx = req->io_req.br_iovcnt;
2206 		if (iovidx == 0) {
2207 			req->io_req.br_offset = offset;
2208 			req->io_req.br_resid = 0;
2209 			req->io_req.br_param = req;
2210 		}
2211 
2212 		req->io_req.br_iov[iovidx].iov_base =
2213 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2214 				     gpaddr, size);
2215 
2216 		req->io_req.br_iov[iovidx].iov_len = size;
2217 
2218 		req->prev_gpaddr = gpaddr;
2219 		req->prev_size = size;
2220 		req->io_req.br_resid += size;
2221 
2222 		req->io_req.br_iovcnt++;
2223 	}
2224 
2225 	return (0);
2226 }
2227 
2228 static void
2229 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2230 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2231 	uint32_t cdw0, uint16_t status)
2232 {
2233 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2234 
2235 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2236 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2237 		 NVME_STATUS_GET_SC(status));
2238 
2239 	pci_nvme_cq_update(sc, cq,
2240 	    0,		/* CDW0 */
2241 	    cid,
2242 	    sqid,
2243 	    status);
2244 
2245 	if (cq->head != cq->tail) {
2246 		if (cq->intr_en & NVME_CQ_INTEN) {
2247 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2248 		} else {
2249 			DPRINTF("%s: CQ%u interrupt disabled",
2250 						__func__, sq->cqid);
2251 		}
2252 	}
2253 }
2254 
2255 static void
2256 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2257 {
2258 	req->sc = NULL;
2259 	req->nvme_sq = NULL;
2260 	req->sqid = 0;
2261 
2262 	pthread_mutex_lock(&sc->mtx);
2263 
2264 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2265 	sc->pending_ios--;
2266 
2267 	/* when no more IO pending, can set to ready if device reset/enabled */
2268 	if (sc->pending_ios == 0 &&
2269 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2270 		sc->regs.csts |= NVME_CSTS_RDY;
2271 
2272 	pthread_mutex_unlock(&sc->mtx);
2273 
2274 	sem_post(&sc->iosemlock);
2275 }
2276 
2277 static struct pci_nvme_ioreq *
2278 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2279 {
2280 	struct pci_nvme_ioreq *req = NULL;
2281 
2282 	sem_wait(&sc->iosemlock);
2283 	pthread_mutex_lock(&sc->mtx);
2284 
2285 	req = STAILQ_FIRST(&sc->ioreqs_free);
2286 	assert(req != NULL);
2287 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2288 
2289 	req->sc = sc;
2290 
2291 	sc->pending_ios++;
2292 
2293 	pthread_mutex_unlock(&sc->mtx);
2294 
2295 	req->io_req.br_iovcnt = 0;
2296 	req->io_req.br_offset = 0;
2297 	req->io_req.br_resid = 0;
2298 	req->io_req.br_param = req;
2299 	req->prev_gpaddr = 0;
2300 	req->prev_size = 0;
2301 
2302 	return req;
2303 }
2304 
2305 static void
2306 pci_nvme_io_done(struct blockif_req *br, int err)
2307 {
2308 	struct pci_nvme_ioreq *req = br->br_param;
2309 	struct nvme_submission_queue *sq = req->nvme_sq;
2310 	uint16_t code, status;
2311 
2312 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2313 
2314 	/* TODO return correct error */
2315 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2316 	pci_nvme_status_genc(&status, code);
2317 
2318 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2319 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2320 	    req->bytes, status);
2321 	pci_nvme_release_ioreq(req->sc, req);
2322 }
2323 
2324 /*
2325  * Implements the Flush command. The specification states:
2326  *    If a volatile write cache is not present, Flush commands complete
2327  *    successfully and have no effect
2328  * in the description of the Volatile Write Cache (VWC) field of the Identify
2329  * Controller data. Therefore, set status to Success if the command is
2330  * not supported (i.e. RAM or as indicated by the blockif).
2331  */
2332 static bool
2333 nvme_opc_flush(struct pci_nvme_softc *sc,
2334     struct nvme_command *cmd,
2335     struct pci_nvme_blockstore *nvstore,
2336     struct pci_nvme_ioreq *req,
2337     uint16_t *status)
2338 {
2339 	bool pending = false;
2340 
2341 	if (nvstore->type == NVME_STOR_RAM) {
2342 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2343 	} else {
2344 		int err;
2345 
2346 		req->io_req.br_callback = pci_nvme_io_done;
2347 
2348 		err = blockif_flush(nvstore->ctx, &req->io_req);
2349 		switch (err) {
2350 		case 0:
2351 			pending = true;
2352 			break;
2353 		case EOPNOTSUPP:
2354 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2355 			break;
2356 		default:
2357 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2358 		}
2359 	}
2360 
2361 	return (pending);
2362 }
2363 
2364 static uint16_t
2365 nvme_write_read_ram(struct pci_nvme_softc *sc,
2366     struct pci_nvme_blockstore *nvstore,
2367     uint64_t prp1, uint64_t prp2,
2368     size_t offset, uint64_t bytes,
2369     bool is_write)
2370 {
2371 	uint8_t *buf = nvstore->ctx;
2372 	enum nvme_copy_dir dir;
2373 	uint16_t status;
2374 
2375 	if (is_write)
2376 		dir = NVME_COPY_TO_PRP;
2377 	else
2378 		dir = NVME_COPY_FROM_PRP;
2379 
2380 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2381 	    buf + offset, bytes, dir))
2382 		pci_nvme_status_genc(&status,
2383 		    NVME_SC_DATA_TRANSFER_ERROR);
2384 	else
2385 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2386 
2387 	return (status);
2388 }
2389 
2390 static uint16_t
2391 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2392     struct pci_nvme_blockstore *nvstore,
2393     struct pci_nvme_ioreq *req,
2394     uint64_t prp1, uint64_t prp2,
2395     size_t offset, uint64_t bytes,
2396     bool is_write)
2397 {
2398 	uint64_t size;
2399 	int err;
2400 	uint16_t status = NVME_NO_STATUS;
2401 
2402 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2403 	if (pci_nvme_append_iov_req(sc, req, prp1,
2404 	    size, is_write, offset)) {
2405 		pci_nvme_status_genc(&status,
2406 		    NVME_SC_DATA_TRANSFER_ERROR);
2407 		goto out;
2408 	}
2409 
2410 	offset += size;
2411 	bytes  -= size;
2412 
2413 	if (bytes == 0) {
2414 		;
2415 	} else if (bytes <= PAGE_SIZE) {
2416 		size = bytes;
2417 		if (pci_nvme_append_iov_req(sc, req, prp2,
2418 		    size, is_write, offset)) {
2419 			pci_nvme_status_genc(&status,
2420 			    NVME_SC_DATA_TRANSFER_ERROR);
2421 			goto out;
2422 		}
2423 	} else {
2424 		void *vmctx = sc->nsc_pi->pi_vmctx;
2425 		uint64_t *prp_list = &prp2;
2426 		uint64_t *last = prp_list;
2427 
2428 		/* PRP2 is pointer to a physical region page list */
2429 		while (bytes) {
2430 			/* Last entry in list points to the next list */
2431 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2432 				uint64_t prp = *prp_list;
2433 
2434 				prp_list = paddr_guest2host(vmctx, prp,
2435 				    PAGE_SIZE - (prp % PAGE_SIZE));
2436 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2437 			}
2438 
2439 			size = MIN(bytes, PAGE_SIZE);
2440 
2441 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2442 			    size, is_write, offset)) {
2443 				pci_nvme_status_genc(&status,
2444 				    NVME_SC_DATA_TRANSFER_ERROR);
2445 				goto out;
2446 			}
2447 
2448 			offset += size;
2449 			bytes  -= size;
2450 
2451 			prp_list++;
2452 		}
2453 	}
2454 	req->io_req.br_callback = pci_nvme_io_done;
2455 	if (is_write)
2456 		err = blockif_write(nvstore->ctx, &req->io_req);
2457 	else
2458 		err = blockif_read(nvstore->ctx, &req->io_req);
2459 
2460 	if (err)
2461 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2462 out:
2463 	return (status);
2464 }
2465 
2466 static bool
2467 nvme_opc_write_read(struct pci_nvme_softc *sc,
2468     struct nvme_command *cmd,
2469     struct pci_nvme_blockstore *nvstore,
2470     struct pci_nvme_ioreq *req,
2471     uint16_t *status)
2472 {
2473 	uint64_t lba, nblocks, bytes;
2474 	size_t offset;
2475 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2476 	bool pending = false;
2477 
2478 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2479 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2480 
2481 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2482 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2483 		    __func__, lba, nblocks);
2484 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2485 		goto out;
2486 	}
2487 
2488 	bytes  = nblocks << nvstore->sectsz_bits;
2489 	if (bytes > NVME_MAX_DATA_SIZE) {
2490 		WPRINTF("%s command would exceed MDTS", __func__);
2491 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2492 		goto out;
2493 	}
2494 
2495 	offset = lba << nvstore->sectsz_bits;
2496 
2497 	req->bytes = bytes;
2498 	req->io_req.br_offset = lba;
2499 
2500 	/* PRP bits 1:0 must be zero */
2501 	cmd->prp1 &= ~0x3UL;
2502 	cmd->prp2 &= ~0x3UL;
2503 
2504 	if (nvstore->type == NVME_STOR_RAM) {
2505 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2506 		    cmd->prp2, offset, bytes, is_write);
2507 	} else {
2508 		*status = nvme_write_read_blockif(sc, nvstore, req,
2509 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2510 
2511 		if (*status == NVME_NO_STATUS)
2512 			pending = true;
2513 	}
2514 out:
2515 	if (!pending)
2516 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2517 
2518 	return (pending);
2519 }
2520 
2521 static void
2522 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2523 {
2524 	struct pci_nvme_ioreq *req = br->br_param;
2525 	struct pci_nvme_softc *sc = req->sc;
2526 	bool done = true;
2527 	uint16_t status;
2528 
2529 	if (err) {
2530 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2531 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2532 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2533 	} else {
2534 		struct iovec *iov = req->io_req.br_iov;
2535 
2536 		req->prev_gpaddr++;
2537 		iov += req->prev_gpaddr;
2538 
2539 		/* The iov_* values already include the sector size */
2540 		req->io_req.br_offset = (off_t)iov->iov_base;
2541 		req->io_req.br_resid = iov->iov_len;
2542 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2543 			pci_nvme_status_genc(&status,
2544 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2545 		} else
2546 			done = false;
2547 	}
2548 
2549 	if (done) {
2550 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2551 		    req->cid, 0, status);
2552 		pci_nvme_release_ioreq(sc, req);
2553 	}
2554 }
2555 
2556 static bool
2557 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2558     struct nvme_command *cmd,
2559     struct pci_nvme_blockstore *nvstore,
2560     struct pci_nvme_ioreq *req,
2561     uint16_t *status)
2562 {
2563 	struct nvme_dsm_range *range;
2564 	uint32_t nr, r, non_zero, dr;
2565 	int err;
2566 	bool pending = false;
2567 
2568 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2569 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2570 		goto out;
2571 	}
2572 
2573 	nr = cmd->cdw10 & 0xff;
2574 
2575 	/* copy locally because a range entry could straddle PRPs */
2576 	range = calloc(1, NVME_MAX_DSM_TRIM);
2577 	if (range == NULL) {
2578 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2579 		goto out;
2580 	}
2581 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2582 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2583 
2584 	/* Check for invalid ranges and the number of non-zero lengths */
2585 	non_zero = 0;
2586 	for (r = 0; r <= nr; r++) {
2587 		if (pci_nvme_out_of_range(nvstore,
2588 		    range[r].starting_lba, range[r].length)) {
2589 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2590 			goto out;
2591 		}
2592 		if (range[r].length != 0)
2593 			non_zero++;
2594 	}
2595 
2596 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2597 		size_t offset, bytes;
2598 		int sectsz_bits = sc->nvstore.sectsz_bits;
2599 
2600 		/*
2601 		 * DSM calls are advisory only, and compliant controllers
2602 		 * may choose to take no actions (i.e. return Success).
2603 		 */
2604 		if (!nvstore->deallocate) {
2605 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2606 			goto out;
2607 		}
2608 
2609 		/* If all ranges have a zero length, return Success */
2610 		if (non_zero == 0) {
2611 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2612 			goto out;
2613 		}
2614 
2615 		if (req == NULL) {
2616 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2617 			goto out;
2618 		}
2619 
2620 		offset = range[0].starting_lba << sectsz_bits;
2621 		bytes = range[0].length << sectsz_bits;
2622 
2623 		/*
2624 		 * If the request is for more than a single range, store
2625 		 * the ranges in the br_iov. Optimize for the common case
2626 		 * of a single range.
2627 		 *
2628 		 * Note that NVMe Number of Ranges is a zero based value
2629 		 */
2630 		req->io_req.br_iovcnt = 0;
2631 		req->io_req.br_offset = offset;
2632 		req->io_req.br_resid = bytes;
2633 
2634 		if (nr == 0) {
2635 			req->io_req.br_callback = pci_nvme_io_done;
2636 		} else {
2637 			struct iovec *iov = req->io_req.br_iov;
2638 
2639 			for (r = 0, dr = 0; r <= nr; r++) {
2640 				offset = range[r].starting_lba << sectsz_bits;
2641 				bytes = range[r].length << sectsz_bits;
2642 				if (bytes == 0)
2643 					continue;
2644 
2645 				if ((nvstore->size - offset) < bytes) {
2646 					pci_nvme_status_genc(status,
2647 					    NVME_SC_LBA_OUT_OF_RANGE);
2648 					goto out;
2649 				}
2650 				iov[dr].iov_base = (void *)offset;
2651 				iov[dr].iov_len = bytes;
2652 				dr++;
2653 			}
2654 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2655 
2656 			/*
2657 			 * Use prev_gpaddr to track the current entry and
2658 			 * prev_size to track the number of entries
2659 			 */
2660 			req->prev_gpaddr = 0;
2661 			req->prev_size = dr;
2662 		}
2663 
2664 		err = blockif_delete(nvstore->ctx, &req->io_req);
2665 		if (err)
2666 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2667 		else
2668 			pending = true;
2669 	}
2670 out:
2671 	free(range);
2672 	return (pending);
2673 }
2674 
2675 static void
2676 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2677 {
2678 	struct nvme_submission_queue *sq;
2679 	uint16_t status;
2680 	uint16_t sqhead;
2681 
2682 	/* handle all submissions up to sq->tail index */
2683 	sq = &sc->submit_queues[idx];
2684 
2685 	pthread_mutex_lock(&sq->mtx);
2686 
2687 	sqhead = sq->head;
2688 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2689 	         idx, sqhead, sq->tail, sq->qbase);
2690 
2691 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2692 		struct nvme_command *cmd;
2693 		struct pci_nvme_ioreq *req;
2694 		uint32_t nsid;
2695 		bool pending;
2696 
2697 		pending = false;
2698 		req = NULL;
2699 		status = 0;
2700 
2701 		cmd = &sq->qbase[sqhead];
2702 		sqhead = (sqhead + 1) % sq->size;
2703 
2704 		nsid = le32toh(cmd->nsid);
2705 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2706 			pci_nvme_status_genc(&status,
2707 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2708 			status |=
2709 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2710 			goto complete;
2711  		}
2712 
2713 		req = pci_nvme_get_ioreq(sc);
2714 		if (req == NULL) {
2715 			pci_nvme_status_genc(&status,
2716 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2717 			WPRINTF("%s: unable to allocate IO req", __func__);
2718 			goto complete;
2719 		}
2720 		req->nvme_sq = sq;
2721 		req->sqid = idx;
2722 		req->opc = cmd->opc;
2723 		req->cid = cmd->cid;
2724 		req->nsid = cmd->nsid;
2725 
2726 		switch (cmd->opc) {
2727 		case NVME_OPC_FLUSH:
2728 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2729 			    req, &status);
2730  			break;
2731 		case NVME_OPC_WRITE:
2732 		case NVME_OPC_READ:
2733 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2734 			    req, &status);
2735 			break;
2736 		case NVME_OPC_WRITE_ZEROES:
2737 			/* TODO: write zeroes
2738 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2739 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2740 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2741 			break;
2742 		case NVME_OPC_DATASET_MANAGEMENT:
2743  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2744 			    req, &status);
2745 			break;
2746  		default:
2747  			WPRINTF("%s unhandled io command 0x%x",
2748 			    __func__, cmd->opc);
2749 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2750 		}
2751 complete:
2752 		if (!pending) {
2753 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2754 			    status);
2755 			if (req != NULL)
2756 				pci_nvme_release_ioreq(sc, req);
2757 		}
2758 	}
2759 
2760 	sq->head = sqhead;
2761 
2762 	pthread_mutex_unlock(&sq->mtx);
2763 }
2764 
2765 static void
2766 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2767 	uint64_t idx, int is_sq, uint64_t value)
2768 {
2769 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2770 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2771 
2772 	if (is_sq) {
2773 		if (idx > sc->num_squeues) {
2774 			WPRINTF("%s queue index %lu overflow from "
2775 			         "guest (max %u)",
2776 			         __func__, idx, sc->num_squeues);
2777 			return;
2778 		}
2779 
2780 		atomic_store_short(&sc->submit_queues[idx].tail,
2781 		                   (uint16_t)value);
2782 
2783 		if (idx == 0) {
2784 			pci_nvme_handle_admin_cmd(sc, value);
2785 		} else {
2786 			/* submission queue; handle new entries in SQ */
2787 			if (idx > sc->num_squeues) {
2788 				WPRINTF("%s SQ index %lu overflow from "
2789 				         "guest (max %u)",
2790 				         __func__, idx, sc->num_squeues);
2791 				return;
2792 			}
2793 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2794 		}
2795 	} else {
2796 		if (idx > sc->num_cqueues) {
2797 			WPRINTF("%s queue index %lu overflow from "
2798 			         "guest (max %u)",
2799 			         __func__, idx, sc->num_cqueues);
2800 			return;
2801 		}
2802 
2803 		atomic_store_short(&sc->compl_queues[idx].head,
2804 				(uint16_t)value);
2805 	}
2806 }
2807 
2808 static void
2809 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2810 {
2811 	const char *s = iswrite ? "WRITE" : "READ";
2812 
2813 	switch (offset) {
2814 	case NVME_CR_CAP_LOW:
2815 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2816 		break;
2817 	case NVME_CR_CAP_HI:
2818 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2819 		break;
2820 	case NVME_CR_VS:
2821 		DPRINTF("%s %s NVME_CR_VS", func, s);
2822 		break;
2823 	case NVME_CR_INTMS:
2824 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2825 		break;
2826 	case NVME_CR_INTMC:
2827 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2828 		break;
2829 	case NVME_CR_CC:
2830 		DPRINTF("%s %s NVME_CR_CC", func, s);
2831 		break;
2832 	case NVME_CR_CSTS:
2833 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2834 		break;
2835 	case NVME_CR_NSSR:
2836 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2837 		break;
2838 	case NVME_CR_AQA:
2839 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2840 		break;
2841 	case NVME_CR_ASQ_LOW:
2842 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2843 		break;
2844 	case NVME_CR_ASQ_HI:
2845 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2846 		break;
2847 	case NVME_CR_ACQ_LOW:
2848 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2849 		break;
2850 	case NVME_CR_ACQ_HI:
2851 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2852 		break;
2853 	default:
2854 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2855 	}
2856 
2857 }
2858 
2859 static void
2860 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2861 	uint64_t offset, int size, uint64_t value)
2862 {
2863 	uint32_t ccreg;
2864 
2865 	if (offset >= NVME_DOORBELL_OFFSET) {
2866 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2867 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2868 		int is_sq = (belloffset % 8) < 4;
2869 
2870 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2871 			WPRINTF("guest attempted an overflow write offset "
2872 			         "0x%lx, val 0x%lx in %s",
2873 			         offset, value, __func__);
2874 			return;
2875 		}
2876 
2877 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2878 		return;
2879 	}
2880 
2881 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2882 	        offset, size, value);
2883 
2884 	if (size != 4) {
2885 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2886 		         "val 0x%lx) to bar0 in %s",
2887 		         size, offset, value, __func__);
2888 		/* TODO: shutdown device */
2889 		return;
2890 	}
2891 
2892 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2893 
2894 	pthread_mutex_lock(&sc->mtx);
2895 
2896 	switch (offset) {
2897 	case NVME_CR_CAP_LOW:
2898 	case NVME_CR_CAP_HI:
2899 		/* readonly */
2900 		break;
2901 	case NVME_CR_VS:
2902 		/* readonly */
2903 		break;
2904 	case NVME_CR_INTMS:
2905 		/* MSI-X, so ignore */
2906 		break;
2907 	case NVME_CR_INTMC:
2908 		/* MSI-X, so ignore */
2909 		break;
2910 	case NVME_CR_CC:
2911 		ccreg = (uint32_t)value;
2912 
2913 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2914 		         "iocqes %u",
2915 		        __func__,
2916 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2917 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2918 			 NVME_CC_GET_IOCQES(ccreg));
2919 
2920 		if (NVME_CC_GET_SHN(ccreg)) {
2921 			/* perform shutdown - flush out data to backend */
2922 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2923 			    NVME_CSTS_REG_SHST_SHIFT);
2924 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2925 			    NVME_CSTS_REG_SHST_SHIFT;
2926 		}
2927 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2928 			if (NVME_CC_GET_EN(ccreg) == 0)
2929 				/* transition 1-> causes controller reset */
2930 				pci_nvme_reset_locked(sc);
2931 			else
2932 				pci_nvme_init_controller(ctx, sc);
2933 		}
2934 
2935 		/* Insert the iocqes, iosqes and en bits from the write */
2936 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2937 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2938 		if (NVME_CC_GET_EN(ccreg) == 0) {
2939 			/* Insert the ams, mps and css bit fields */
2940 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2941 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2942 			sc->regs.csts &= ~NVME_CSTS_RDY;
2943 		} else if (sc->pending_ios == 0) {
2944 			sc->regs.csts |= NVME_CSTS_RDY;
2945 		}
2946 		break;
2947 	case NVME_CR_CSTS:
2948 		break;
2949 	case NVME_CR_NSSR:
2950 		/* ignore writes; don't support subsystem reset */
2951 		break;
2952 	case NVME_CR_AQA:
2953 		sc->regs.aqa = (uint32_t)value;
2954 		break;
2955 	case NVME_CR_ASQ_LOW:
2956 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2957 		               (0xFFFFF000 & value);
2958 		break;
2959 	case NVME_CR_ASQ_HI:
2960 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2961 		               (value << 32);
2962 		break;
2963 	case NVME_CR_ACQ_LOW:
2964 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2965 		               (0xFFFFF000 & value);
2966 		break;
2967 	case NVME_CR_ACQ_HI:
2968 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2969 		               (value << 32);
2970 		break;
2971 	default:
2972 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2973 		         __func__, offset, value, size);
2974 	}
2975 	pthread_mutex_unlock(&sc->mtx);
2976 }
2977 
2978 static void
2979 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2980                 int baridx, uint64_t offset, int size, uint64_t value)
2981 {
2982 	struct pci_nvme_softc* sc = pi->pi_arg;
2983 
2984 	if (baridx == pci_msix_table_bar(pi) ||
2985 	    baridx == pci_msix_pba_bar(pi)) {
2986 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2987 		         " value 0x%lx", baridx, offset, size, value);
2988 
2989 		pci_emul_msix_twrite(pi, offset, size, value);
2990 		return;
2991 	}
2992 
2993 	switch (baridx) {
2994 	case 0:
2995 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2996 		break;
2997 
2998 	default:
2999 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3000 		         __func__, baridx, value);
3001 	}
3002 }
3003 
3004 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3005 	uint64_t offset, int size)
3006 {
3007 	uint64_t value;
3008 
3009 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3010 
3011 	if (offset < NVME_DOORBELL_OFFSET) {
3012 		void *p = &(sc->regs);
3013 		pthread_mutex_lock(&sc->mtx);
3014 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3015 		pthread_mutex_unlock(&sc->mtx);
3016 	} else {
3017 		value = 0;
3018                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
3019 	}
3020 
3021 	switch (size) {
3022 	case 1:
3023 		value &= 0xFF;
3024 		break;
3025 	case 2:
3026 		value &= 0xFFFF;
3027 		break;
3028 	case 4:
3029 		value &= 0xFFFFFFFF;
3030 		break;
3031 	}
3032 
3033 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3034 	         offset, size, (uint32_t)value);
3035 
3036 	return (value);
3037 }
3038 
3039 
3040 
3041 static uint64_t
3042 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
3043     uint64_t offset, int size)
3044 {
3045 	struct pci_nvme_softc* sc = pi->pi_arg;
3046 
3047 	if (baridx == pci_msix_table_bar(pi) ||
3048 	    baridx == pci_msix_pba_bar(pi)) {
3049 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3050 		        baridx, offset, size);
3051 
3052 		return pci_emul_msix_tread(pi, offset, size);
3053 	}
3054 
3055 	switch (baridx) {
3056 	case 0:
3057        		return pci_nvme_read_bar_0(sc, offset, size);
3058 
3059 	default:
3060 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3061 	}
3062 
3063 	return (0);
3064 }
3065 
3066 static int
3067 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3068 {
3069 	char bident[sizeof("XX:X:X")];
3070 	const char *value;
3071 	uint32_t sectsz;
3072 
3073 	sc->max_queues = NVME_QUEUES;
3074 	sc->max_qentries = NVME_MAX_QENTRIES;
3075 	sc->ioslots = NVME_IOSLOTS;
3076 	sc->num_squeues = sc->max_queues;
3077 	sc->num_cqueues = sc->max_queues;
3078 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3079 	sectsz = 0;
3080 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3081 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3082 
3083 	value = get_config_value_node(nvl, "maxq");
3084 	if (value != NULL)
3085 		sc->max_queues = atoi(value);
3086 	value = get_config_value_node(nvl, "qsz");
3087 	if (value != NULL) {
3088 		sc->max_qentries = atoi(value);
3089 		if (sc->max_qentries <= 0) {
3090 			EPRINTLN("nvme: Invalid qsz option %d",
3091 			    sc->max_qentries);
3092 			return (-1);
3093 		}
3094 	}
3095 	value = get_config_value_node(nvl, "ioslots");
3096 	if (value != NULL) {
3097 		sc->ioslots = atoi(value);
3098 		if (sc->ioslots <= 0) {
3099 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3100 			return (-1);
3101 		}
3102 	}
3103 	value = get_config_value_node(nvl, "sectsz");
3104 	if (value != NULL)
3105 		sectsz = atoi(value);
3106 	value = get_config_value_node(nvl, "ser");
3107 	if (value != NULL) {
3108 		/*
3109 		 * This field indicates the Product Serial Number in
3110 		 * 7-bit ASCII, unused bytes should be space characters.
3111 		 * Ref: NVMe v1.3c.
3112 		 */
3113 		cpywithpad((char *)sc->ctrldata.sn,
3114 		    sizeof(sc->ctrldata.sn), value, ' ');
3115 	}
3116 	value = get_config_value_node(nvl, "eui64");
3117 	if (value != NULL)
3118 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3119 	value = get_config_value_node(nvl, "dsm");
3120 	if (value != NULL) {
3121 		if (strcmp(value, "auto") == 0)
3122 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3123 		else if (strcmp(value, "enable") == 0)
3124 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3125 		else if (strcmp(value, "disable") == 0)
3126 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3127 	}
3128 
3129 	value = get_config_value_node(nvl, "ram");
3130 	if (value != NULL) {
3131 		uint64_t sz = strtoull(value, NULL, 10);
3132 
3133 		sc->nvstore.type = NVME_STOR_RAM;
3134 		sc->nvstore.size = sz * 1024 * 1024;
3135 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3136 		sc->nvstore.sectsz = 4096;
3137 		sc->nvstore.sectsz_bits = 12;
3138 		if (sc->nvstore.ctx == NULL) {
3139 			EPRINTLN("nvme: Unable to allocate RAM");
3140 			return (-1);
3141 		}
3142 	} else {
3143 		snprintf(bident, sizeof(bident), "%d:%d",
3144 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3145 		sc->nvstore.ctx = blockif_open(nvl, bident);
3146 		if (sc->nvstore.ctx == NULL) {
3147 			EPRINTLN("nvme: Could not open backing file: %s",
3148 			    strerror(errno));
3149 			return (-1);
3150 		}
3151 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3152 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3153 	}
3154 
3155 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3156 		sc->nvstore.sectsz = sectsz;
3157 	else if (sc->nvstore.type != NVME_STOR_RAM)
3158 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3159 	for (sc->nvstore.sectsz_bits = 9;
3160 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3161 	     sc->nvstore.sectsz_bits++);
3162 
3163 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3164 		sc->max_queues = NVME_QUEUES;
3165 
3166 	return (0);
3167 }
3168 
3169 static void
3170 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3171 {
3172 	struct pci_nvme_softc *sc;
3173 	struct pci_nvme_blockstore *nvstore;
3174 	struct nvme_namespace_data *nd;
3175 
3176 	sc = arg;
3177 	nvstore = &sc->nvstore;
3178 	nd = &sc->nsdata;
3179 
3180 	nvstore->size = new_size;
3181 	pci_nvme_init_nsdata_size(nvstore, nd);
3182 
3183 	/* Add changed NSID to list */
3184 	sc->ns_log.ns[0] = 1;
3185 	sc->ns_log.ns[1] = 0;
3186 
3187 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3188 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3189 }
3190 
3191 static int
3192 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3193 {
3194 	struct pci_nvme_softc *sc;
3195 	uint32_t pci_membar_sz;
3196 	int	error;
3197 
3198 	error = 0;
3199 
3200 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3201 	pi->pi_arg = sc;
3202 	sc->nsc_pi = pi;
3203 
3204 	error = pci_nvme_parse_config(sc, nvl);
3205 	if (error < 0)
3206 		goto done;
3207 	else
3208 		error = 0;
3209 
3210 	STAILQ_INIT(&sc->ioreqs_free);
3211 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3212 	for (int i = 0; i < sc->ioslots; i++) {
3213 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3214 	}
3215 
3216 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3217 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3218 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3219 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3220 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3221 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3222 
3223 	/*
3224 	 * Allocate size of NVMe registers + doorbell space for all queues.
3225 	 *
3226 	 * The specification requires a minimum memory I/O window size of 16K.
3227 	 * The Windows driver will refuse to start a device with a smaller
3228 	 * window.
3229 	 */
3230 	pci_membar_sz = sizeof(struct nvme_registers) +
3231 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3232 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3233 
3234 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3235 
3236 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3237 	if (error) {
3238 		WPRINTF("%s pci alloc mem bar failed", __func__);
3239 		goto done;
3240 	}
3241 
3242 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3243 	if (error) {
3244 		WPRINTF("%s pci add msixcap failed", __func__);
3245 		goto done;
3246 	}
3247 
3248 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3249 	if (error) {
3250 		WPRINTF("%s pci add Express capability failed", __func__);
3251 		goto done;
3252 	}
3253 
3254 	pthread_mutex_init(&sc->mtx, NULL);
3255 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3256 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3257 
3258 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3259 	/*
3260 	 * Controller data depends on Namespace data so initialize Namespace
3261 	 * data first.
3262 	 */
3263 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3264 	pci_nvme_init_ctrldata(sc);
3265 	pci_nvme_init_logpages(sc);
3266 	pci_nvme_init_features(sc);
3267 
3268 	pci_nvme_aer_init(sc);
3269 	pci_nvme_aen_init(sc);
3270 
3271 	pci_nvme_reset(sc);
3272 
3273 	pci_lintr_request(pi);
3274 
3275 done:
3276 	return (error);
3277 }
3278 
3279 static int
3280 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3281 {
3282 	char *cp, *ram;
3283 
3284 	if (opts == NULL)
3285 		return (0);
3286 
3287 	if (strncmp(opts, "ram=", 4) == 0) {
3288 		cp = strchr(opts, ',');
3289 		if (cp == NULL) {
3290 			set_config_value_node(nvl, "ram", opts + 4);
3291 			return (0);
3292 		}
3293 		ram = strndup(opts + 4, cp - opts - 4);
3294 		set_config_value_node(nvl, "ram", ram);
3295 		free(ram);
3296 		return (pci_parse_legacy_config(nvl, cp + 1));
3297 	} else
3298 		return (blockif_legacy_config(nvl, opts));
3299 }
3300 
3301 struct pci_devemu pci_de_nvme = {
3302 	.pe_emu =	"nvme",
3303 	.pe_init =	pci_nvme_init,
3304 	.pe_legacy_config = pci_nvme_legacy_config,
3305 	.pe_barwrite =	pci_nvme_write,
3306 	.pe_barread =	pci_nvme_read
3307 };
3308 PCI_EMUL_SET(pci_de_nvme);
3309