xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision 29219719c034367724cbf77434175b3c4e681e43)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
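/*
 * Example invocations (slot number, device path, and serial are hypothetical):
 *
 *  -s 4,nvme,/dev/zvol/rdsk/tank/vm0,maxq=4,qsz=512,ioslots=33,ser=BHYVE0001
 *  -s 4,nvme,ram=1024
 */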
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
83 
84 #include <dev/nvme/nvme.h>
85 
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
91 
92 
93 static int nvme_debug = 0;
94 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
96 
97 /* defaults; can be overridden */
98 #define	NVME_MSIX_BAR		4
99 
100 #define	NVME_IOSLOTS		8
101 
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN	(1 << 14)
104 
105 #define	NVME_QUEUES		16
106 #define	NVME_MAX_QENTRIES	2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define	NVME_MPSMIN		0
109 /* MPSMIN converted to bytes */
110 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
111 
112 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
113 #define	NVME_MDTS		9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
116 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
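/*
 * Worked example with the defaults above: NVME_MDTS = 9 and NVME_MPSMIN = 0
 * (4 KiB pages) give NVME_MAX_DATA_SIZE = 512 * 4096 bytes = 2 MiB per command
 * and NVME_MAX_IOVEC = 513 (512 pages plus one for an unaligned first page).
 */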
117 
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS		0xffff
120 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
121 
122 /* Reported temperature in Kelvin (i.e. room temperature) */
123 #define NVME_TEMPERATURE 296
124 
125 /* helpers */
126 
127 /* Convert a zero-based value into a one-based value */
128 #define ONE_BASED(zero)		((zero) + 1)
129 /* Convert a one-based value into a zero-based value */
130 #define ZERO_BASED(one)		((one)  - 1)
131 
132 /* Encode number of SQ's and CQ's for Set/Get Features */
133 #define NVME_FEATURE_NUM_QUEUES(sc) \
134 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
135 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
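/*
 * For example, with num_squeues == 4 and num_cqueues == 4 this evaluates to
 * 0x00030003 (NSQA = 3, NCQA = 3, both zero-based), the value returned in
 * Completion Dword 0 for the Number of Queues feature.
 */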
136 
137 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
138 
139 enum nvme_controller_register_offsets {
140 	NVME_CR_CAP_LOW = 0x00,
141 	NVME_CR_CAP_HI  = 0x04,
142 	NVME_CR_VS      = 0x08,
143 	NVME_CR_INTMS   = 0x0c,
144 	NVME_CR_INTMC   = 0x10,
145 	NVME_CR_CC      = 0x14,
146 	NVME_CR_CSTS    = 0x1c,
147 	NVME_CR_NSSR    = 0x20,
148 	NVME_CR_AQA     = 0x24,
149 	NVME_CR_ASQ_LOW = 0x28,
150 	NVME_CR_ASQ_HI  = 0x2c,
151 	NVME_CR_ACQ_LOW = 0x30,
152 	NVME_CR_ACQ_HI  = 0x34,
153 };
154 
155 enum nvme_cmd_cdw11 {
156 	NVME_CMD_CDW11_PC  = 0x0001,
157 	NVME_CMD_CDW11_IEN = 0x0002,
158 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
159 };
160 
161 enum nvme_copy_dir {
162 	NVME_COPY_TO_PRP,
163 	NVME_COPY_FROM_PRP,
164 };
165 
166 #define	NVME_CQ_INTEN	0x01
167 #define	NVME_CQ_INTCOAL	0x02
168 
169 struct nvme_completion_queue {
170 	struct nvme_completion *qbase;
171 	pthread_mutex_t	mtx;
172 	uint32_t	size;
173 	uint16_t	tail; /* nvme progress */
174 	uint16_t	head; /* guest progress */
175 	uint16_t	intr_vec;
176 	uint32_t	intr_en;
177 };
178 
179 struct nvme_submission_queue {
180 	struct nvme_command *qbase;
181 	pthread_mutex_t	mtx;
182 	uint32_t	size;
183 	uint16_t	head; /* nvme progress */
184 	uint16_t	tail; /* guest progress */
185 	uint16_t	cqid; /* completion queue id */
186 	int		qpriority;
187 };
188 
189 enum nvme_storage_type {
190 	NVME_STOR_BLOCKIF = 0,
191 	NVME_STOR_RAM = 1,
192 };
193 
194 struct pci_nvme_blockstore {
195 	enum nvme_storage_type type;
196 	void		*ctx;
197 	uint64_t	size;
198 	uint32_t	sectsz;
199 	uint32_t	sectsz_bits;
200 	uint64_t	eui64;
201 	uint32_t	deallocate:1;
202 };
203 
204 /*
205  * Calculate the number of additional page descriptors for guest IO requests
206  * based on the advertised Max Data Transfer Size (MDTS) and given the number of
207  * default iovec's in a struct blockif_req.
208  */
209 #define MDTS_PAD_SIZE \
210 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
211 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
212 	  0 )
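/*
 * Illustrative only: if BLOCKIF_IOV_MAX were 128, MDTS_PAD_SIZE would be
 * 513 - 128 = 385 extra iovec entries appended to each pci_nvme_ioreq below.
 */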
213 
214 struct pci_nvme_ioreq {
215 	struct pci_nvme_softc *sc;
216 	STAILQ_ENTRY(pci_nvme_ioreq) link;
217 	struct nvme_submission_queue *nvme_sq;
218 	uint16_t	sqid;
219 
220 	/* command information */
221 	uint16_t	opc;
222 	uint16_t	cid;
223 	uint32_t	nsid;
224 
225 	uint64_t	prev_gpaddr;
226 	size_t		prev_size;
227 	size_t		bytes;
228 
229 	struct blockif_req io_req;
230 
231 	struct iovec	iovpadding[MDTS_PAD_SIZE];
232 };
233 
234 enum nvme_dsm_type {
235 	/* Dataset Management bit in ONCS reflects backing storage capability */
236 	NVME_DATASET_MANAGEMENT_AUTO,
237 	/* Unconditionally set Dataset Management bit in ONCS */
238 	NVME_DATASET_MANAGEMENT_ENABLE,
239 	/* Unconditionally clear Dataset Management bit in ONCS */
240 	NVME_DATASET_MANAGEMENT_DISABLE,
241 };
242 
243 struct pci_nvme_softc;
244 struct nvme_feature_obj;
245 
246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
247     struct nvme_feature_obj *,
248     struct nvme_command *,
249     struct nvme_completion *);
250 
251 struct nvme_feature_obj {
252 	uint32_t	cdw11;
253 	nvme_feature_cb	set;
254 	nvme_feature_cb	get;
255 	bool namespace_specific;
256 };
257 
258 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
259 
260 typedef enum {
261 	PCI_NVME_AE_TYPE_ERROR = 0,
262 	PCI_NVME_AE_TYPE_SMART,
263 	PCI_NVME_AE_TYPE_NOTICE,
264 	PCI_NVME_AE_TYPE_IO_CMD = 6,
265 	PCI_NVME_AE_TYPE_VENDOR = 7,
266 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
267 } pci_nvme_async_type;
268 
269 /* Asynchronous Event Requests */
270 struct pci_nvme_aer {
271 	STAILQ_ENTRY(pci_nvme_aer) link;
272 	uint16_t	cid;	/* Command ID of the submitted AER */
273 };
274 
275 /** Asynchronous Event Information - Notice */
276 typedef enum {
277 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
278 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
279 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
280 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
281 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
282 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
283 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
284 	PCI_NVME_AEI_NOTICE_MAX,
285 } pci_nvme_async_event_info_notice;
286 
287 #define PCI_NVME_AEI_NOTICE_SHIFT		8
288 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
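/*
 * e.g. PCI_NVME_AEI_NOTICE_MASK(PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED) is 0x100,
 * the Namespace Attribute Notice enable bit (bit 8) of the Asynchronous Event
 * Configuration feature.
 */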
289 
290 /* Asynchronous Event Notifications */
291 struct pci_nvme_aen {
292 	pci_nvme_async_type atype;
293 	uint32_t	event_data;
294 	bool		posted;
295 };
296 
297 /*
298  * By default, enable all Asynchronous Event Notifications:
299  *     SMART / Health Critical Warnings
300  *     Namespace Attribute Notices
301  */
302 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
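/*
 * (0x11f sets bits 4:0, the SMART/Health critical warning enables, plus
 * bit 8, the Namespace Attribute Notice enable.)
 */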
303 
304 typedef enum {
305 	NVME_CNTRLTYPE_IO = 1,
306 	NVME_CNTRLTYPE_DISCOVERY = 2,
307 	NVME_CNTRLTYPE_ADMIN = 3,
308 } pci_nvme_cntrl_type;
309 
310 struct pci_nvme_softc {
311 	struct pci_devinst *nsc_pi;
312 
313 	pthread_mutex_t	mtx;
314 
315 	struct nvme_registers regs;
316 
317 	struct nvme_namespace_data  nsdata;
318 	struct nvme_controller_data ctrldata;
319 	struct nvme_error_information_entry err_log;
320 	struct nvme_health_information_page health_log;
321 	struct nvme_firmware_page fw_log;
322 	struct nvme_ns_list ns_log;
323 
324 	struct pci_nvme_blockstore nvstore;
325 
326 	uint16_t	max_qentries;	/* max entries per queue */
327 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
328 	uint32_t	num_cqueues;
329 	uint32_t	num_squeues;
330 	bool		num_q_is_set; /* Has host set Number of Queues */
331 
332 	struct pci_nvme_ioreq *ioreqs;
333 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
334 	uint32_t	pending_ios;
335 	uint32_t	ioslots;
336 	sem_t		iosemlock;
337 
338 	/*
339 	 * Memory mapped Submission and Completion queues
340 	 * Each array includes both Admin and IO queues
341 	 */
342 	struct nvme_completion_queue *compl_queues;
343 	struct nvme_submission_queue *submit_queues;
344 
345 	struct nvme_feature_obj feat[NVME_FID_MAX];
346 
347 	enum nvme_dsm_type dataset_management;
348 
349 	/* Accounting for SMART data */
350 	__uint128_t	read_data_units;
351 	__uint128_t	write_data_units;
352 	__uint128_t	read_commands;
353 	__uint128_t	write_commands;
354 	uint32_t	read_dunits_remainder;
355 	uint32_t	write_dunits_remainder;
356 
357 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
358 	pthread_mutex_t	aer_mtx;
359 	uint32_t	aer_count;
360 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
361 	pthread_t	aen_tid;
362 	pthread_mutex_t	aen_mtx;
363 	pthread_cond_t	aen_cond;
364 };
365 
366 
367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
368     struct nvme_completion_queue *cq,
369     uint32_t cdw0,
370     uint16_t cid,
371     uint16_t sqid,
372     uint16_t status);
373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
375 static void pci_nvme_io_done(struct blockif_req *, int);
376 
377 /* Controller Configuration utils */
378 #define	NVME_CC_GET_EN(cc) \
379 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
380 #define	NVME_CC_GET_CSS(cc) \
381 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
382 #define	NVME_CC_GET_SHN(cc) \
383 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
384 #define	NVME_CC_GET_IOSQES(cc) \
385 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
386 #define	NVME_CC_GET_IOCQES(cc) \
387 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
388 
389 #define	NVME_CC_WRITE_MASK \
390 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
391 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
392 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
393 
394 #define	NVME_CC_NEN_WRITE_MASK \
395 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
396 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
397 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
398 
399 /* Controller Status utils */
400 #define	NVME_CSTS_GET_RDY(sts) \
401 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
402 
403 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
404 
405 /* Completion Queue status word utils */
406 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
407 #define	NVME_STATUS_MASK \
408 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
409 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
410 
411 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
412 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
413 
414 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
415     struct nvme_feature_obj *,
416     struct nvme_command *,
417     struct nvme_completion *);
418 static void nvme_feature_temperature(struct pci_nvme_softc *,
419     struct nvme_feature_obj *,
420     struct nvme_command *,
421     struct nvme_completion *);
422 static void nvme_feature_num_queues(struct pci_nvme_softc *,
423     struct nvme_feature_obj *,
424     struct nvme_command *,
425     struct nvme_completion *);
426 static void nvme_feature_iv_config(struct pci_nvme_softc *,
427     struct nvme_feature_obj *,
428     struct nvme_command *,
429     struct nvme_completion *);
430 static void nvme_feature_async_event(struct pci_nvme_softc *,
431     struct nvme_feature_obj *,
432     struct nvme_command *,
433     struct nvme_completion *);
434 
435 static void *aen_thr(void *arg);
436 
437 static __inline void
438 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
439 {
440 	size_t len;
441 
442 	len = strnlen(src, dst_size);
443 	memset(dst, pad, dst_size);
444 	memcpy(dst, src, len);
445 }
446 
447 static __inline void
448 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
449 {
450 
451 	*status &= ~NVME_STATUS_MASK;
452 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
453 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
454 }
455 
456 static __inline void
457 pci_nvme_status_genc(uint16_t *status, uint16_t code)
458 {
459 
460 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
461 }
462 
463 /*
464  * Initialize the requested number of IO Submission and Completion Queues.
465  * Admin queues are allocated implicitly.
466  */
467 static void
468 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
469 {
470 	uint32_t i;
471 
472 	/*
473 	 * Allocate and initialize the Submission Queues
474 	 */
475 	if (nsq > NVME_QUEUES) {
476 		WPRINTF("%s: clamping number of SQ from %u to %u",
477 					__func__, nsq, NVME_QUEUES);
478 		nsq = NVME_QUEUES;
479 	}
480 
481 	sc->num_squeues = nsq;
482 
483 	sc->submit_queues = calloc(sc->num_squeues + 1,
484 				sizeof(struct nvme_submission_queue));
485 	if (sc->submit_queues == NULL) {
486 		WPRINTF("%s: SQ allocation failed", __func__);
487 		sc->num_squeues = 0;
488 	} else {
489 		struct nvme_submission_queue *sq = sc->submit_queues;
490 
491 		for (i = 0; i < sc->num_squeues + 1; i++)
492 			pthread_mutex_init(&sq[i].mtx, NULL);
493 	}
494 
495 	/*
496 	 * Allocate and initialize the Completion Queues
497 	 */
498 	if (ncq > NVME_QUEUES) {
499 		WPRINTF("%s: clamping number of CQ from %u to %u",
500 					__func__, ncq, NVME_QUEUES);
501 		ncq = NVME_QUEUES;
502 	}
503 
504 	sc->num_cqueues = ncq;
505 
506 	sc->compl_queues = calloc(sc->num_cqueues + 1,
507 				sizeof(struct nvme_completion_queue));
508 	if (sc->compl_queues == NULL) {
509 		WPRINTF("%s: CQ allocation failed", __func__);
510 		sc->num_cqueues = 0;
511 	} else {
512 		struct nvme_completion_queue *cq = sc->compl_queues;
513 
514 		for (i = 0; i < sc->num_cqueues + 1; i++)
515 			pthread_mutex_init(&cq[i].mtx, NULL);
516 	}
517 }
518 
519 static void
520 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
521 {
522 	struct nvme_controller_data *cd = &sc->ctrldata;
523 
524 	cd->vid = 0xFB5D;
525 	cd->ssvid = 0x0000;
526 
527 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
528 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
529 
530 	/* Num of submission commands that we can handle at a time (2^rab) */
531 	cd->rab   = 4;
532 
533 	/* FreeBSD OUI */
534 	cd->ieee[0] = 0x58;
535 	cd->ieee[1] = 0x9c;
536 	cd->ieee[2] = 0xfc;
537 
538 	cd->mic = 0;
539 
540 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
541 
542 	cd->ver = NVME_REV(1,4);
543 
544 	cd->cntrltype = NVME_CNTRLTYPE_IO;
545 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
546 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
547 	cd->acl = 2;
548 	cd->aerl = 4;
549 
550 	/* Advertise a single, read-only firmware slot */
551 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
552 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
553 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
554 	cd->elpe = 0;	/* max error log page entries */
555 	cd->npss = 1;	/* number of power states supported */
556 
557 	/* Warning Composite Temperature Threshold */
558 	cd->wctemp = 0x0157;
559 	cd->cctemp = 0x0157;
560 
561 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
562 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
563 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
564 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
565 	cd->nn = 1;	/* number of namespaces */
566 
567 	cd->oncs = 0;
568 	switch (sc->dataset_management) {
569 	case NVME_DATASET_MANAGEMENT_AUTO:
570 		if (sc->nvstore.deallocate)
571 			cd->oncs |= NVME_ONCS_DSM;
572 		break;
573 	case NVME_DATASET_MANAGEMENT_ENABLE:
574 		cd->oncs |= NVME_ONCS_DSM;
575 		break;
576 	default:
577 		break;
578 	}
579 
580 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
581 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
582 
583 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
584 
585 	cd->power_state[0].mp = 10;
586 }
587 
588 /*
589  * Calculate the CRC-16 of the given buffer
590  * See copyright attribution at top of file
591  */
592 static uint16_t
593 crc16(uint16_t crc, const void *buffer, unsigned int len)
594 {
595 	const unsigned char *cp = buffer;
596 	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
597 	static uint16_t const crc16_table[256] = {
598 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
599 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
600 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
601 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
602 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
603 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
604 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
605 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
606 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
607 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
608 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
609 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
610 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
611 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
612 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
613 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
614 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
615 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
616 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
617 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
618 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
619 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
620 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
621 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
622 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
623 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
624 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
625 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
626 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
627 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
628 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
629 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
630 	};
631 
632 	while (len--)
633 		crc = (((crc >> 8) & 0xffU) ^
634 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
635 	return crc;
636 }
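/*
 * This is the usual reflected, table-driven CRC-16 (shared with ext2fs); one
 * input byte is folded into the running CRC per iteration. Assuming it
 * matches the common CRC-16/ARC parameters (init 0, no final XOR),
 * crc16(0, "123456789", 9) should return 0xbb3d, a convenient sanity check.
 */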
637 
638 static void
639 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
640     struct nvme_namespace_data *nd)
641 {
642 
643 	/* Get capacity and block size information from backing store */
644 	nd->nsze = nvstore->size / nvstore->sectsz;
645 	nd->ncap = nd->nsze;
646 	nd->nuse = nd->nsze;
647 }
648 
649 static void
650 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
651     struct nvme_namespace_data *nd, uint32_t nsid,
652     struct pci_nvme_blockstore *nvstore)
653 {
654 
655 	pci_nvme_init_nsdata_size(nvstore, nd);
656 
657 	if (nvstore->type == NVME_STOR_BLOCKIF)
658 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
659 
660 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
661 	nd->flbas = 0;
662 
663 	/* Create an EUI-64 if the user did not provide one */
664 	if (nvstore->eui64 == 0) {
665 		char *data = NULL;
666 		uint64_t eui64 = nvstore->eui64;
667 
668 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
669 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
670 		    sc->nsc_pi->pi_func);
671 
672 		if (data != NULL) {
673 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
674 			free(data);
675 		}
676 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
677 	}
678 	be64enc(nd->eui64, nvstore->eui64);
679 
680 	/* LBA data-sz = 2^lbads */
681 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
682 }
683 
684 static void
685 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
686 {
687 
688 	memset(&sc->err_log, 0, sizeof(sc->err_log));
689 	memset(&sc->health_log, 0, sizeof(sc->health_log));
690 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
691 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
692 
693 	/* Set read/write remainder to round up according to spec */
694 	sc->read_dunits_remainder = 999;
695 	sc->write_dunits_remainder = 999;
696 
697 	/* Set nominal Health values checked by implementations */
698 	sc->health_log.temperature = NVME_TEMPERATURE;
699 	sc->health_log.available_spare = 100;
700 	sc->health_log.available_spare_threshold = 10;
701 }
702 
703 static void
704 pci_nvme_init_features(struct pci_nvme_softc *sc)
705 {
706 	enum nvme_feature	fid;
707 
708 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
709 		switch (fid) {
710 		case NVME_FEAT_ARBITRATION:
711 		case NVME_FEAT_POWER_MANAGEMENT:
712 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
713 		case NVME_FEAT_WRITE_ATOMICITY:
714 			/* Mandatory but no special handling required */
715 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
716 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
717 		//		  this returns a data buffer
718 			break;
719 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
720 			sc->feat[fid].set = nvme_feature_temperature;
721 			break;
722 		case NVME_FEAT_ERROR_RECOVERY:
723 			sc->feat[fid].namespace_specific = true;
724 			break;
725 		case NVME_FEAT_NUMBER_OF_QUEUES:
726 			sc->feat[fid].set = nvme_feature_num_queues;
727 			break;
728 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
729 			sc->feat[fid].set = nvme_feature_iv_config;
730 			break;
731 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
732 			sc->feat[fid].set = nvme_feature_async_event;
733 			/* Enable all AENs by default */
734 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
735 			break;
736 		default:
737 			sc->feat[fid].set = nvme_feature_invalid_cb;
738 			sc->feat[fid].get = nvme_feature_invalid_cb;
739 		}
740 	}
741 }
742 
743 static void
744 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
745 {
746 
747 	STAILQ_INIT(&sc->aer_list);
748 	sc->aer_count = 0;
749 }
750 
751 static void
752 pci_nvme_aer_init(struct pci_nvme_softc *sc)
753 {
754 
755 	pthread_mutex_init(&sc->aer_mtx, NULL);
756 	pci_nvme_aer_reset(sc);
757 }
758 
759 static void
760 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
761 {
762 	struct pci_nvme_aer *aer = NULL;
763 
764 	pthread_mutex_lock(&sc->aer_mtx);
765 	while (!STAILQ_EMPTY(&sc->aer_list)) {
766 		aer = STAILQ_FIRST(&sc->aer_list);
767 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
768 		free(aer);
769 	}
770 	pthread_mutex_unlock(&sc->aer_mtx);
771 
772 	pci_nvme_aer_reset(sc);
773 }
774 
775 static bool
776 pci_nvme_aer_available(struct pci_nvme_softc *sc)
777 {
778 
779 	return (sc->aer_count != 0);
780 }
781 
782 static bool
783 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
784 {
785 	struct nvme_controller_data *cd = &sc->ctrldata;
786 
787 	/* AERL is a zero-based value while aer_count is one-based */
788 	return (sc->aer_count == (cd->aerl + 1));
789 }
790 
791 /*
792  * Add an Async Event Request
793  *
794  * Stores an AER to be returned later if the Controller needs to notify the
795  * host of an event.
796  * Note that while the NVMe spec doesn't require Controllers to return AER's
797  * in order, this implementation does preserve the order.
798  */
799 static int
800 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
801 {
802 	struct pci_nvme_aer *aer = NULL;
803 
804 	aer = calloc(1, sizeof(struct pci_nvme_aer));
805 	if (aer == NULL)
806 		return (-1);
807 
808 	/* Save the Command ID for use in the completion message */
809 	aer->cid = cid;
810 
811 	pthread_mutex_lock(&sc->aer_mtx);
812 	sc->aer_count++;
813 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
814 	pthread_mutex_unlock(&sc->aer_mtx);
815 
816 	return (0);
817 }
818 
819 /*
820  * Get an Async Event Request structure
821  *
822  * Returns a pointer to an AER previously submitted by the host or NULL if
823  * no AER's exist. Caller is responsible for freeing the returned struct.
824  */
825 static struct pci_nvme_aer *
826 pci_nvme_aer_get(struct pci_nvme_softc *sc)
827 {
828 	struct pci_nvme_aer *aer = NULL;
829 
830 	pthread_mutex_lock(&sc->aer_mtx);
831 	aer = STAILQ_FIRST(&sc->aer_list);
832 	if (aer != NULL) {
833 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
834 		sc->aer_count--;
835 	}
836 	pthread_mutex_unlock(&sc->aer_mtx);
837 
838 	return (aer);
839 }
840 
841 static void
842 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
843 {
844 	uint32_t	atype;
845 
846 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
847 
848 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
849 		sc->aen[atype].atype = atype;
850 	}
851 }
852 
853 static void
854 pci_nvme_aen_init(struct pci_nvme_softc *sc)
855 {
856 	char nstr[80];
857 
858 	pci_nvme_aen_reset(sc);
859 
860 	pthread_mutex_init(&sc->aen_mtx, NULL);
861 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
862 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
863 	    sc->nsc_pi->pi_func);
864 	pthread_set_name_np(sc->aen_tid, nstr);
865 }
866 
867 static void
868 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
869 {
870 
871 	pci_nvme_aen_reset(sc);
872 }
873 
874 /* Notify the AEN thread of pending work */
875 static void
876 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
877 {
878 
879 	pthread_cond_signal(&sc->aen_cond);
880 }
881 
882 /*
883  * Post an Asynchronous Event Notification
884  */
885 static int32_t
886 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
887 		uint32_t event_data)
888 {
889 	struct pci_nvme_aen *aen;
890 
891 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
892 		return (EINVAL);
893 	}
894 
895 	pthread_mutex_lock(&sc->aen_mtx);
896 	aen = &sc->aen[atype];
897 
898 	/* Has the controller already posted an event of this type? */
899 	if (aen->posted) {
900 		pthread_mutex_unlock(&sc->aen_mtx);
901 		return (EALREADY);
902 	}
903 
904 	aen->event_data = event_data;
905 	aen->posted = true;
906 	pthread_mutex_unlock(&sc->aen_mtx);
907 
908 	pci_nvme_aen_notify(sc);
909 
910 	return (0);
911 }
912 
913 static void
914 pci_nvme_aen_process(struct pci_nvme_softc *sc)
915 {
916 	struct pci_nvme_aer *aer;
917 	struct pci_nvme_aen *aen;
918 	pci_nvme_async_type atype;
919 	uint32_t mask;
920 	uint16_t status;
921 	uint8_t lid;
922 
923 #ifndef __FreeBSD__
924 	lid = 0;
925 #endif
926 
927 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
928 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
929 		aen = &sc->aen[atype];
930 		/* Previous iterations may have depleted the available AER's */
931 		if (!pci_nvme_aer_available(sc)) {
932 			DPRINTF("%s: no AER", __func__);
933 			break;
934 		}
935 
936 		if (!aen->posted) {
937 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
938 			continue;
939 		}
940 
941 		status = NVME_SC_SUCCESS;
942 
943 		/* Is the event masked? */
944 		mask =
945 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
946 
947 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
948 		switch (atype) {
949 		case PCI_NVME_AE_TYPE_ERROR:
950 			lid = NVME_LOG_ERROR;
951 			break;
952 		case PCI_NVME_AE_TYPE_SMART:
953 			mask &= 0xff;
954 			if ((mask & aen->event_data) == 0)
955 				continue;
956 			lid = NVME_LOG_HEALTH_INFORMATION;
957 			break;
958 		case PCI_NVME_AE_TYPE_NOTICE:
959 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
960 				EPRINTLN("%s unknown AEN notice type %u",
961 				    __func__, aen->event_data);
962 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
963 				break;
964 			}
965 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
966 				continue;
967 			switch (aen->event_data) {
968 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
969 				lid = NVME_LOG_CHANGED_NAMESPACE;
970 				break;
971 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
972 				lid = NVME_LOG_FIRMWARE_SLOT;
973 				break;
974 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
975 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
976 				break;
977 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
978 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
979 				break;
980 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
981 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
982 				break;
983 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
984 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
985 				break;
986 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
987 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
988 				break;
989 			default:
990 				lid = 0;
991 			}
992 			break;
993 		default:
994 			/* bad type?!? */
995 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
996 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
997 			break;
998 		}
999 
1000 		aer = pci_nvme_aer_get(sc);
1001 		assert(aer != NULL);
1002 
1003 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1004 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1005 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1006 		    aer->cid,
1007 		    0,		/* SQID */
1008 		    status);
1009 
1010 		aen->event_data = 0;
1011 		aen->posted = false;
1012 
1013 		pci_generate_msix(sc->nsc_pi, 0);
1014 	}
1015 }
1016 
1017 static void *
1018 aen_thr(void *arg)
1019 {
1020 	struct pci_nvme_softc *sc;
1021 
1022 	sc = arg;
1023 
1024 	pthread_mutex_lock(&sc->aen_mtx);
1025 	for (;;) {
1026 		pci_nvme_aen_process(sc);
1027 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1028 	}
1029 #ifdef __FreeBSD__
1030 	pthread_mutex_unlock(&sc->aen_mtx);
1031 
1032 	pthread_exit(NULL);
1033 #endif
1034 	return (NULL);
1035 }
1036 
1037 static void
1038 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1039 {
1040 	uint32_t i;
1041 
1042 	DPRINTF("%s", __func__);
1043 
1044 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1045 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1046 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1047 
1048 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1049 
1050 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1051 
1052 	sc->regs.cc = 0;
1053 
1054 	assert(sc->submit_queues != NULL);
1055 
1056 	for (i = 0; i < sc->num_squeues + 1; i++) {
1057 		sc->submit_queues[i].qbase = NULL;
1058 		sc->submit_queues[i].size = 0;
1059 		sc->submit_queues[i].cqid = 0;
1060 		sc->submit_queues[i].tail = 0;
1061 		sc->submit_queues[i].head = 0;
1062 	}
1063 
1064 	assert(sc->compl_queues != NULL);
1065 
1066 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1067 		sc->compl_queues[i].qbase = NULL;
1068 		sc->compl_queues[i].size = 0;
1069 		sc->compl_queues[i].tail = 0;
1070 		sc->compl_queues[i].head = 0;
1071 	}
1072 
1073 	sc->num_q_is_set = false;
1074 
1075 	pci_nvme_aer_destroy(sc);
1076 	pci_nvme_aen_destroy(sc);
1077 
1078 	/*
1079 	 * Clear CSTS.RDY last to prevent the host from enabling the Controller
1080 	 * before cleanup completes.
1081 	 */
1082 	sc->regs.csts = 0;
1083 }
1084 
1085 static void
1086 pci_nvme_reset(struct pci_nvme_softc *sc)
1087 {
1088 	pthread_mutex_lock(&sc->mtx);
1089 	pci_nvme_reset_locked(sc);
1090 	pthread_mutex_unlock(&sc->mtx);
1091 }
1092 
1093 static void
1094 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1095 {
1096 	uint16_t acqs, asqs;
1097 
1098 	DPRINTF("%s", __func__);
1099 
1100 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1101 	sc->submit_queues[0].size = asqs;
1102 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1103 	            sizeof(struct nvme_command) * asqs);
1104 
1105 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1106 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1107 
1108 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1109 	    NVME_AQA_REG_ACQS_MASK) + 1;
1110 	sc->compl_queues[0].size = acqs;
1111 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1112 	         sizeof(struct nvme_completion) * acqs);
1113 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1114 
1115 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1116 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1117 }
1118 
1119 static int
1120 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1121 	size_t len, enum nvme_copy_dir dir)
1122 {
1123 	uint8_t *p;
1124 	size_t bytes;
1125 
1126 	if (len > (8 * 1024)) {
1127 		return (-1);
1128 	}
1129 
1130 	/* Copy from the start of prp1 to the end of the physical page */
1131 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1132 	bytes = MIN(bytes, len);
1133 
1134 	p = vm_map_gpa(ctx, prp1, bytes);
1135 	if (p == NULL) {
1136 		return (-1);
1137 	}
1138 
1139 	if (dir == NVME_COPY_TO_PRP)
1140 		memcpy(p, b, bytes);
1141 	else
1142 		memcpy(b, p, bytes);
1143 
1144 	b += bytes;
1145 
1146 	len -= bytes;
1147 	if (len == 0) {
1148 		return (0);
1149 	}
1150 
1151 	len = MIN(len, PAGE_SIZE);
1152 
1153 	p = vm_map_gpa(ctx, prp2, len);
1154 	if (p == NULL) {
1155 		return (-1);
1156 	}
1157 
1158 	if (dir == NVME_COPY_TO_PRP)
1159 		memcpy(p, b, len);
1160 	else
1161 		memcpy(b, p, len);
1162 
1163 	return (0);
1164 }
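/*
 * Note that nvme_prp_memcpy() only handles PRP1 plus a single PRP2 data page
 * (no PRP lists), hence the 8 KiB cap above; callers in this file use it for
 * small admin payloads such as Identify data and log pages.
 */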
1165 
1166 /*
1167  * Write a Completion Queue Entry update
1168  *
1169  * Write the completion and update the doorbell value
1170  */
1171 static void
1172 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1173 		struct nvme_completion_queue *cq,
1174 		uint32_t cdw0,
1175 		uint16_t cid,
1176 		uint16_t sqid,
1177 		uint16_t status)
1178 {
1179 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1180 	struct nvme_completion *cqe;
1181 
1182 	assert(cq->qbase != NULL);
1183 
1184 	pthread_mutex_lock(&cq->mtx);
1185 
1186 	cqe = &cq->qbase[cq->tail];
1187 
1188 	/* Flip the phase bit */
1189 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1190 
1191 	cqe->cdw0 = cdw0;
1192 	cqe->sqhd = sq->head;
1193 	cqe->sqid = sqid;
1194 	cqe->cid = cid;
1195 	cqe->status = status;
1196 
1197 	cq->tail++;
1198 	if (cq->tail >= cq->size) {
1199 		cq->tail = 0;
1200 	}
1201 
1202 	pthread_mutex_unlock(&cq->mtx);
1203 }
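/*
 * The Phase Tag handling above XORs the phase of the stale entry at cq->tail,
 * so each pass around the ring toggles the phase bit and the guest can tell
 * freshly written completions from old ones without consulting a doorbell.
 */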
1204 
1205 static int
1206 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1207 	struct nvme_completion* compl)
1208 {
1209 	uint16_t qid = command->cdw10 & 0xffff;
1210 
1211 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1212 	if (qid == 0 || qid > sc->num_squeues ||
1213 	    (sc->submit_queues[qid].qbase == NULL)) {
1214 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1215 		        __func__, qid, sc->num_squeues);
1216 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1217 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1218 		return (1);
1219 	}
1220 
1221 	sc->submit_queues[qid].qbase = NULL;
1222 	sc->submit_queues[qid].cqid = 0;
1223 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1224 	return (1);
1225 }
1226 
1227 static int
1228 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1229 	struct nvme_completion* compl)
1230 {
1231 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1232 		uint16_t qid = command->cdw10 & 0xffff;
1233 		struct nvme_submission_queue *nsq;
1234 
1235 		if ((qid == 0) || (qid > sc->num_squeues) ||
1236 		    (sc->submit_queues[qid].qbase != NULL)) {
1237 			WPRINTF("%s queue index %u > num_squeues %u",
1238 			        __func__, qid, sc->num_squeues);
1239 			pci_nvme_status_tc(&compl->status,
1240 			    NVME_SCT_COMMAND_SPECIFIC,
1241 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1242 			return (1);
1243 		}
1244 
1245 		nsq = &sc->submit_queues[qid];
1246 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1247 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1248 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1249 			/*
1250 			 * Queues must specify at least two entries
1251 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1252 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1253 			 */
1254 			pci_nvme_status_tc(&compl->status,
1255 			    NVME_SCT_COMMAND_SPECIFIC,
1256 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1257 			return (1);
1258 		}
1259 		nsq->head = nsq->tail = 0;
1260 
1261 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1262 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1263 			pci_nvme_status_tc(&compl->status,
1264 			    NVME_SCT_COMMAND_SPECIFIC,
1265 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1266 			return (1);
1267 		}
1268 
1269 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1270 			pci_nvme_status_tc(&compl->status,
1271 			    NVME_SCT_COMMAND_SPECIFIC,
1272 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1273 			return (1);
1274 		}
1275 
1276 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1277 
1278 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1279 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1280 
1281 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1282 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1283 
1284 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1285 
1286 		DPRINTF("%s completed creating IOSQ qid %u",
1287 		         __func__, qid);
1288 	} else {
1289 		/*
1290 		 * Guest sent a non-contiguous submission queue request.
1291 		 * This setting is unsupported by this emulation.
1292 		 */
1293 		WPRINTF("%s unsupported non-contig (list-based) "
1294 		         "create i/o submission queue", __func__);
1295 
1296 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1297 	}
1298 	return (1);
1299 }
1300 
1301 static int
1302 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1303 	struct nvme_completion* compl)
1304 {
1305 	uint16_t qid = command->cdw10 & 0xffff;
1306 	uint16_t sqid;
1307 
1308 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1309 	if (qid == 0 || qid > sc->num_cqueues ||
1310 	    (sc->compl_queues[qid].qbase == NULL)) {
1311 		WPRINTF("%s queue index %u / num_cqueues %u",
1312 		        __func__, qid, sc->num_cqueues);
1313 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1314 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1315 		return (1);
1316 	}
1317 
1318 	/* Deleting an Active CQ is an error */
1319 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1320 		if (sc->submit_queues[sqid].cqid == qid) {
1321 			pci_nvme_status_tc(&compl->status,
1322 			    NVME_SCT_COMMAND_SPECIFIC,
1323 			    NVME_SC_INVALID_QUEUE_DELETION);
1324 			return (1);
1325 		}
1326 
1327 	sc->compl_queues[qid].qbase = NULL;
1328 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1329 	return (1);
1330 }
1331 
1332 static int
1333 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1334 	struct nvme_completion* compl)
1335 {
1336 	struct nvme_completion_queue *ncq;
1337 	uint16_t qid = command->cdw10 & 0xffff;
1338 
1339 	/* Only support Physically Contiguous queues */
1340 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1341 		WPRINTF("%s unsupported non-contig (list-based) "
1342 		         "create i/o completion queue",
1343 		         __func__);
1344 
1345 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1346 		return (1);
1347 	}
1348 
1349 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1350 	    (sc->compl_queues[qid].qbase != NULL)) {
1351 		WPRINTF("%s queue index %u > num_cqueues %u",
1352 			__func__, qid, sc->num_cqueues);
1353 		pci_nvme_status_tc(&compl->status,
1354 		    NVME_SCT_COMMAND_SPECIFIC,
1355 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1356 		return (1);
1357 	}
1358 
1359 	ncq = &sc->compl_queues[qid];
1360 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1361 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1362 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1363 		pci_nvme_status_tc(&compl->status,
1364 		    NVME_SCT_COMMAND_SPECIFIC,
1365 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1366 		return (1);
1367 	}
1368 
1369 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1370 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1371 		/*
1372 		 * Queues must specify at least two entries
1373 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1374 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1375 		 */
1376 		pci_nvme_status_tc(&compl->status,
1377 		    NVME_SCT_COMMAND_SPECIFIC,
1378 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1379 		return (1);
1380 	}
1381 	ncq->head = ncq->tail = 0;
1382 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1383 		     command->prp1,
1384 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1385 
1386 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1387 
1388 
1389 	return (1);
1390 }
1391 
1392 static int
1393 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1394 	struct nvme_completion* compl)
1395 {
1396 	uint64_t logoff;
1397 	uint32_t logsize;
1398 	uint8_t logpage = command->cdw10 & 0xFF;
1399 
1400 #ifndef __FreeBSD__
1401 	logsize = 0;
1402 #endif
1403 
1404 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1405 
1406 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1407 
1408 	/*
1409 	 * Command specifies the number of dwords to return in fields NUMDU
1410 	 * and NUMDL. This is a zero-based value.
1411 	 */
1412 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1413 	logsize *= sizeof(uint32_t);
1414 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1415 
1416 	switch (logpage) {
1417 	case NVME_LOG_ERROR:
1418 		if (logoff >= sizeof(sc->err_log)) {
1419 			pci_nvme_status_genc(&compl->status,
1420 			    NVME_SC_INVALID_FIELD);
1421 			break;
1422 		}
1423 
1424 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1425 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1426 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1427 		    NVME_COPY_TO_PRP);
1428 		break;
1429 	case NVME_LOG_HEALTH_INFORMATION:
1430 		if (logoff >= sizeof(sc->health_log)) {
1431 			pci_nvme_status_genc(&compl->status,
1432 			    NVME_SC_INVALID_FIELD);
1433 			break;
1434 		}
1435 
1436 		pthread_mutex_lock(&sc->mtx);
1437 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1438 		    sizeof(sc->health_log.data_units_read));
1439 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1440 		    sizeof(sc->health_log.data_units_written));
1441 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1442 		    sizeof(sc->health_log.host_read_commands));
1443 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1444 		    sizeof(sc->health_log.host_write_commands));
1445 		pthread_mutex_unlock(&sc->mtx);
1446 
1447 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1448 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1449 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1450 		    NVME_COPY_TO_PRP);
1451 		break;
1452 	case NVME_LOG_FIRMWARE_SLOT:
1453 		if (logoff >= sizeof(sc->fw_log)) {
1454 			pci_nvme_status_genc(&compl->status,
1455 			    NVME_SC_INVALID_FIELD);
1456 			break;
1457 		}
1458 
1459 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1460 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1461 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1462 		    NVME_COPY_TO_PRP);
1463 		break;
1464 	case NVME_LOG_CHANGED_NAMESPACE:
1465 		if (logoff >= sizeof(sc->ns_log)) {
1466 			pci_nvme_status_genc(&compl->status,
1467 			    NVME_SC_INVALID_FIELD);
1468 			break;
1469 		}
1470 
1471 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1472 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1473 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1474 		    NVME_COPY_TO_PRP);
1475 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1476 		break;
1477 	default:
1478 		DPRINTF("%s get log page %x command not supported",
1479 		        __func__, logpage);
1480 
1481 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1482 		    NVME_SC_INVALID_LOG_PAGE);
1483 	}
1484 
1485 	return (1);
1486 }
1487 
1488 static int
1489 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1490 	struct nvme_completion* compl)
1491 {
1492 	void *dest;
1493 	uint16_t status;
1494 
1495 #ifndef __FreeBSD__
1496 	status = 0;
1497 #endif
1498 
1499 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1500 	        command->cdw10 & 0xFF, command->nsid);
1501 
1502 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1503 
1504 	switch (command->cdw10 & 0xFF) {
1505 	case 0x00: /* return Identify Namespace data structure */
1506 		/* Global NS only valid with NS Management */
1507 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1508 			pci_nvme_status_genc(&status,
1509 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1510 			break;
1511 		}
1512 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1513 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1514 		    NVME_COPY_TO_PRP);
1515 		break;
1516 	case 0x01: /* return Identify Controller data structure */
1517 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1518 		    command->prp2, (uint8_t *)&sc->ctrldata,
1519 		    sizeof(sc->ctrldata),
1520 		    NVME_COPY_TO_PRP);
1521 		break;
1522 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1523 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1524 		                  sizeof(uint32_t) * 1024);
1525 		/* All unused entries shall be zero */
1526 		bzero(dest, sizeof(uint32_t) * 1024);
1527 		((uint32_t *)dest)[0] = 1;
1528 		break;
1529 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1530 		if (command->nsid != 1) {
1531 			pci_nvme_status_genc(&status,
1532 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1533 			break;
1534 		}
1535 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1536 		                  sizeof(uint32_t) * 1024);
1537 		/* All bytes after the descriptor shall be zero */
1538 		bzero(dest, sizeof(uint32_t) * 1024);
1539 
1540 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1541 		((uint8_t *)dest)[0] = 1;
1542 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1543 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1544 		break;
1545 	default:
1546 		DPRINTF("%s unsupported identify command requested 0x%x",
1547 		         __func__, command->cdw10 & 0xFF);
1548 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1549 		break;
1550 	}
1551 
1552 	compl->status = status;
1553 	return (1);
1554 }
1555 
1556 static const char *
1557 nvme_fid_to_name(uint8_t fid)
1558 {
1559 	const char *name;
1560 
1561 	switch (fid) {
1562 	case NVME_FEAT_ARBITRATION:
1563 		name = "Arbitration";
1564 		break;
1565 	case NVME_FEAT_POWER_MANAGEMENT:
1566 		name = "Power Management";
1567 		break;
1568 	case NVME_FEAT_LBA_RANGE_TYPE:
1569 		name = "LBA Range Type";
1570 		break;
1571 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1572 		name = "Temperature Threshold";
1573 		break;
1574 	case NVME_FEAT_ERROR_RECOVERY:
1575 		name = "Error Recovery";
1576 		break;
1577 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1578 		name = "Volatile Write Cache";
1579 		break;
1580 	case NVME_FEAT_NUMBER_OF_QUEUES:
1581 		name = "Number of Queues";
1582 		break;
1583 	case NVME_FEAT_INTERRUPT_COALESCING:
1584 		name = "Interrupt Coalescing";
1585 		break;
1586 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1587 		name = "Interrupt Vector Configuration";
1588 		break;
1589 	case NVME_FEAT_WRITE_ATOMICITY:
1590 		name = "Write Atomicity Normal";
1591 		break;
1592 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1593 		name = "Asynchronous Event Configuration";
1594 		break;
1595 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1596 		name = "Autonomous Power State Transition";
1597 		break;
1598 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1599 		name = "Host Memory Buffer";
1600 		break;
1601 	case NVME_FEAT_TIMESTAMP:
1602 		name = "Timestamp";
1603 		break;
1604 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1605 		name = "Keep Alive Timer";
1606 		break;
1607 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1608 		name = "Host Controlled Thermal Management";
1609 		break;
1610 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1611 		name = "Non-Operation Power State Config";
1612 		break;
1613 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1614 		name = "Read Recovery Level Config";
1615 		break;
1616 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1617 		name = "Predictable Latency Mode Config";
1618 		break;
1619 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1620 		name = "Predictable Latency Mode Window";
1621 		break;
1622 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1623 		name = "LBA Status Information Report Interval";
1624 		break;
1625 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1626 		name = "Host Behavior Support";
1627 		break;
1628 	case NVME_FEAT_SANITIZE_CONFIG:
1629 		name = "Sanitize Config";
1630 		break;
1631 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1632 		name = "Endurance Group Event Configuration";
1633 		break;
1634 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1635 		name = "Software Progress Marker";
1636 		break;
1637 	case NVME_FEAT_HOST_IDENTIFIER:
1638 		name = "Host Identifier";
1639 		break;
1640 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1641 		name = "Reservation Notification Mask";
1642 		break;
1643 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1644 		name = "Reservation Persistence";
1645 		break;
1646 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1647 		name = "Namespace Write Protection Config";
1648 		break;
1649 	default:
1650 		name = "Unknown";
1651 		break;
1652 	}
1653 
1654 	return (name);
1655 }
1656 
1657 static void
1658 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1659     struct nvme_feature_obj *feat,
1660     struct nvme_command *command,
1661     struct nvme_completion *compl)
1662 {
1663 
1664 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1665 }
1666 
1667 static void
1668 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1669     struct nvme_feature_obj *feat,
1670     struct nvme_command *command,
1671     struct nvme_completion *compl)
1672 {
1673 	uint32_t i;
1674 	uint32_t cdw11 = command->cdw11;
1675 	uint16_t iv;
1676 	bool cd;
1677 
1678 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1679 
1680 	iv = cdw11 & 0xffff;
1681 	cd = cdw11 & (1 << 16);
1682 
1683 	if (iv > (sc->max_queues + 1)) {
1684 		return;
1685 	}
1686 
1687 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1688 	if ((iv == 0) && !cd)
1689 		return;
1690 
1691 	/* Requested Interrupt Vector must be used by a CQ */
1692 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1693 		if (sc->compl_queues[i].intr_vec == iv) {
1694 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1695 		}
1696 	}
1697 }
1698 
1699 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1700 static void
1701 nvme_feature_async_event(struct pci_nvme_softc *sc,
1702     struct nvme_feature_obj *feat,
1703     struct nvme_command *command,
1704     struct nvme_completion *compl)
1705 {
1706 
1707 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1708 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1709 }
1710 
1711 #define NVME_TEMP_THRESH_OVER	0
1712 #define NVME_TEMP_THRESH_UNDER	1
1713 static void
1714 nvme_feature_temperature(struct pci_nvme_softc *sc,
1715     struct nvme_feature_obj *feat,
1716     struct nvme_command *command,
1717     struct nvme_completion *compl)
1718 {
1719 	uint16_t	tmpth;	/* Temperature Threshold */
1720 	uint8_t		tmpsel; /* Threshold Temperature Select */
1721 	uint8_t		thsel;  /* Threshold Type Select */
1722 	bool		set_crit = false;
1723 
1724 	tmpth  = command->cdw11 & 0xffff;
1725 	tmpsel = (command->cdw11 >> 16) & 0xf;
1726 	thsel  = (command->cdw11 >> 20) & 0x3;
1727 
1728 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1729 
1730 	/* Check for unsupported values */
1731 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1732 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1733 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1734 		return;
1735 	}
1736 
1737 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1738 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1739 		set_crit = true;
1740 
1741 	pthread_mutex_lock(&sc->mtx);
1742 	if (set_crit)
1743 		sc->health_log.critical_warning |=
1744 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1745 	else
1746 		sc->health_log.critical_warning &=
1747 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1748 	pthread_mutex_unlock(&sc->mtx);
1749 
1750 	if (set_crit)
1751 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1752 		    sc->health_log.critical_warning);
1753 
1754 
1755 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1756 }
1757 
1758 static void
1759 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1760     struct nvme_feature_obj *feat,
1761     struct nvme_command *command,
1762     struct nvme_completion *compl)
1763 {
1764 	uint16_t nqr;	/* Number of Queues Requested */
1765 
1766 	if (sc->num_q_is_set) {
1767 		WPRINTF("%s: Number of Queues already set", __func__);
1768 		pci_nvme_status_genc(&compl->status,
1769 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1770 		return;
1771 	}
1772 
1773 	nqr = command->cdw11 & 0xFFFF;
1774 	if (nqr == 0xffff) {
1775 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1776 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1777 		return;
1778 	}
1779 
1780 	sc->num_squeues = ONE_BASED(nqr);
1781 	if (sc->num_squeues > sc->max_queues) {
1782 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1783 					sc->max_queues);
1784 		sc->num_squeues = sc->max_queues;
1785 	}
1786 
1787 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1788 	if (nqr == 0xffff) {
1789 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1790 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1791 		return;
1792 	}
1793 
1794 	sc->num_cqueues = ONE_BASED(nqr);
1795 	if (sc->num_cqueues > sc->max_queues) {
1796 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1797 					sc->max_queues);
1798 		sc->num_cqueues = sc->max_queues;
1799 	}
1800 
1801 	/* Patch the command value which will be saved on callback's return */
1802 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1803 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1804 
1805 	sc->num_q_is_set = true;
1806 }
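/*
 * Editor's example of the cdw11 encoding handled above (value is
 * hypothetical): cdw11 = 0x00030007 requests NSQR = 7 and NCQR = 3.
 * Both fields are zero-based, so the guest is asking for 8 submission
 * and 4 completion queues; the counts are clamped to max_queues and the
 * (again zero-based) allocation is echoed back in completion dword 0.
 */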
1807 
1808 static int
1809 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1810 	struct nvme_completion *compl)
1811 {
1812 	struct nvme_feature_obj *feat;
1813 	uint32_t nsid = command->nsid;
1814 	uint8_t fid = command->cdw10 & 0xFF;
1815 
1816 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1817 
1818 	if (fid >= NVME_FID_MAX) {
1819 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1820 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1821 		return (1);
1822 	}
1823 	feat = &sc->feat[fid];
1824 
1825 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1826 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1827 		return (1);
1828 	}
1829 
1830 	if (!feat->namespace_specific &&
1831 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1832 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1833 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1834 		return (1);
1835 	}
1836 
1837 	compl->cdw0 = 0;
1838 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1839 
1840 	if (feat->set)
1841 		feat->set(sc, feat, command, compl);
1842 
1843 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1844 	if (compl->status == NVME_SC_SUCCESS) {
1845 		feat->cdw11 = command->cdw11;
1846 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1847 		    (command->cdw11 != 0))
1848 			pci_nvme_aen_notify(sc);
1849 	}
1850 
1851 	return (0);
1852 }
1853 
1854 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1855 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1856 
1857 static int
1858 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1859 	struct nvme_completion* compl)
1860 {
1861 	struct nvme_feature_obj *feat;
1862 	uint8_t fid = command->cdw10 & 0xFF;
1863 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1864 
1865 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1866 
1867 	if (fid >= NVME_FID_MAX) {
1868 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1869 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1870 		return (1);
1871 	}
1872 
1873 	compl->cdw0 = 0;
1874 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1875 
1876 	feat = &sc->feat[fid];
1877 	if (feat->get) {
1878 		feat->get(sc, feat, command, compl);
1879 	}
1880 
1881 	if (compl->status == NVME_SC_SUCCESS) {
1882 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1883 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1884 		else
1885 			compl->cdw0 = feat->cdw11;
1886 	}
1887 
1888 	return (0);
1889 }
1890 
1891 static int
1892 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1893 	struct nvme_completion* compl)
1894 {
1895 	uint8_t	ses, lbaf, pi;
1896 
1897 	/* Only supports Secure Erase Setting - User Data Erase */
1898 	ses = (command->cdw10 >> 9) & 0x7;
1899 	if (ses > 0x1) {
1900 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1901 		return (1);
1902 	}
1903 
1904 	/* Only supports a single LBA Format */
1905 	lbaf = command->cdw10 & 0xf;
1906 	if (lbaf != 0) {
1907 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1908 		    NVME_SC_INVALID_FORMAT);
1909 		return (1);
1910 	}
1911 
1912 	/* Doesn't support Protection Information */
1913 	pi = (command->cdw10 >> 5) & 0x7;
1914 	if (pi != 0) {
1915 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1916 		return (1);
1917 	}
1918 
1919 	if (sc->nvstore.type == NVME_STOR_RAM) {
1920 		if (sc->nvstore.ctx)
1921 			free(sc->nvstore.ctx);
1922 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1923 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1924 	} else {
1925 		struct pci_nvme_ioreq *req;
1926 		int err;
1927 
1928 		req = pci_nvme_get_ioreq(sc);
1929 		if (req == NULL) {
1930 			pci_nvme_status_genc(&compl->status,
1931 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1932 			WPRINTF("%s: unable to allocate IO req", __func__);
1933 			return (1);
1934 		}
1935 		req->nvme_sq = &sc->submit_queues[0];
1936 		req->sqid = 0;
1937 		req->opc = command->opc;
1938 		req->cid = command->cid;
1939 		req->nsid = command->nsid;
1940 
1941 		req->io_req.br_offset = 0;
1942 		req->io_req.br_resid = sc->nvstore.size;
1943 		req->io_req.br_callback = pci_nvme_io_done;
1944 
1945 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1946 		if (err) {
1947 			pci_nvme_status_genc(&compl->status,
1948 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1949 			pci_nvme_release_ioreq(sc, req);
1950 		} else
1951 			compl->status = NVME_NO_STATUS;
1952 	}
1953 
1954 	return (1);
1955 }
1956 
1957 static int
1958 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1959 	struct nvme_completion* compl)
1960 {
1961 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1962 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1963 
1964 	/* TODO: search for the command ID and abort it */
1965 
1966 	compl->cdw0 = 1;
1967 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1968 	return (1);
1969 }
1970 
1971 static int
1972 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1973 	struct nvme_command* command, struct nvme_completion* compl)
1974 {
1975 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1976 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1977 
1978 	/* Don't exceed the Async Event Request Limit (AERL). */
1979 	if (pci_nvme_aer_limit_reached(sc)) {
1980 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1981 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1982 		return (1);
1983 	}
1984 
1985 	if (pci_nvme_aer_add(sc, command->cid)) {
1986 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1987 				NVME_SC_INTERNAL_DEVICE_ERROR);
1988 		return (1);
1989 	}
1990 
1991 	/*
1992 	 * Events enabled via the Set Features command are raised
1993 	 * asynchronously, so defer this completion (NVME_NO_STATUS) until an
1994 	 * event matching the request actually occurs.
1995 	 */
1996 	compl->status = NVME_NO_STATUS;
1997 	pci_nvme_aen_notify(sc);
1998 
1999 	return (0);
2000 }
2001 
2002 static void
2003 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2004 {
2005 	struct nvme_completion compl;
2006 	struct nvme_command *cmd;
2007 	struct nvme_submission_queue *sq;
2008 	struct nvme_completion_queue *cq;
2009 	uint16_t sqhead;
2010 
2011 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2012 
2013 	sq = &sc->submit_queues[0];
2014 	cq = &sc->compl_queues[0];
2015 
2016 	pthread_mutex_lock(&sq->mtx);
2017 
2018 	sqhead = sq->head;
2019 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2020 
2021 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2022 		cmd = &(sq->qbase)[sqhead];
2023 		compl.cdw0 = 0;
2024 		compl.status = 0;
2025 
2026 		switch (cmd->opc) {
2027 		case NVME_OPC_DELETE_IO_SQ:
2028 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2029 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2030 			break;
2031 		case NVME_OPC_CREATE_IO_SQ:
2032 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2033 			nvme_opc_create_io_sq(sc, cmd, &compl);
2034 			break;
2035 		case NVME_OPC_DELETE_IO_CQ:
2036 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2037 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2038 			break;
2039 		case NVME_OPC_CREATE_IO_CQ:
2040 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2041 			nvme_opc_create_io_cq(sc, cmd, &compl);
2042 			break;
2043 		case NVME_OPC_GET_LOG_PAGE:
2044 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2045 			nvme_opc_get_log_page(sc, cmd, &compl);
2046 			break;
2047 		case NVME_OPC_IDENTIFY:
2048 			DPRINTF("%s command IDENTIFY", __func__);
2049 			nvme_opc_identify(sc, cmd, &compl);
2050 			break;
2051 		case NVME_OPC_ABORT:
2052 			DPRINTF("%s command ABORT", __func__);
2053 			nvme_opc_abort(sc, cmd, &compl);
2054 			break;
2055 		case NVME_OPC_SET_FEATURES:
2056 			DPRINTF("%s command SET_FEATURES", __func__);
2057 			nvme_opc_set_features(sc, cmd, &compl);
2058 			break;
2059 		case NVME_OPC_GET_FEATURES:
2060 			DPRINTF("%s command GET_FEATURES", __func__);
2061 			nvme_opc_get_features(sc, cmd, &compl);
2062 			break;
2063 		case NVME_OPC_FIRMWARE_ACTIVATE:
2064 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2065 			pci_nvme_status_tc(&compl.status,
2066 			    NVME_SCT_COMMAND_SPECIFIC,
2067 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2068 			break;
2069 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2070 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2071 			nvme_opc_async_event_req(sc, cmd, &compl);
2072 			break;
2073 		case NVME_OPC_FORMAT_NVM:
2074 			DPRINTF("%s command FORMAT_NVM", __func__);
2075 			if ((sc->ctrldata.oacs &
2076 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2077 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2078 				break;
2079 			}
2080 			nvme_opc_format_nvm(sc, cmd, &compl);
2081 			break;
2082 		case NVME_OPC_SECURITY_SEND:
2083 		case NVME_OPC_SECURITY_RECEIVE:
2084 		case NVME_OPC_SANITIZE:
2085 		case NVME_OPC_GET_LBA_STATUS:
2086 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2087 			    cmd->opc);
2088 			/* Valid but unsupported opcodes */
2089 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2090 			break;
2091 		default:
2092 			DPRINTF("%s command OPC=%#X (not implemented)",
2093 			    __func__,
2094 			    cmd->opc);
2095 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2096 		}
2097 		sqhead = (sqhead + 1) % sq->size;
2098 
2099 		if (NVME_COMPLETION_VALID(compl)) {
2100 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2101 			    compl.cdw0,
2102 			    cmd->cid,
2103 			    0,		/* SQID */
2104 			    compl.status);
2105 		}
2106 	}
2107 
2108 	DPRINTF("setting sqhead %u", sqhead);
2109 	sq->head = sqhead;
2110 
2111 	if (cq->head != cq->tail)
2112 		pci_generate_msix(sc->nsc_pi, 0);
2113 
2114 	pthread_mutex_unlock(&sq->mtx);
2115 }
2116 
2117 /*
2118  * Update the Write and Read statistics reported in SMART data
2119  *
2120  * NVMe defines a "data unit" as one thousand 512 byte blocks, rounded up.
2121  * E.g. 1 data unit covers 1 - 1,000 512 byte blocks and 3 data units cover
2122  * 2,001 - 3,000 blocks. Rounding up is achieved by initializing the remainder to 999.
2123  */
2124 static void
2125 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2126     size_t bytes, uint16_t status)
2127 {
2128 
2129 	pthread_mutex_lock(&sc->mtx);
2130 	switch (opc) {
2131 	case NVME_OPC_WRITE:
2132 		sc->write_commands++;
2133 		if (status != NVME_SC_SUCCESS)
2134 			break;
2135 		sc->write_dunits_remainder += (bytes / 512);
2136 		while (sc->write_dunits_remainder >= 1000) {
2137 			sc->write_data_units++;
2138 			sc->write_dunits_remainder -= 1000;
2139 		}
2140 		break;
2141 	case NVME_OPC_READ:
2142 		sc->read_commands++;
2143 		if (status != NVME_SC_SUCCESS)
2144 			break;
2145 		sc->read_dunits_remainder += (bytes / 512);
2146 		while (sc->read_dunits_remainder >= 1000) {
2147 			sc->read_data_units++;
2148 			sc->read_dunits_remainder -= 1000;
2149 		}
2150 		break;
2151 	default:
2152 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2153 		break;
2154 	}
2155 	pthread_mutex_unlock(&sc->mtx);
2156 }
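/*
 * Illustrative sketch (editor's addition, not part of the emulation):
 * assuming the *_dunits_remainder fields start at 999 as described above,
 * the very first 512 byte (one block) write pushes the remainder to 1000
 * and records one data unit, i.e. any partial thousand rounds up:
 *
 *	remainder = 999;
 *	remainder += 512 / 512;		// one block written
 *	if (remainder >= 1000) {	// 1000 -> count one data unit
 *		data_units++;
 *		remainder -= 1000;	// back to 0
 *	}
 *
 * A subsequent 1,000-block write then accounts for exactly one more unit.
 */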
2157 
2158 /*
2159  * Check if the combination of Starting LBA (slba) and number of blocks
2160  * exceeds the range of the underlying storage.
2161  *
2162  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2163  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2164  * overflow.
2165  */
2166 static bool
2167 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2168     uint32_t nblocks)
2169 {
2170 	size_t	offset, bytes;
2171 
2172 	/* Overflow check of multiplying Starting LBA by the sector size */
2173 	if (slba >> (64 - nvstore->sectsz_bits))
2174 		return (true);
2175 
2176 	offset = slba << nvstore->sectsz_bits;
2177 	bytes = nblocks << nvstore->sectsz_bits;
2178 
2179 	/* Overflow check of Number of Logical Blocks */
2180 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2181 		return (true);
2182 
2183 	return (false);
2184 }
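/*
 * Worked example (editor's addition; numbers are hypothetical): with
 * 512 byte sectors (sectsz_bits = 9) and a 1 MiB backing store
 * (nvstore->size = 1048576, i.e. 2048 blocks):
 *
 *	slba = 2040, nblocks = 16
 *	offset = 2040 << 9 = 1044480
 *	bytes  =   16 << 9 =    8192
 *	size - offset = 4096 < bytes	-> out of range
 *
 * The initial "slba >> (64 - sectsz_bits)" test rejects any slba whose
 * byte offset would not fit in 64 bits before the shift is performed.
 */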
2185 
2186 static int
2187 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2188 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2189 {
2190 	int iovidx;
2191 
2192 	if (req == NULL)
2193 		return (-1);
2194 
2195 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2196 		return (-1);
2197 	}
2198 
2199 	/* Concatenate contiguous guest ranges to minimize the number of iovs */
2200 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2201 		iovidx = req->io_req.br_iovcnt - 1;
2202 
2203 		req->io_req.br_iov[iovidx].iov_base =
2204 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2205 				     req->prev_gpaddr, size);
2206 
2207 		req->prev_size += size;
2208 		req->io_req.br_resid += size;
2209 
2210 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2211 	} else {
2212 		iovidx = req->io_req.br_iovcnt;
2213 		if (iovidx == 0) {
2214 			req->io_req.br_offset = lba;
2215 			req->io_req.br_resid = 0;
2216 			req->io_req.br_param = req;
2217 		}
2218 
2219 		req->io_req.br_iov[iovidx].iov_base =
2220 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2221 				     gpaddr, size);
2222 
2223 		req->io_req.br_iov[iovidx].iov_len = size;
2224 
2225 		req->prev_gpaddr = gpaddr;
2226 		req->prev_size = size;
2227 		req->io_req.br_resid += size;
2228 
2229 		req->io_req.br_iovcnt++;
2230 	}
2231 
2232 	return (0);
2233 }
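/*
 * Editor's sketch of the coalescing path above (addresses are
 * hypothetical): two consecutive 4 KiB PRP entries at guest-physical
 * 0x10000 and 0x11000 satisfy (prev_gpaddr + prev_size) == gpaddr, so the
 * second call extends the existing iov to 8 KiB instead of consuming
 * another br_iov slot:
 *
 *	pci_nvme_append_iov_req(sc, req, 0x10000, 4096, ...);	// iovcnt = 1
 *	pci_nvme_append_iov_req(sc, req, 0x11000, 4096, ...);	// still 1,
 *							// iov_len grows to 8192
 */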
2234 
2235 static void
2236 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2237 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2238 	uint32_t cdw0, uint16_t status)
2239 {
2240 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2241 
2242 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2243 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2244 		 NVME_STATUS_GET_SC(status));
2245 
2246 	pci_nvme_cq_update(sc, cq,
2247 	    0,		/* CDW0 */
2248 	    cid,
2249 	    sqid,
2250 	    status);
2251 
2252 	if (cq->head != cq->tail) {
2253 		if (cq->intr_en & NVME_CQ_INTEN) {
2254 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2255 		} else {
2256 			DPRINTF("%s: CQ%u interrupt disabled",
2257 						__func__, sq->cqid);
2258 		}
2259 	}
2260 }
2261 
2262 static void
2263 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2264 {
2265 	req->sc = NULL;
2266 	req->nvme_sq = NULL;
2267 	req->sqid = 0;
2268 
2269 	pthread_mutex_lock(&sc->mtx);
2270 
2271 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2272 	sc->pending_ios--;
2273 
2274 	/* Once no more I/O is pending, mark the controller ready if it is enabled but not yet ready */
2275 	if (sc->pending_ios == 0 &&
2276 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2277 		sc->regs.csts |= NVME_CSTS_RDY;
2278 
2279 	pthread_mutex_unlock(&sc->mtx);
2280 
2281 	sem_post(&sc->iosemlock);
2282 }
2283 
2284 static struct pci_nvme_ioreq *
2285 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2286 {
2287 	struct pci_nvme_ioreq *req = NULL;
2288 
2289 	sem_wait(&sc->iosemlock);
2290 	pthread_mutex_lock(&sc->mtx);
2291 
2292 	req = STAILQ_FIRST(&sc->ioreqs_free);
2293 	assert(req != NULL);
2294 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2295 
2296 	req->sc = sc;
2297 
2298 	sc->pending_ios++;
2299 
2300 	pthread_mutex_unlock(&sc->mtx);
2301 
2302 	req->io_req.br_iovcnt = 0;
2303 	req->io_req.br_offset = 0;
2304 	req->io_req.br_resid = 0;
2305 	req->io_req.br_param = req;
2306 	req->prev_gpaddr = 0;
2307 	req->prev_size = 0;
2308 
2309 	return req;
2310 }
2311 
2312 static void
2313 pci_nvme_io_done(struct blockif_req *br, int err)
2314 {
2315 	struct pci_nvme_ioreq *req = br->br_param;
2316 	struct nvme_submission_queue *sq = req->nvme_sq;
2317 	uint16_t code, status;
2318 
2319 #ifndef __FreeBSD__
2320 	status = 0;
2321 #endif
2322 
2323 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2324 
2325 	/* TODO return correct error */
2326 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2327 	pci_nvme_status_genc(&status, code);
2328 
2329 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2330 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2331 	    req->bytes, status);
2332 	pci_nvme_release_ioreq(req->sc, req);
2333 }
2334 
2335 /*
2336  * Implements the Flush command. The specification states:
2337  *    If a volatile write cache is not present, Flush commands complete
2338  *    successfully and have no effect
2339  * in the description of the Volatile Write Cache (VWC) field of the Identify
2340  * Controller data. Therefore, set status to Success if the command is
2341  * not supported (i.e. RAM or as indicated by the blockif).
2342  */
2343 static bool
2344 nvme_opc_flush(struct pci_nvme_softc *sc,
2345     struct nvme_command *cmd,
2346     struct pci_nvme_blockstore *nvstore,
2347     struct pci_nvme_ioreq *req,
2348     uint16_t *status)
2349 {
2350 	bool pending = false;
2351 
2352 	if (nvstore->type == NVME_STOR_RAM) {
2353 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2354 	} else {
2355 		int err;
2356 
2357 		req->io_req.br_callback = pci_nvme_io_done;
2358 
2359 		err = blockif_flush(nvstore->ctx, &req->io_req);
2360 		switch (err) {
2361 		case 0:
2362 			pending = true;
2363 			break;
2364 		case EOPNOTSUPP:
2365 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2366 			break;
2367 		default:
2368 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2369 		}
2370 	}
2371 
2372 	return (pending);
2373 }
2374 
2375 static uint16_t
2376 nvme_write_read_ram(struct pci_nvme_softc *sc,
2377     struct pci_nvme_blockstore *nvstore,
2378     uint64_t prp1, uint64_t prp2,
2379     size_t offset, uint64_t bytes,
2380     bool is_write)
2381 {
2382 	uint8_t *buf = nvstore->ctx;
2383 	enum nvme_copy_dir dir;
2384 	uint16_t status;
2385 
2386 #ifndef __FreeBSD__
2387 	status = 0;
2388 #endif
2389 
2390 	if (is_write)
2391 		dir = NVME_COPY_TO_PRP;
2392 	else
2393 		dir = NVME_COPY_FROM_PRP;
2394 
2395 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2396 	    buf + offset, bytes, dir))
2397 		pci_nvme_status_genc(&status,
2398 		    NVME_SC_DATA_TRANSFER_ERROR);
2399 	else
2400 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2401 
2402 	return (status);
2403 }
2404 
2405 static uint16_t
2406 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2407     struct pci_nvme_blockstore *nvstore,
2408     struct pci_nvme_ioreq *req,
2409     uint64_t prp1, uint64_t prp2,
2410     size_t offset, uint64_t bytes,
2411     bool is_write)
2412 {
2413 	uint64_t size;
2414 	int err;
2415 	uint16_t status = NVME_NO_STATUS;
2416 
2417 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2418 	if (pci_nvme_append_iov_req(sc, req, prp1,
2419 	    size, is_write, offset)) {
2420 		pci_nvme_status_genc(&status,
2421 		    NVME_SC_DATA_TRANSFER_ERROR);
2422 		goto out;
2423 	}
2424 
2425 	offset += size;
2426 	bytes  -= size;
2427 
2428 	if (bytes == 0) {
2429 		;
2430 	} else if (bytes <= PAGE_SIZE) {
2431 		size = bytes;
2432 		if (pci_nvme_append_iov_req(sc, req, prp2,
2433 		    size, is_write, offset)) {
2434 			pci_nvme_status_genc(&status,
2435 			    NVME_SC_DATA_TRANSFER_ERROR);
2436 			goto out;
2437 		}
2438 	} else {
2439 		void *vmctx = sc->nsc_pi->pi_vmctx;
2440 		uint64_t *prp_list = &prp2;
2441 		uint64_t *last = prp_list;
2442 
2443 		/* PRP2 is pointer to a physical region page list */
2444 		while (bytes) {
2445 			/* Last entry in list points to the next list */
2446 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2447 				uint64_t prp = *prp_list;
2448 
2449 				prp_list = paddr_guest2host(vmctx, prp,
2450 				    PAGE_SIZE - (prp % PAGE_SIZE));
2451 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2452 			}
2453 
2454 			size = MIN(bytes, PAGE_SIZE);
2455 
2456 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2457 			    size, is_write, offset)) {
2458 				pci_nvme_status_genc(&status,
2459 				    NVME_SC_DATA_TRANSFER_ERROR);
2460 				goto out;
2461 			}
2462 
2463 			offset += size;
2464 			bytes  -= size;
2465 
2466 			prp_list++;
2467 		}
2468 	}
2469 	req->io_req.br_callback = pci_nvme_io_done;
2470 	if (is_write)
2471 		err = blockif_write(nvstore->ctx, &req->io_req);
2472 	else
2473 		err = blockif_read(nvstore->ctx, &req->io_req);
2474 
2475 	if (err)
2476 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2477 out:
2478 	return (status);
2479 }
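/*
 * Editor's note on the PRP handling above (assuming 4 KiB pages and a
 * page-aligned prp1; sizes are illustrative): a 16 KiB transfer uses prp1
 * for the first page and treats prp2 as the guest-physical address of a
 * PRP list holding the remaining three page pointers:
 *
 *	prp1        -> data page 0
 *	prp2 (list) -> [ page 1, page 2, page 3 ]
 *
 * For transfers a single list cannot describe, the final of the
 * NVME_PRP2_ITEMS entries points to the next list, which the loop follows
 * via the (prp_list == last) check.
 */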
2480 
2481 static bool
2482 nvme_opc_write_read(struct pci_nvme_softc *sc,
2483     struct nvme_command *cmd,
2484     struct pci_nvme_blockstore *nvstore,
2485     struct pci_nvme_ioreq *req,
2486     uint16_t *status)
2487 {
2488 	uint64_t lba, nblocks, bytes;
2489 	size_t offset;
2490 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2491 	bool pending = false;
2492 
2493 #ifndef __FreeBSD__
2494 	bytes = 0;
2495 #endif
2496 
2497 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2498 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2499 
2500 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2501 		WPRINTF("%s command would exceed LBA range (slba=%#lx nblocks=%#lx)",
2502 		    __func__, lba, nblocks);
2503 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2504 		goto out;
2505 	}
2506 
2507 	bytes  = nblocks << nvstore->sectsz_bits;
2508 	if (bytes > NVME_MAX_DATA_SIZE) {
2509 		WPRINTF("%s command would exceed MDTS", __func__);
2510 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2511 		goto out;
2512 	}
2513 
2514 	offset = lba << nvstore->sectsz_bits;
2515 
2516 	req->bytes = bytes;
2517 	req->io_req.br_offset = lba;
2518 
2519 	/* PRP bits 1:0 must be zero */
2520 	cmd->prp1 &= ~0x3UL;
2521 	cmd->prp2 &= ~0x3UL;
2522 
2523 	if (nvstore->type == NVME_STOR_RAM) {
2524 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2525 		    cmd->prp2, offset, bytes, is_write);
2526 	} else {
2527 		*status = nvme_write_read_blockif(sc, nvstore, req,
2528 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2529 
2530 		if (*status == NVME_NO_STATUS)
2531 			pending = true;
2532 	}
2533 out:
2534 	if (!pending)
2535 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2536 
2537 	return (pending);
2538 }
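/*
 * Worked example of the command decode above (editor's addition,
 * hypothetical values): cdw10 = 0x100, cdw11 = 0, cdw12 = 0x7 with
 * 512 byte sectors gives
 *
 *	lba     = 0x100
 *	nblocks = (0x7 & 0xFFFF) + 1 = 8	// NLB is zero-based
 *	offset  = 0x100 << 9 = 0x20000
 *	bytes   =     8 << 9 =  0x1000
 */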
2539 
2540 static void
2541 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2542 {
2543 	struct pci_nvme_ioreq *req = br->br_param;
2544 	struct pci_nvme_softc *sc = req->sc;
2545 	bool done = true;
2546 	uint16_t status;
2547 
2548 #ifndef __FreeBSD__
2549 	status = 0;
2550 #endif
2551 
2552 	if (err) {
2553 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2554 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2555 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2556 	} else {
2557 		struct iovec *iov = req->io_req.br_iov;
2558 
2559 		req->prev_gpaddr++;
2560 		iov += req->prev_gpaddr;
2561 
2562 		/* The iov_* values already include the sector size */
2563 		req->io_req.br_offset = (off_t)iov->iov_base;
2564 		req->io_req.br_resid = iov->iov_len;
2565 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2566 			pci_nvme_status_genc(&status,
2567 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2568 		} else
2569 			done = false;
2570 	}
2571 
2572 	if (done) {
2573 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2574 		    req->cid, 0, status);
2575 		pci_nvme_release_ioreq(sc, req);
2576 	}
2577 }
2578 
2579 static bool
2580 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2581     struct nvme_command *cmd,
2582     struct pci_nvme_blockstore *nvstore,
2583     struct pci_nvme_ioreq *req,
2584     uint16_t *status)
2585 {
2586 	struct nvme_dsm_range *range;
2587 	uint32_t nr, r, non_zero, dr;
2588 	int err;
2589 	bool pending = false;
2590 
2591 #ifndef __FreeBSD__
2592 	range = NULL;
2593 #endif
2594 
2595 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2596 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2597 		goto out;
2598 	}
2599 
2600 	nr = cmd->cdw10 & 0xff;
2601 
2602 	/* copy locally because a range entry could straddle PRPs */
2603 	range = calloc(1, NVME_MAX_DSM_TRIM);
2604 	if (range == NULL) {
2605 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2606 		goto out;
2607 	}
2608 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2609 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2610 
2611 	/* Check for invalid ranges and the number of non-zero lengths */
2612 	non_zero = 0;
2613 	for (r = 0; r <= nr; r++) {
2614 		if (pci_nvme_out_of_range(nvstore,
2615 		    range[r].starting_lba, range[r].length)) {
2616 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2617 			goto out;
2618 		}
2619 		if (range[r].length != 0)
2620 			non_zero++;
2621 	}
2622 
2623 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2624 		size_t offset, bytes;
2625 		int sectsz_bits = sc->nvstore.sectsz_bits;
2626 
2627 		/*
2628 		 * DSM calls are advisory only, and compliant controllers
2629 		 * may choose to take no action (i.e. return Success).
2630 		 */
2631 		if (!nvstore->deallocate) {
2632 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2633 			goto out;
2634 		}
2635 
2636 		/* If all ranges have a zero length, return Success */
2637 		if (non_zero == 0) {
2638 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2639 			goto out;
2640 		}
2641 
2642 		if (req == NULL) {
2643 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2644 			goto out;
2645 		}
2646 
2647 		offset = range[0].starting_lba << sectsz_bits;
2648 		bytes = range[0].length << sectsz_bits;
2649 
2650 		/*
2651 		 * If the request is for more than a single range, store
2652 		 * the ranges in the br_iov. Optimize for the common case
2653 		 * of a single range.
2654 		 *
2655 		 * Note that NVMe Number of Ranges is a zero based value
2656 		 */
2657 		req->io_req.br_iovcnt = 0;
2658 		req->io_req.br_offset = offset;
2659 		req->io_req.br_resid = bytes;
2660 
2661 		if (nr == 0) {
2662 			req->io_req.br_callback = pci_nvme_io_done;
2663 		} else {
2664 			struct iovec *iov = req->io_req.br_iov;
2665 
2666 			for (r = 0, dr = 0; r <= nr; r++) {
2667 				offset = range[r].starting_lba << sectsz_bits;
2668 				bytes = range[r].length << sectsz_bits;
2669 				if (bytes == 0)
2670 					continue;
2671 
2672 				if ((nvstore->size - offset) < bytes) {
2673 					pci_nvme_status_genc(status,
2674 					    NVME_SC_LBA_OUT_OF_RANGE);
2675 					goto out;
2676 				}
2677 				iov[dr].iov_base = (void *)offset;
2678 				iov[dr].iov_len = bytes;
2679 				dr++;
2680 			}
2681 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2682 
2683 			/*
2684 			 * Use prev_gpaddr to track the current entry and
2685 			 * prev_size to track the number of entries
2686 			 */
2687 			req->prev_gpaddr = 0;
2688 			req->prev_size = dr;
2689 		}
2690 
2691 		err = blockif_delete(nvstore->ctx, &req->io_req);
2692 		if (err)
2693 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2694 		else
2695 			pending = true;
2696 	}
2697 out:
2698 	free(range);
2699 	return (pending);
2700 }
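/*
 * Editor's summary of the multi-range deallocate path above: for nr > 0
 * the byte offset/length of each non-empty range is stashed in br_iov[]
 * (iov_base holds the offset and iov_len the length -- they are not real
 * pointers here), prev_size holds the number of stashed entries and
 * prev_gpaddr the index of the entry currently being deleted. Each
 * blockif_delete() completion re-enters pci_nvme_dealloc_sm(), which
 * either advances to the next entry or posts the final completion.
 */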
2701 
2702 static void
2703 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2704 {
2705 	struct nvme_submission_queue *sq;
2706 	uint16_t status;
2707 	uint16_t sqhead;
2708 
2709 #ifndef __FreeBSD__
2710 	status = 0;
2711 #endif
2712 
2713 	/* handle all submissions up to sq->tail index */
2714 	sq = &sc->submit_queues[idx];
2715 
2716 	pthread_mutex_lock(&sq->mtx);
2717 
2718 	sqhead = sq->head;
2719 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2720 	         idx, sqhead, sq->tail, sq->qbase);
2721 
2722 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2723 		struct nvme_command *cmd;
2724 		struct pci_nvme_ioreq *req;
2725 		uint32_t nsid;
2726 		bool pending;
2727 
2728 		pending = false;
2729 		req = NULL;
2730 		status = 0;
2731 
2732 		cmd = &sq->qbase[sqhead];
2733 		sqhead = (sqhead + 1) % sq->size;
2734 
2735 		nsid = le32toh(cmd->nsid);
2736 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2737 			pci_nvme_status_genc(&status,
2738 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2739 			status |=
2740 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2741 			goto complete;
2742  		}
2743 
2744 		req = pci_nvme_get_ioreq(sc);
2745 		if (req == NULL) {
2746 			pci_nvme_status_genc(&status,
2747 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2748 			WPRINTF("%s: unable to allocate IO req", __func__);
2749 			goto complete;
2750 		}
2751 		req->nvme_sq = sq;
2752 		req->sqid = idx;
2753 		req->opc = cmd->opc;
2754 		req->cid = cmd->cid;
2755 		req->nsid = cmd->nsid;
2756 
2757 		switch (cmd->opc) {
2758 		case NVME_OPC_FLUSH:
2759 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2760 			    req, &status);
2761  			break;
2762 		case NVME_OPC_WRITE:
2763 		case NVME_OPC_READ:
2764 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2765 			    req, &status);
2766 			break;
2767 		case NVME_OPC_WRITE_ZEROES:
2768 			/* TODO: write zeroes
2769 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2770 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2771 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2772 			break;
2773 		case NVME_OPC_DATASET_MANAGEMENT:
2774  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2775 			    req, &status);
2776 			break;
2777  		default:
2778  			WPRINTF("%s unhandled io command 0x%x",
2779 			    __func__, cmd->opc);
2780 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2781 		}
2782 complete:
2783 		if (!pending) {
2784 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2785 			    status);
2786 			if (req != NULL)
2787 				pci_nvme_release_ioreq(sc, req);
2788 		}
2789 	}
2790 
2791 	sq->head = sqhead;
2792 
2793 	pthread_mutex_unlock(&sq->mtx);
2794 }
2795 
2796 static void
2797 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2798 	uint64_t idx, int is_sq, uint64_t value)
2799 {
2800 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2801 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2802 
2803 	if (is_sq) {
2804 		if (idx > sc->num_squeues) {
2805 			WPRINTF("%s queue index %lu overflow from "
2806 			         "guest (max %u)",
2807 			         __func__, idx, sc->num_squeues);
2808 			return;
2809 		}
2810 
2811 		atomic_store_short(&sc->submit_queues[idx].tail,
2812 		                   (uint16_t)value);
2813 
2814 		if (idx == 0) {
2815 			pci_nvme_handle_admin_cmd(sc, value);
2816 		} else {
2817 			/* submission queue; handle new entries in SQ */
2818 			if (idx > sc->num_squeues) {
2819 				WPRINTF("%s SQ index %lu overflow from "
2820 				         "guest (max %u)",
2821 				         __func__, idx, sc->num_squeues);
2822 				return;
2823 			}
2824 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2825 		}
2826 	} else {
2827 		if (idx > sc->num_cqueues) {
2828 			WPRINTF("%s queue index %lu overflow from "
2829 			         "guest (max %u)",
2830 			         __func__, idx, sc->num_cqueues);
2831 			return;
2832 		}
2833 
2834 		atomic_store_short(&sc->compl_queues[idx].head,
2835 				(uint16_t)value);
2836 	}
2837 }
2838 
2839 static void
2840 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2841 {
2842 	const char *s = iswrite ? "WRITE" : "READ";
2843 
2844 	switch (offset) {
2845 	case NVME_CR_CAP_LOW:
2846 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2847 		break;
2848 	case NVME_CR_CAP_HI:
2849 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2850 		break;
2851 	case NVME_CR_VS:
2852 		DPRINTF("%s %s NVME_CR_VS", func, s);
2853 		break;
2854 	case NVME_CR_INTMS:
2855 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2856 		break;
2857 	case NVME_CR_INTMC:
2858 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2859 		break;
2860 	case NVME_CR_CC:
2861 		DPRINTF("%s %s NVME_CR_CC", func, s);
2862 		break;
2863 	case NVME_CR_CSTS:
2864 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2865 		break;
2866 	case NVME_CR_NSSR:
2867 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2868 		break;
2869 	case NVME_CR_AQA:
2870 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2871 		break;
2872 	case NVME_CR_ASQ_LOW:
2873 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2874 		break;
2875 	case NVME_CR_ASQ_HI:
2876 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2877 		break;
2878 	case NVME_CR_ACQ_LOW:
2879 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2880 		break;
2881 	case NVME_CR_ACQ_HI:
2882 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2883 		break;
2884 	default:
2885 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2886 	}
2887 
2888 }
2889 
2890 static void
2891 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2892 	uint64_t offset, int size, uint64_t value)
2893 {
2894 	uint32_t ccreg;
2895 
2896 	if (offset >= NVME_DOORBELL_OFFSET) {
2897 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2898 		uint64_t idx = belloffset / 8; /* doorbell pair (SQ tail + CQ head) = 8 bytes */
2899 		int is_sq = (belloffset % 8) < 4;
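		/*
		 * Editor's example (assuming the default doorbell stride,
		 * CAP.DSTRD = 0): each queue pair owns 8 bytes, SQ tail
		 * first, CQ head second. A 4 byte write to
		 * NVME_DOORBELL_OFFSET + 0x8 gives belloffset = 8, so
		 * idx = 1 and is_sq is true: the guest rang SQ1's tail
		 * doorbell.
		 */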
2900 
2901 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2902 			WPRINTF("guest attempted an overflow write offset "
2903 			         "0x%lx, val 0x%lx in %s",
2904 			         offset, value, __func__);
2905 			return;
2906 		}
2907 
2908 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2909 		return;
2910 	}
2911 
2912 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2913 	        offset, size, value);
2914 
2915 	if (size != 4) {
2916 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2917 		         "val 0x%lx) to bar0 in %s",
2918 		         size, offset, value, __func__);
2919 		/* TODO: shutdown device */
2920 		return;
2921 	}
2922 
2923 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2924 
2925 	pthread_mutex_lock(&sc->mtx);
2926 
2927 	switch (offset) {
2928 	case NVME_CR_CAP_LOW:
2929 	case NVME_CR_CAP_HI:
2930 		/* readonly */
2931 		break;
2932 	case NVME_CR_VS:
2933 		/* readonly */
2934 		break;
2935 	case NVME_CR_INTMS:
2936 		/* MSI-X, so ignore */
2937 		break;
2938 	case NVME_CR_INTMC:
2939 		/* MSI-X, so ignore */
2940 		break;
2941 	case NVME_CR_CC:
2942 		ccreg = (uint32_t)value;
2943 
2944 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2945 		         "iocqes %u",
2946 		        __func__,
2947 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2948 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2949 			 NVME_CC_GET_IOCQES(ccreg));
2950 
2951 		if (NVME_CC_GET_SHN(ccreg)) {
2952 			/* perform shutdown - flush out data to backend */
2953 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2954 			    NVME_CSTS_REG_SHST_SHIFT);
2955 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2956 			    NVME_CSTS_REG_SHST_SHIFT;
2957 		}
2958 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2959 			if (NVME_CC_GET_EN(ccreg) == 0)
2960 				/* transition 1->0 causes controller reset */
2961 				pci_nvme_reset_locked(sc);
2962 			else
2963 				pci_nvme_init_controller(ctx, sc);
2964 		}
2965 
2966 		/* Insert the iocqes, iosqes and en bits from the write */
2967 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2968 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2969 		if (NVME_CC_GET_EN(ccreg) == 0) {
2970 			/* Insert the ams, mps and css bit fields */
2971 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2972 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2973 			sc->regs.csts &= ~NVME_CSTS_RDY;
2974 		} else if (sc->pending_ios == 0) {
2975 			sc->regs.csts |= NVME_CSTS_RDY;
2976 		}
2977 		break;
2978 	case NVME_CR_CSTS:
2979 		break;
2980 	case NVME_CR_NSSR:
2981 		/* ignore writes; don't support subsystem reset */
2982 		break;
2983 	case NVME_CR_AQA:
2984 		sc->regs.aqa = (uint32_t)value;
2985 		break;
2986 	case NVME_CR_ASQ_LOW:
2987 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2988 		               (0xFFFFF000 & value);
2989 		break;
2990 	case NVME_CR_ASQ_HI:
2991 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2992 		               (value << 32);
2993 		break;
2994 	case NVME_CR_ACQ_LOW:
2995 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2996 		               (0xFFFFF000 & value);
2997 		break;
2998 	case NVME_CR_ACQ_HI:
2999 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3000 		               (value << 32);
3001 		break;
3002 	default:
3003 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3004 		         __func__, offset, value, size);
3005 	}
3006 	pthread_mutex_unlock(&sc->mtx);
3007 }
3008 
3009 static void
3010 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
3011                 int baridx, uint64_t offset, int size, uint64_t value)
3012 {
3013 	struct pci_nvme_softc* sc = pi->pi_arg;
3014 
3015 	if (baridx == pci_msix_table_bar(pi) ||
3016 	    baridx == pci_msix_pba_bar(pi)) {
3017 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3018 		         " value 0x%lx", baridx, offset, size, value);
3019 
3020 		pci_emul_msix_twrite(pi, offset, size, value);
3021 		return;
3022 	}
3023 
3024 	switch (baridx) {
3025 	case 0:
3026 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3027 		break;
3028 
3029 	default:
3030 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3031 		         __func__, baridx, value);
3032 	}
3033 }
3034 
3035 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3036 	uint64_t offset, int size)
3037 {
3038 	uint64_t value;
3039 
3040 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3041 
3042 	if (offset < NVME_DOORBELL_OFFSET) {
3043 		void *p = &(sc->regs);
3044 		pthread_mutex_lock(&sc->mtx);
3045 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3046 		pthread_mutex_unlock(&sc->mtx);
3047 	} else {
3048 		value = 0;
3049 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3050 	}
3051 
3052 	switch (size) {
3053 	case 1:
3054 		value &= 0xFF;
3055 		break;
3056 	case 2:
3057 		value &= 0xFFFF;
3058 		break;
3059 	case 4:
3060 		value &= 0xFFFFFFFF;
3061 		break;
3062 	}
3063 
3064 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3065 	         offset, size, (uint32_t)value);
3066 
3067 	return (value);
3068 }
3069 
3070 
3071 
3072 static uint64_t
3073 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
3074     uint64_t offset, int size)
3075 {
3076 	struct pci_nvme_softc* sc = pi->pi_arg;
3077 
3078 	if (baridx == pci_msix_table_bar(pi) ||
3079 	    baridx == pci_msix_pba_bar(pi)) {
3080 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3081 		        baridx, offset, size);
3082 
3083 		return pci_emul_msix_tread(pi, offset, size);
3084 	}
3085 
3086 	switch (baridx) {
3087 	case 0:
3088 		return pci_nvme_read_bar_0(sc, offset, size);
3089 
3090 	default:
3091 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3092 	}
3093 
3094 	return (0);
3095 }
3096 
3097 static int
3098 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3099 {
3100 	char bident[sizeof("XX:X:X")];
3101 	const char *value;
3102 	uint32_t sectsz;
3103 
3104 	sc->max_queues = NVME_QUEUES;
3105 	sc->max_qentries = NVME_MAX_QENTRIES;
3106 	sc->ioslots = NVME_IOSLOTS;
3107 	sc->num_squeues = sc->max_queues;
3108 	sc->num_cqueues = sc->max_queues;
3109 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3110 	sectsz = 0;
3111 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3112 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3113 
3114 	value = get_config_value_node(nvl, "maxq");
3115 	if (value != NULL)
3116 		sc->max_queues = atoi(value);
3117 	value = get_config_value_node(nvl, "qsz");
3118 	if (value != NULL) {
3119 		sc->max_qentries = atoi(value);
3120 		if (sc->max_qentries <= 0) {
3121 			EPRINTLN("nvme: Invalid qsz option %d",
3122 			    sc->max_qentries);
3123 			return (-1);
3124 		}
3125 	}
3126 	value = get_config_value_node(nvl, "ioslots");
3127 	if (value != NULL) {
3128 		sc->ioslots = atoi(value);
3129 		if (sc->ioslots <= 0) {
3130 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3131 			return (-1);
3132 		}
3133 	}
3134 	value = get_config_value_node(nvl, "sectsz");
3135 	if (value != NULL)
3136 		sectsz = atoi(value);
3137 	value = get_config_value_node(nvl, "ser");
3138 	if (value != NULL) {
3139 		/*
3140 		 * This field indicates the Product Serial Number in
3141 		 * 7-bit ASCII, unused bytes should be space characters.
3142 		 * Ref: NVMe v1.3c.
3143 		 */
3144 		cpywithpad((char *)sc->ctrldata.sn,
3145 		    sizeof(sc->ctrldata.sn), value, ' ');
3146 	}
3147 	value = get_config_value_node(nvl, "eui64");
3148 	if (value != NULL)
3149 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3150 	value = get_config_value_node(nvl, "dsm");
3151 	if (value != NULL) {
3152 		if (strcmp(value, "auto") == 0)
3153 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3154 		else if (strcmp(value, "enable") == 0)
3155 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3156 		else if (strcmp(value, "disable") == 0)
3157 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3158 	}
3159 
3160 	value = get_config_value_node(nvl, "ram");
3161 	if (value != NULL) {
3162 		uint64_t sz = strtoull(value, NULL, 10);
3163 
3164 		sc->nvstore.type = NVME_STOR_RAM;
3165 		sc->nvstore.size = sz * 1024 * 1024;
3166 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3167 		sc->nvstore.sectsz = 4096;
3168 		sc->nvstore.sectsz_bits = 12;
3169 		if (sc->nvstore.ctx == NULL) {
3170 			EPRINTLN("nvme: Unable to allocate RAM");
3171 			return (-1);
3172 		}
3173 	} else {
3174 		snprintf(bident, sizeof(bident), "%d:%d",
3175 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3176 		sc->nvstore.ctx = blockif_open(nvl, bident);
3177 		if (sc->nvstore.ctx == NULL) {
3178 			EPRINTLN("nvme: Could not open backing file: %s",
3179 			    strerror(errno));
3180 			return (-1);
3181 		}
3182 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3183 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3184 	}
3185 
3186 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3187 		sc->nvstore.sectsz = sectsz;
3188 	else if (sc->nvstore.type != NVME_STOR_RAM)
3189 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3190 	for (sc->nvstore.sectsz_bits = 9;
3191 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3192 	     sc->nvstore.sectsz_bits++);
3193 
3194 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3195 		sc->max_queues = NVME_QUEUES;
3196 
3197 	return (0);
3198 }
3199 
3200 static void
3201 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3202 {
3203 	struct pci_nvme_softc *sc;
3204 	struct pci_nvme_blockstore *nvstore;
3205 	struct nvme_namespace_data *nd;
3206 
3207 	sc = arg;
3208 	nvstore = &sc->nvstore;
3209 	nd = &sc->nsdata;
3210 
3211 	nvstore->size = new_size;
3212 	pci_nvme_init_nsdata_size(nvstore, nd);
3213 
3214 	/* Add changed NSID to list */
3215 	sc->ns_log.ns[0] = 1;
3216 	sc->ns_log.ns[1] = 0;
3217 
3218 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3219 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3220 }
3221 
3222 static int
3223 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3224 {
3225 	struct pci_nvme_softc *sc;
3226 	uint32_t pci_membar_sz;
3227 	int	error;
3228 
3229 	error = 0;
3230 
3231 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3232 	pi->pi_arg = sc;
3233 	sc->nsc_pi = pi;
3234 
3235 	error = pci_nvme_parse_config(sc, nvl);
3236 	if (error < 0)
3237 		goto done;
3238 	else
3239 		error = 0;
3240 
3241 	STAILQ_INIT(&sc->ioreqs_free);
3242 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3243 	for (int i = 0; i < sc->ioslots; i++) {
3244 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3245 	}
3246 
3247 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3248 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3249 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3250 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3251 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3252 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3253 
3254 	/*
3255 	 * Allocate size of NVMe registers + doorbell space for all queues.
3256 	 *
3257 	 * The specification requires a minimum memory I/O window size of 16K.
3258 	 * The Windows driver will refuse to start a device with a smaller
3259 	 * window.
3260 	 */
3261 	pci_membar_sz = sizeof(struct nvme_registers) +
3262 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3263 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3264 
3265 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3266 
3267 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3268 	if (error) {
3269 		WPRINTF("%s pci alloc mem bar failed", __func__);
3270 		goto done;
3271 	}
3272 
3273 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3274 	if (error) {
3275 		WPRINTF("%s pci add msixcap failed", __func__);
3276 		goto done;
3277 	}
3278 
3279 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3280 	if (error) {
3281 		WPRINTF("%s pci add Express capability failed", __func__);
3282 		goto done;
3283 	}
3284 
3285 	pthread_mutex_init(&sc->mtx, NULL);
3286 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3287 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3288 
3289 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3290 	/*
3291 	 * Controller data depends on Namespace data so initialize Namespace
3292 	 * data first.
3293 	 */
3294 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3295 	pci_nvme_init_ctrldata(sc);
3296 	pci_nvme_init_logpages(sc);
3297 	pci_nvme_init_features(sc);
3298 
3299 	pci_nvme_aer_init(sc);
3300 	pci_nvme_aen_init(sc);
3301 
3302 	pci_nvme_reset(sc);
3303 
3304 	pci_lintr_request(pi);
3305 
3306 done:
3307 	return (error);
3308 }
3309 
3310 static int
3311 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3312 {
3313 	char *cp, *ram;
3314 
3315 	if (opts == NULL)
3316 		return (0);
3317 
3318 	if (strncmp(opts, "ram=", 4) == 0) {
3319 		cp = strchr(opts, ',');
3320 		if (cp == NULL) {
3321 			set_config_value_node(nvl, "ram", opts + 4);
3322 			return (0);
3323 		}
3324 		ram = strndup(opts + 4, cp - opts - 4);
3325 		set_config_value_node(nvl, "ram", ram);
3326 		free(ram);
3327 		return (pci_parse_legacy_config(nvl, cp + 1));
3328 	} else
3329 		return (blockif_legacy_config(nvl, opts));
3330 }
3331 
3332 struct pci_devemu pci_de_nvme = {
3333 	.pe_emu =	"nvme",
3334 	.pe_init =	pci_nvme_init,
3335 	.pe_legacy_config = pci_nvme_legacy_config,
3336 	.pe_barwrite =	pci_nvme_write,
3337 	.pe_barread =	pci_nvme_read
3338 };
3339 PCI_EMUL_SET(pci_de_nvme);
3340