xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 8ddb146abcdf061be9f2c0db7e391697dafad85c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
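/*
 * Example invocations (the zvol path, pool name, and serial number below are
 * hypothetical; they only illustrate the option syntax listed above):
 *
 *   -s 4,nvme,/dev/zvol/tank/vm0-disk0,maxq=4,qsz=512,ioslots=16,ser=BHYVE0001
 *   -s 5,nvme,ram=1024
 */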
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
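/*
 * With the values above (MDTS = 9, MPSMIN = 0, i.e. 4 KiB pages) this works
 * out to NVME_MAX_IOVEC = 512 + 1 = 513 descriptors and a maximum transfer
 * of NVME_MAX_DATA_SIZE = 512 * 4 KiB = 2 MiB per request.
 */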
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature, about 23 degrees C) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
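/*
 * Worked example: with num_squeues = 8 and num_cqueues = 8 the encoding is
 * (7 & 0xffff) | ((7 & 0xffff) << 16) = 0x00070007, i.e. both NSQA and NCQA
 * report 7 (zero-based) in CDW0 of the Set/Get Features completion.
 */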
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
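/*
 * The doorbell array begins at this fixed offset (0x1000 in struct
 * nvme_registers). Per the NVMe spec, with CAP.DSTRD = 0 the tail doorbell
 * for submission queue y sits at 0x1000 + (2 * y) * 4 and the head doorbell
 * for completion queue y at 0x1000 + (2 * y + 1) * 4.
 */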
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
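/*
 * Worked example (assuming BLOCKIF_IOV_MAX is 128; see block_if.h for the
 * actual value): MDTS_PAD_SIZE = 513 - 128 = 385 extra iovec entries
 * reserved in each pci_nvme_ioreq below.
 */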
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281 	PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283 
284 #define PCI_NVME_AEI_NOTICE_SHIFT		8
285 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
286 
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289 	pci_nvme_async_type atype;
290 	uint32_t	event_data;
291 	bool		posted;
292 };
293 
294 /*
295  * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
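/*
 * 0x11f decodes as bits 0-4 (all SMART / Health Critical Warning bits) plus
 * bit 8 (Namespace Attribute Notices via PCI_NVME_AEI_NOTICE_SHIFT),
 * matching the comment above.
 */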
300 
301 typedef enum {
302 	NVME_CNTRLTYPE_IO = 1,
303 	NVME_CNTRLTYPE_DISCOVERY = 2,
304 	NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306 
307 struct pci_nvme_softc {
308 	struct pci_devinst *nsc_pi;
309 
310 	pthread_mutex_t	mtx;
311 
312 	struct nvme_registers regs;
313 
314 	struct nvme_namespace_data  nsdata;
315 	struct nvme_controller_data ctrldata;
316 	struct nvme_error_information_entry err_log;
317 	struct nvme_health_information_page health_log;
318 	struct nvme_firmware_page fw_log;
319 	struct nvme_ns_list ns_log;
320 
321 	struct pci_nvme_blockstore nvstore;
322 
323 	uint16_t	max_qentries;	/* max entries per queue */
324 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
325 	uint32_t	num_cqueues;
326 	uint32_t	num_squeues;
327 	bool		num_q_is_set; /* Has host set Number of Queues */
328 
329 	struct pci_nvme_ioreq *ioreqs;
330 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331 	uint32_t	pending_ios;
332 	uint32_t	ioslots;
333 	sem_t		iosemlock;
334 
335 	/*
336 	 * Memory mapped Submission and Completion queues
337 	 * Each array includes both Admin and IO queues
338 	 */
339 	struct nvme_completion_queue *compl_queues;
340 	struct nvme_submission_queue *submit_queues;
341 
342 	struct nvme_feature_obj feat[NVME_FID_MAX];
343 
344 	enum nvme_dsm_type dataset_management;
345 
346 	/* Accounting for SMART data */
347 	__uint128_t	read_data_units;
348 	__uint128_t	write_data_units;
349 	__uint128_t	read_commands;
350 	__uint128_t	write_commands;
351 	uint32_t	read_dunits_remainder;
352 	uint32_t	write_dunits_remainder;
353 
354 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
355 	pthread_mutex_t	aer_mtx;
356 	uint32_t	aer_count;
357 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358 	pthread_t	aen_tid;
359 	pthread_mutex_t	aen_mtx;
360 	pthread_cond_t	aen_cond;
361 };
362 
363 
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373 
374 /* Controller Configuration utils */
375 #define	NVME_CC_GET_EN(cc) \
376 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define	NVME_CC_GET_CSS(cc) \
378 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define	NVME_CC_GET_SHN(cc) \
380 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define	NVME_CC_GET_IOSQES(cc) \
382 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define	NVME_CC_GET_IOCQES(cc) \
384 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385 
386 #define	NVME_CC_WRITE_MASK \
387 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390 
391 #define	NVME_CC_NEN_WRITE_MASK \
392 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395 
396 /* Controller Status utils */
397 #define	NVME_CSTS_GET_RDY(sts) \
398 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399 
400 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
401 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
402 
403 /* Completion Queue status word utils */
404 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
405 #define	NVME_STATUS_MASK \
406 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
407 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
408 
409 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
410 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
411 
412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416 static void nvme_feature_temperature(struct pci_nvme_softc *,
417     struct nvme_feature_obj *,
418     struct nvme_command *,
419     struct nvme_completion *);
420 static void nvme_feature_num_queues(struct pci_nvme_softc *,
421     struct nvme_feature_obj *,
422     struct nvme_command *,
423     struct nvme_completion *);
424 static void nvme_feature_iv_config(struct pci_nvme_softc *,
425     struct nvme_feature_obj *,
426     struct nvme_command *,
427     struct nvme_completion *);
428 static void nvme_feature_async_event(struct pci_nvme_softc *,
429     struct nvme_feature_obj *,
430     struct nvme_command *,
431     struct nvme_completion *);
432 
433 static void *aen_thr(void *arg);
434 
435 static __inline void
436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
437 {
438 	size_t len;
439 
440 	len = strnlen(src, dst_size);
441 	memset(dst, pad, dst_size);
442 	memcpy(dst, src, len);
443 }
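/*
 * For example, cpywithpad(cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ') space
 * pads the Identify Controller model number field; the destination is a
 * fixed-length ASCII field and is intentionally not NUL terminated.
 */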
444 
445 static __inline void
446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
447 {
448 
449 	*status &= ~NVME_STATUS_MASK;
450 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
451 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
452 }
453 
454 static __inline void
455 pci_nvme_status_genc(uint16_t *status, uint16_t code)
456 {
457 
458 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
459 }
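/*
 * These helpers only set the SCT and SC fields of the completion status;
 * the Phase Tag bit is handled separately in pci_nvme_cq_update() when the
 * entry is actually posted to the completion queue.
 */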
460 
461 /*
462  * Initialize the requested number of IO Submission and Completion Queues.
463  * Admin queues are allocated implicitly.
464  */
465 static void
466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
467 {
468 	uint32_t i;
469 
470 	/*
471 	 * Allocate and initialize the Submission Queues
472 	 */
473 	if (nsq > NVME_QUEUES) {
474 		WPRINTF("%s: clamping number of SQ from %u to %u",
475 					__func__, nsq, NVME_QUEUES);
476 		nsq = NVME_QUEUES;
477 	}
478 
479 	sc->num_squeues = nsq;
480 
481 	sc->submit_queues = calloc(sc->num_squeues + 1,
482 				sizeof(struct nvme_submission_queue));
483 	if (sc->submit_queues == NULL) {
484 		WPRINTF("%s: SQ allocation failed", __func__);
485 		sc->num_squeues = 0;
486 	} else {
487 		struct nvme_submission_queue *sq = sc->submit_queues;
488 
489 		for (i = 0; i < sc->num_squeues + 1; i++)
490 			pthread_mutex_init(&sq[i].mtx, NULL);
491 	}
492 
493 	/*
494 	 * Allocate and initialize the Completion Queues
495 	 */
496 	if (ncq > NVME_QUEUES) {
497 		WPRINTF("%s: clamping number of CQ from %u to %u",
498 					__func__, ncq, NVME_QUEUES);
499 		ncq = NVME_QUEUES;
500 	}
501 
502 	sc->num_cqueues = ncq;
503 
504 	sc->compl_queues = calloc(sc->num_cqueues + 1,
505 				sizeof(struct nvme_completion_queue));
506 	if (sc->compl_queues == NULL) {
507 		WPRINTF("%s: CQ allocation failed", __func__);
508 		sc->num_cqueues = 0;
509 	} else {
510 		struct nvme_completion_queue *cq = sc->compl_queues;
511 
512 		for (i = 0; i < sc->num_cqueues + 1; i++)
513 			pthread_mutex_init(&cq[i].mtx, NULL);
514 	}
515 }
516 
517 static void
518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
519 {
520 	struct nvme_controller_data *cd = &sc->ctrldata;
521 
522 	cd->vid = 0xFB5D;
523 	cd->ssvid = 0x0000;
524 
525 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
526 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
527 
528 	/* Num of submission commands that we can handle at a time (2^rab) */
529 	cd->rab   = 4;
530 
531 	/* FreeBSD OUI */
532 	cd->ieee[0] = 0x58;
533 	cd->ieee[1] = 0x9c;
534 	cd->ieee[2] = 0xfc;
535 
536 	cd->mic = 0;
537 
538 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
539 
540 	cd->ver = NVME_REV(1,4);
541 
542 	cd->cntrltype = NVME_CNTRLTYPE_IO;
543 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
544 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
545 	cd->acl = 2;
546 	cd->aerl = 4;
547 
548 	/* Advertise one read-only firmware slot */
549 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
550 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
551 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
552 	cd->elpe = 0;	/* max error log page entries */
553 	/*
554 	 * Report a single power state (zero-based value)
555 	 * power_state[] values are left as zero to indicate "Not reported"
556 	 */
557 	cd->npss = 0;
558 
559 	/* Warning Composite Temperature Threshold */
560 	cd->wctemp = 0x0157;
561 	cd->cctemp = 0x0157;
562 
563 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
564 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
565 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
566 
567 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
568 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
569 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
570 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
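	/*
	 * SQES/CQES encode log2 of the entry sizes: 2^6 = 64-byte submission
	 * queue entries and 2^4 = 16-byte completion queue entries.
	 */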
571 	cd->nn = 1;	/* number of namespaces */
572 
573 	cd->oncs = 0;
574 	switch (sc->dataset_management) {
575 	case NVME_DATASET_MANAGEMENT_AUTO:
576 		if (sc->nvstore.deallocate)
577 			cd->oncs |= NVME_ONCS_DSM;
578 		break;
579 	case NVME_DATASET_MANAGEMENT_ENABLE:
580 		cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	default:
583 		break;
584 	}
585 
586 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
587 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
588 
589 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
590 }
591 
592 /*
593  * Calculate the CRC-16 of the given buffer
594  * See copyright attribution at top of file
595  */
596 static uint16_t
597 crc16(uint16_t crc, const void *buffer, unsigned int len)
598 {
599 	const unsigned char *cp = buffer;
600 	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
601 	static uint16_t const crc16_table[256] = {
602 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
603 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
604 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
605 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
606 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
607 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
608 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
609 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
610 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
611 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
612 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
613 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
614 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
615 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
616 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
617 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
618 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
619 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
620 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
621 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
622 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
623 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
624 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
625 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
626 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
627 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
628 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
629 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
630 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
631 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
632 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
633 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
634 	};
635 
636 	while (len--)
637 		crc = (((crc >> 8) & 0xffU) ^
638 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
639 	return crc;
640 }
641 
642 static void
643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
644     struct nvme_namespace_data *nd)
645 {
646 
647 	/* Get capacity and block size information from backing store */
648 	nd->nsze = nvstore->size / nvstore->sectsz;
649 	nd->ncap = nd->nsze;
650 	nd->nuse = nd->nsze;
651 }
652 
653 static void
654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
655     struct nvme_namespace_data *nd, uint32_t nsid,
656     struct pci_nvme_blockstore *nvstore)
657 {
658 
659 	pci_nvme_init_nsdata_size(nvstore, nd);
660 
661 	if (nvstore->type == NVME_STOR_BLOCKIF)
662 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
663 
664 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
665 	nd->flbas = 0;
666 
667 	/* Create an EUI-64 if user did not provide one */
668 	if (nvstore->eui64 == 0) {
669 		char *data = NULL;
670 		uint64_t eui64 = nvstore->eui64;
671 
672 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
673 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
674 		    sc->nsc_pi->pi_func);
675 
676 		if (data != NULL) {
677 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
678 			free(data);
679 		}
680 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
681 	}
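	/*
	 * The generated EUI-64 thus combines the FreeBSD OUI with a CRC-16 of
	 * the VM name and PCI bus/slot/function, shifted up to leave the low
	 * 16 bits for the NSID, giving each namespace a stable identifier.
	 */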
682 	be64enc(nd->eui64, nvstore->eui64);
683 
684 	/* LBA data-sz = 2^lbads */
685 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
686 }
687 
688 static void
689 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
690 {
691 
692 	memset(&sc->err_log, 0, sizeof(sc->err_log));
693 	memset(&sc->health_log, 0, sizeof(sc->health_log));
694 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
695 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
696 
697 	/* Set read/write remainder to round up according to spec */
698 	sc->read_dunits_remainder = 999;
699 	sc->write_dunits_remainder = 999;
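	/*
	 * Data Units are reported in thousands of 512 byte blocks; starting
	 * the remainders at 999 means the very first block read or written
	 * rounds the corresponding count up to one Data Unit, as the comment
	 * above notes.
	 */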
700 
701 	/* Set nominal Health values checked by implementations */
702 	sc->health_log.temperature = NVME_TEMPERATURE;
703 	sc->health_log.available_spare = 100;
704 	sc->health_log.available_spare_threshold = 10;
705 
706 	/* Set Active Firmware Info to slot 1 */
707 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
708 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
709 	    sizeof(sc->fw_log.revision[0]));
710 }
711 
712 static void
713 pci_nvme_init_features(struct pci_nvme_softc *sc)
714 {
715 	enum nvme_feature	fid;
716 
717 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
718 		switch (fid) {
719 		case NVME_FEAT_ARBITRATION:
720 		case NVME_FEAT_POWER_MANAGEMENT:
721 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
722 		case NVME_FEAT_WRITE_ATOMICITY:
723 			/* Mandatory but no special handling required */
724 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
725 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
726 		//		  this returns a data buffer
727 			break;
728 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
729 			sc->feat[fid].set = nvme_feature_temperature;
730 			break;
731 		case NVME_FEAT_ERROR_RECOVERY:
732 			sc->feat[fid].namespace_specific = true;
733 			break;
734 		case NVME_FEAT_NUMBER_OF_QUEUES:
735 			sc->feat[fid].set = nvme_feature_num_queues;
736 			break;
737 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
738 			sc->feat[fid].set = nvme_feature_iv_config;
739 			break;
740 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
741 			sc->feat[fid].set = nvme_feature_async_event;
742 			/* Enable all AENs by default */
743 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
744 			break;
745 		default:
746 			sc->feat[fid].set = nvme_feature_invalid_cb;
747 			sc->feat[fid].get = nvme_feature_invalid_cb;
748 		}
749 	}
750 }
751 
752 static void
753 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
754 {
755 
756 	STAILQ_INIT(&sc->aer_list);
757 	sc->aer_count = 0;
758 }
759 
760 static void
761 pci_nvme_aer_init(struct pci_nvme_softc *sc)
762 {
763 
764 	pthread_mutex_init(&sc->aer_mtx, NULL);
765 	pci_nvme_aer_reset(sc);
766 }
767 
768 static void
769 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
770 {
771 	struct pci_nvme_aer *aer = NULL;
772 
773 	pthread_mutex_lock(&sc->aer_mtx);
774 	while (!STAILQ_EMPTY(&sc->aer_list)) {
775 		aer = STAILQ_FIRST(&sc->aer_list);
776 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
777 		free(aer);
778 	}
779 	pthread_mutex_unlock(&sc->aer_mtx);
780 
781 	pci_nvme_aer_reset(sc);
782 }
783 
784 static bool
785 pci_nvme_aer_available(struct pci_nvme_softc *sc)
786 {
787 
788 	return (sc->aer_count != 0);
789 }
790 
791 static bool
792 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
793 {
794 	struct nvme_controller_data *cd = &sc->ctrldata;
795 
796 	/* AERL is a zero-based value while aer_count is one-based */
797 	return (sc->aer_count == (cd->aerl + 1));
798 }
799 
800 /*
801  * Add an Async Event Request
802  *
803  * Stores an AER to be returned later if the Controller needs to notify the
804  * host of an event.
805  * Note that while the NVMe spec doesn't require Controllers to return AER's
806  * in order, this implementation does preserve the order.
807  */
808 static int
809 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
810 {
811 	struct pci_nvme_aer *aer = NULL;
812 
813 	aer = calloc(1, sizeof(struct pci_nvme_aer));
814 	if (aer == NULL)
815 		return (-1);
816 
817 	/* Save the Command ID for use in the completion message */
818 	aer->cid = cid;
819 
820 	pthread_mutex_lock(&sc->aer_mtx);
821 	sc->aer_count++;
822 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
823 	pthread_mutex_unlock(&sc->aer_mtx);
824 
825 	return (0);
826 }
827 
828 /*
829  * Get an Async Event Request structure
830  *
831  * Returns a pointer to an AER previously submitted by the host or NULL if
832  * no AER's exist. Caller is responsible for freeing the returned struct.
833  */
834 static struct pci_nvme_aer *
835 pci_nvme_aer_get(struct pci_nvme_softc *sc)
836 {
837 	struct pci_nvme_aer *aer = NULL;
838 
839 	pthread_mutex_lock(&sc->aer_mtx);
840 	aer = STAILQ_FIRST(&sc->aer_list);
841 	if (aer != NULL) {
842 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
843 		sc->aer_count--;
844 	}
845 	pthread_mutex_unlock(&sc->aer_mtx);
846 
847 	return (aer);
848 }
849 
850 static void
851 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
852 {
853 	uint32_t	atype;
854 
855 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
856 
857 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
858 		sc->aen[atype].atype = atype;
859 	}
860 }
861 
862 static void
863 pci_nvme_aen_init(struct pci_nvme_softc *sc)
864 {
865 	char nstr[80];
866 
867 	pci_nvme_aen_reset(sc);
868 
869 	pthread_mutex_init(&sc->aen_mtx, NULL);
870 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
871 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
872 	    sc->nsc_pi->pi_func);
873 	pthread_set_name_np(sc->aen_tid, nstr);
874 }
875 
876 static void
877 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
878 {
879 
880 	pci_nvme_aen_reset(sc);
881 }
882 
883 /* Notify the AEN thread of pending work */
884 static void
885 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
886 {
887 
888 	pthread_cond_signal(&sc->aen_cond);
889 }
890 
891 /*
892  * Post an Asynchronous Event Notification
893  */
894 static int32_t
895 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
896 		uint32_t event_data)
897 {
898 	struct pci_nvme_aen *aen;
899 
900 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
901 		return(EINVAL);
902 	}
903 
904 	pthread_mutex_lock(&sc->aen_mtx);
905 	aen = &sc->aen[atype];
906 
907 	/* Has the controller already posted an event of this type? */
908 	if (aen->posted) {
909 		pthread_mutex_unlock(&sc->aen_mtx);
910 		return(EALREADY);
911 	}
912 
913 	aen->event_data = event_data;
914 	aen->posted = true;
915 	pthread_mutex_unlock(&sc->aen_mtx);
916 
917 	pci_nvme_aen_notify(sc);
918 
919 	return(0);
920 }
921 
922 static void
923 pci_nvme_aen_process(struct pci_nvme_softc *sc)
924 {
925 	struct pci_nvme_aer *aer;
926 	struct pci_nvme_aen *aen;
927 	pci_nvme_async_type atype;
928 	uint32_t mask;
929 	uint16_t status;
930 	uint8_t lid;
931 
932 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
933 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
934 		aen = &sc->aen[atype];
935 		/* Previous iterations may have depleted the available AER's */
936 		if (!pci_nvme_aer_available(sc)) {
937 			DPRINTF("%s: no AER", __func__);
938 			break;
939 		}
940 
941 		if (!aen->posted) {
942 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
943 			continue;
944 		}
945 
946 		status = NVME_SC_SUCCESS;
947 
948 		/* Is the event masked? */
949 		mask =
950 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
951 
952 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
953 		switch (atype) {
954 		case PCI_NVME_AE_TYPE_ERROR:
955 			lid = NVME_LOG_ERROR;
956 			break;
957 		case PCI_NVME_AE_TYPE_SMART:
958 			mask &= 0xff;
959 			if ((mask & aen->event_data) == 0)
960 				continue;
961 			lid = NVME_LOG_HEALTH_INFORMATION;
962 			break;
963 		case PCI_NVME_AE_TYPE_NOTICE:
964 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
965 				EPRINTLN("%s unknown AEN notice type %u",
966 				    __func__, aen->event_data);
967 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
968 				break;
969 			}
970 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
971 				continue;
972 			switch (aen->event_data) {
973 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
974 				lid = NVME_LOG_CHANGED_NAMESPACE;
975 				break;
976 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
977 				lid = NVME_LOG_FIRMWARE_SLOT;
978 				break;
979 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
980 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
981 				break;
982 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
983 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
984 				break;
985 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
986 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
987 				break;
988 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
989 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
990 				break;
991 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
992 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
993 				break;
994 			default:
995 				lid = 0;
996 			}
997 			break;
998 		default:
999 			/* bad type?!? */
1000 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1001 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1002 			break;
1003 		}
1004 
1005 		aer = pci_nvme_aer_get(sc);
1006 		assert(aer != NULL);
1007 
1008 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1009 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1010 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1011 		    aer->cid,
1012 		    0,		/* SQID */
1013 		    status);
1014 
1015 		aen->event_data = 0;
1016 		aen->posted = false;
1017 
1018 		pci_generate_msix(sc->nsc_pi, 0);
1019 	}
1020 }
1021 
1022 static void *
1023 aen_thr(void *arg)
1024 {
1025 	struct pci_nvme_softc *sc;
1026 
1027 	sc = arg;
1028 
1029 	pthread_mutex_lock(&sc->aen_mtx);
1030 	for (;;) {
1031 		pci_nvme_aen_process(sc);
1032 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1033 	}
1034 	pthread_mutex_unlock(&sc->aen_mtx);
1035 
1036 	pthread_exit(NULL);
1037 	return (NULL);
1038 }
1039 
1040 static void
1041 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1042 {
1043 	uint32_t i;
1044 
1045 	DPRINTF("%s", __func__);
1046 
1047 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1048 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1049 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
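	/* CAP.TO is in units of 500 ms, so 60 advertises a 30 second timeout */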
1050 
1051 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1052 
1053 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1054 
1055 	sc->regs.cc = 0;
1056 
1057 	assert(sc->submit_queues != NULL);
1058 
1059 	for (i = 0; i < sc->num_squeues + 1; i++) {
1060 		sc->submit_queues[i].qbase = NULL;
1061 		sc->submit_queues[i].size = 0;
1062 		sc->submit_queues[i].cqid = 0;
1063 		sc->submit_queues[i].tail = 0;
1064 		sc->submit_queues[i].head = 0;
1065 	}
1066 
1067 	assert(sc->compl_queues != NULL);
1068 
1069 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1070 		sc->compl_queues[i].qbase = NULL;
1071 		sc->compl_queues[i].size = 0;
1072 		sc->compl_queues[i].tail = 0;
1073 		sc->compl_queues[i].head = 0;
1074 	}
1075 
1076 	sc->num_q_is_set = false;
1077 
1078 	pci_nvme_aer_destroy(sc);
1079 	pci_nvme_aen_destroy(sc);
1080 
1081 	/*
1082 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1083 	 * before cleanup completes
1084 	 */
1085 	sc->regs.csts = 0;
1086 }
1087 
1088 static void
1089 pci_nvme_reset(struct pci_nvme_softc *sc)
1090 {
1091 	pthread_mutex_lock(&sc->mtx);
1092 	pci_nvme_reset_locked(sc);
1093 	pthread_mutex_unlock(&sc->mtx);
1094 }
1095 
1096 static int
1097 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1098 {
1099 	uint16_t acqs, asqs;
1100 
1101 	DPRINTF("%s", __func__);
1102 
1103 	/*
1104 	 * NVMe 2.0 states that "enabling a controller while this field is
1105 	 * cleared to 0h produces undefined results" for both ACQS and
1106 	 * ASQS. If zero, set CFS and do not become ready.
1107 	 */
1108 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1109 	if (asqs < 2) {
1110 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1111 		    asqs - 1, sc->regs.aqa);
1112 		sc->regs.csts |= NVME_CSTS_CFS;
1113 		return (-1);
1114 	}
1115 	sc->submit_queues[0].size = asqs;
1116 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1117 	            sizeof(struct nvme_command) * asqs);
1118 	if (sc->submit_queues[0].qbase == NULL) {
1119 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1120 		    sc->regs.asq);
1121 		sc->regs.csts |= NVME_CSTS_CFS;
1122 		return (-1);
1123 	}
1124 
1125 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1126 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1127 
1128 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1129 	    NVME_AQA_REG_ACQS_MASK);
1130 	if (acqs < 2) {
1131 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1132 		    acqs - 1, sc->regs.aqa);
1133 		sc->regs.csts |= NVME_CSTS_CFS;
1134 		return (-1);
1135 	}
1136 	sc->compl_queues[0].size = acqs;
1137 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1138 	         sizeof(struct nvme_completion) * acqs);
1139 	if (sc->compl_queues[0].qbase == NULL) {
1140 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1141 		    sc->regs.acq);
1142 		sc->regs.csts |= NVME_CSTS_CFS;
1143 		return (-1);
1144 	}
1145 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1146 
1147 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1148 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1149 
1150 	return (0);
1151 }
1152 
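/*
 * Copy to or from a guest buffer described by a PRP pair. PRP1 covers from
 * its offset to the end of that physical page; any remainder (at most one
 * more page, since this helper caps transfers at 8 KiB) comes from the page
 * PRP2 points to. For example, a 4 KiB copy whose PRP1 has a page offset of
 * 0x800 takes 0x800 bytes from the PRP1 page and 0x800 bytes from PRP2.
 */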
1153 static int
1154 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1155 	size_t len, enum nvme_copy_dir dir)
1156 {
1157 	uint8_t *p;
1158 	size_t bytes;
1159 
1160 	if (len > (8 * 1024)) {
1161 		return (-1);
1162 	}
1163 
1164 	/* Copy from the start of prp1 to the end of the physical page */
1165 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1166 	bytes = MIN(bytes, len);
1167 
1168 	p = vm_map_gpa(ctx, prp1, bytes);
1169 	if (p == NULL) {
1170 		return (-1);
1171 	}
1172 
1173 	if (dir == NVME_COPY_TO_PRP)
1174 		memcpy(p, b, bytes);
1175 	else
1176 		memcpy(b, p, bytes);
1177 
1178 	b += bytes;
1179 
1180 	len -= bytes;
1181 	if (len == 0) {
1182 		return (0);
1183 	}
1184 
1185 	len = MIN(len, PAGE_SIZE);
1186 
1187 	p = vm_map_gpa(ctx, prp2, len);
1188 	if (p == NULL) {
1189 		return (-1);
1190 	}
1191 
1192 	if (dir == NVME_COPY_TO_PRP)
1193 		memcpy(p, b, len);
1194 	else
1195 		memcpy(b, p, len);
1196 
1197 	return (0);
1198 }
1199 
1200 /*
1201  * Write a Completion Queue Entry update
1202  *
1203  * Write the completion and update the doorbell value
1204  */
1205 static void
1206 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1207 		struct nvme_completion_queue *cq,
1208 		uint32_t cdw0,
1209 		uint16_t cid,
1210 		uint16_t sqid,
1211 		uint16_t status)
1212 {
1213 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1214 	struct nvme_completion *cqe;
1215 
1216 	assert(cq->qbase != NULL);
1217 
1218 	pthread_mutex_lock(&cq->mtx);
1219 
1220 	cqe = &cq->qbase[cq->tail];
1221 
1222 	/* Flip the phase bit */
1223 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1224 
1225 	cqe->cdw0 = cdw0;
1226 	cqe->sqhd = sq->head;
1227 	cqe->sqid = sqid;
1228 	cqe->cid = cid;
1229 	cqe->status = status;
1230 
1231 	cq->tail++;
1232 	if (cq->tail >= cq->size) {
1233 		cq->tail = 0;
1234 	}
1235 
1236 	pthread_mutex_unlock(&cq->mtx);
1237 }
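/*
 * Note on the Phase Tag handling above: each new entry is written with the
 * phase bit inverted relative to the previous contents of that slot, so the
 * host can tell fresh completions from stale ones as the tail wraps around
 * the queue.
 */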
1238 
1239 static int
1240 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1241 	struct nvme_completion* compl)
1242 {
1243 	uint16_t qid = command->cdw10 & 0xffff;
1244 
1245 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1246 	if (qid == 0 || qid > sc->num_squeues ||
1247 	    (sc->submit_queues[qid].qbase == NULL)) {
1248 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1249 		        __func__, qid, sc->num_squeues);
1250 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1251 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1252 		return (1);
1253 	}
1254 
1255 	sc->submit_queues[qid].qbase = NULL;
1256 	sc->submit_queues[qid].cqid = 0;
1257 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1258 	return (1);
1259 }
1260 
1261 static int
1262 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1263 	struct nvme_completion* compl)
1264 {
1265 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1266 		uint16_t qid = command->cdw10 & 0xffff;
1267 		struct nvme_submission_queue *nsq;
1268 
1269 		if ((qid == 0) || (qid > sc->num_squeues) ||
1270 		    (sc->submit_queues[qid].qbase != NULL)) {
1271 			WPRINTF("%s queue index %u > num_squeues %u",
1272 			        __func__, qid, sc->num_squeues);
1273 			pci_nvme_status_tc(&compl->status,
1274 			    NVME_SCT_COMMAND_SPECIFIC,
1275 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1276 			return (1);
1277 		}
1278 
1279 		nsq = &sc->submit_queues[qid];
1280 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1281 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1282 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1283 			/*
1284 			 * Queues must specify at least two entries
1285 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1286 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1287 			 */
1288 			pci_nvme_status_tc(&compl->status,
1289 			    NVME_SCT_COMMAND_SPECIFIC,
1290 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1291 			return (1);
1292 		}
1293 		nsq->head = nsq->tail = 0;
1294 
1295 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1296 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1297 			pci_nvme_status_tc(&compl->status,
1298 			    NVME_SCT_COMMAND_SPECIFIC,
1299 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1300 			return (1);
1301 		}
1302 
1303 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1304 			pci_nvme_status_tc(&compl->status,
1305 			    NVME_SCT_COMMAND_SPECIFIC,
1306 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1307 			return (1);
1308 		}
1309 
1310 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1311 
1312 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1313 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1314 
1315 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1316 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1317 
1318 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1319 
1320 		DPRINTF("%s completed creating IOSQ qid %u",
1321 		         __func__, qid);
1322 	} else {
1323 		/*
1324 		 * Guest sent a non-contiguous submission queue request.
1325 		 * This setting is unsupported by this emulation.
1326 		 */
1327 		WPRINTF("%s unsupported non-contig (list-based) "
1328 		         "create i/o submission queue", __func__);
1329 
1330 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1331 	}
1332 	return (1);
1333 }
1334 
1335 static int
1336 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1337 	struct nvme_completion* compl)
1338 {
1339 	uint16_t qid = command->cdw10 & 0xffff;
1340 	uint16_t sqid;
1341 
1342 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1343 	if (qid == 0 || qid > sc->num_cqueues ||
1344 	    (sc->compl_queues[qid].qbase == NULL)) {
1345 		WPRINTF("%s queue index %u / num_cqueues %u",
1346 		        __func__, qid, sc->num_cqueues);
1347 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1348 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1349 		return (1);
1350 	}
1351 
1352 	/* Deleting an Active CQ is an error */
1353 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1354 		if (sc->submit_queues[sqid].cqid == qid) {
1355 			pci_nvme_status_tc(&compl->status,
1356 			    NVME_SCT_COMMAND_SPECIFIC,
1357 			    NVME_SC_INVALID_QUEUE_DELETION);
1358 			return (1);
1359 		}
1360 
1361 	sc->compl_queues[qid].qbase = NULL;
1362 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1363 	return (1);
1364 }
1365 
1366 static int
1367 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1368 	struct nvme_completion* compl)
1369 {
1370 	struct nvme_completion_queue *ncq;
1371 	uint16_t qid = command->cdw10 & 0xffff;
1372 
1373 	/* Only support Physically Contiguous queues */
1374 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1375 		WPRINTF("%s unsupported non-contig (list-based) "
1376 		         "create i/o completion queue",
1377 		         __func__);
1378 
1379 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1380 		return (1);
1381 	}
1382 
1383 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1384 	    (sc->compl_queues[qid].qbase != NULL)) {
1385 		WPRINTF("%s queue index %u > num_cqueues %u",
1386 			__func__, qid, sc->num_cqueues);
1387 		pci_nvme_status_tc(&compl->status,
1388 		    NVME_SCT_COMMAND_SPECIFIC,
1389 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1390 		return (1);
1391 	}
1392 
1393 	ncq = &sc->compl_queues[qid];
1394 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1395 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1396 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1397 		pci_nvme_status_tc(&compl->status,
1398 		    NVME_SCT_COMMAND_SPECIFIC,
1399 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1400 		return (1);
1401 	}
1402 
1403 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1404 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1405 		/*
1406 		 * Queues must specify at least two entries
1407 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1408 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1409 		 */
1410 		pci_nvme_status_tc(&compl->status,
1411 		    NVME_SCT_COMMAND_SPECIFIC,
1412 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1413 		return (1);
1414 	}
1415 	ncq->head = ncq->tail = 0;
1416 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1417 		     command->prp1,
1418 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1419 
1420 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1421 
1422 
1423 	return (1);
1424 }
1425 
1426 static int
1427 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1428 	struct nvme_completion* compl)
1429 {
1430 	uint64_t logoff;
1431 	uint32_t logsize;
1432 	uint8_t logpage;
1433 
1434 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1435 
1436 	/*
1437 	 * Command specifies the number of dwords to return in fields NUMDU
1438 	 * and NUMDL. This is a zero-based value.
1439 	 */
1440 	logpage = command->cdw10 & 0xFF;
1441 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1442 	logsize *= sizeof(uint32_t);
1443 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1444 
1445 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1446 
1447 	switch (logpage) {
1448 	case NVME_LOG_ERROR:
1449 		if (logoff >= sizeof(sc->err_log)) {
1450 			pci_nvme_status_genc(&compl->status,
1451 			    NVME_SC_INVALID_FIELD);
1452 			break;
1453 		}
1454 
1455 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1456 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1457 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1458 		    NVME_COPY_TO_PRP);
1459 		break;
1460 	case NVME_LOG_HEALTH_INFORMATION:
1461 		if (logoff >= sizeof(sc->health_log)) {
1462 			pci_nvme_status_genc(&compl->status,
1463 			    NVME_SC_INVALID_FIELD);
1464 			break;
1465 		}
1466 
1467 		pthread_mutex_lock(&sc->mtx);
1468 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1469 		    sizeof(sc->health_log.data_units_read));
1470 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1471 		    sizeof(sc->health_log.data_units_written));
1472 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1473 		    sizeof(sc->health_log.host_read_commands));
1474 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1475 		    sizeof(sc->health_log.host_write_commands));
1476 		pthread_mutex_unlock(&sc->mtx);
1477 
1478 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1479 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1480 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1481 		    NVME_COPY_TO_PRP);
1482 		break;
1483 	case NVME_LOG_FIRMWARE_SLOT:
1484 		if (logoff >= sizeof(sc->fw_log)) {
1485 			pci_nvme_status_genc(&compl->status,
1486 			    NVME_SC_INVALID_FIELD);
1487 			break;
1488 		}
1489 
1490 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1491 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1492 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1493 		    NVME_COPY_TO_PRP);
1494 		break;
1495 	case NVME_LOG_CHANGED_NAMESPACE:
1496 		if (logoff >= sizeof(sc->ns_log)) {
1497 			pci_nvme_status_genc(&compl->status,
1498 			    NVME_SC_INVALID_FIELD);
1499 			break;
1500 		}
1501 
1502 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1503 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1504 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1505 		    NVME_COPY_TO_PRP);
1506 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1507 		break;
1508 	default:
1509 		DPRINTF("%s get log page %x command not supported",
1510 		        __func__, logpage);
1511 
1512 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1513 		    NVME_SC_INVALID_LOG_PAGE);
1514 	}
1515 
1516 	return (1);
1517 }
1518 
1519 static int
1520 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1521 	struct nvme_completion* compl)
1522 {
1523 	void *dest;
1524 	uint16_t status;
1525 
1526 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1527 	        command->cdw10 & 0xFF, command->nsid);
1528 
1529 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1530 
1531 	switch (command->cdw10 & 0xFF) {
1532 	case 0x00: /* return Identify Namespace data structure */
1533 		/* Global NS only valid with NS Management */
1534 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1535 			pci_nvme_status_genc(&status,
1536 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1537 			break;
1538 		}
1539 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1540 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1541 		    NVME_COPY_TO_PRP);
1542 		break;
1543 	case 0x01: /* return Identify Controller data structure */
1544 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1545 		    command->prp2, (uint8_t *)&sc->ctrldata,
1546 		    sizeof(sc->ctrldata),
1547 		    NVME_COPY_TO_PRP);
1548 		break;
1549 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1550 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1551 		                  sizeof(uint32_t) * 1024);
1552 		/* All unused entries shall be zero */
1553 		bzero(dest, sizeof(uint32_t) * 1024);
1554 		((uint32_t *)dest)[0] = 1;
1555 		break;
1556 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1557 		if (command->nsid != 1) {
1558 			pci_nvme_status_genc(&status,
1559 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1560 			break;
1561 		}
1562 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1563 		                  sizeof(uint32_t) * 1024);
1564 		/* All bytes after the descriptor shall be zero */
1565 		bzero(dest, sizeof(uint32_t) * 1024);
1566 
1567 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1568 		((uint8_t *)dest)[0] = 1;
1569 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1570 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1571 		break;
1572 	case 0x13:
1573 		/*
1574 		 * Controller list is optional but used by UNH tests. Return
1575 		 * a valid but empty list.
1576 		 */
1577 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1578 		                  sizeof(uint16_t) * 2048);
1579 		memset(dest, 0, sizeof(uint16_t) * 2048);
1580 		break;
1581 	default:
1582 		DPRINTF("%s unsupported identify command requested 0x%x",
1583 		         __func__, command->cdw10 & 0xFF);
1584 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1585 		break;
1586 	}
1587 
1588 	compl->status = status;
1589 	return (1);
1590 }
1591 
1592 static const char *
1593 nvme_fid_to_name(uint8_t fid)
1594 {
1595 	const char *name;
1596 
1597 	switch (fid) {
1598 	case NVME_FEAT_ARBITRATION:
1599 		name = "Arbitration";
1600 		break;
1601 	case NVME_FEAT_POWER_MANAGEMENT:
1602 		name = "Power Management";
1603 		break;
1604 	case NVME_FEAT_LBA_RANGE_TYPE:
1605 		name = "LBA Range Type";
1606 		break;
1607 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1608 		name = "Temperature Threshold";
1609 		break;
1610 	case NVME_FEAT_ERROR_RECOVERY:
1611 		name = "Error Recovery";
1612 		break;
1613 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1614 		name = "Volatile Write Cache";
1615 		break;
1616 	case NVME_FEAT_NUMBER_OF_QUEUES:
1617 		name = "Number of Queues";
1618 		break;
1619 	case NVME_FEAT_INTERRUPT_COALESCING:
1620 		name = "Interrupt Coalescing";
1621 		break;
1622 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1623 		name = "Interrupt Vector Configuration";
1624 		break;
1625 	case NVME_FEAT_WRITE_ATOMICITY:
1626 		name = "Write Atomicity Normal";
1627 		break;
1628 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1629 		name = "Asynchronous Event Configuration";
1630 		break;
1631 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1632 		name = "Autonomous Power State Transition";
1633 		break;
1634 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1635 		name = "Host Memory Buffer";
1636 		break;
1637 	case NVME_FEAT_TIMESTAMP:
1638 		name = "Timestamp";
1639 		break;
1640 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1641 		name = "Keep Alive Timer";
1642 		break;
1643 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1644 		name = "Host Controlled Thermal Management";
1645 		break;
1646 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1647 		name = "Non-Operation Power State Config";
1648 		break;
1649 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1650 		name = "Read Recovery Level Config";
1651 		break;
1652 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1653 		name = "Predictable Latency Mode Config";
1654 		break;
1655 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1656 		name = "Predictable Latency Mode Window";
1657 		break;
1658 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1659 		name = "LBA Status Information Report Interval";
1660 		break;
1661 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1662 		name = "Host Behavior Support";
1663 		break;
1664 	case NVME_FEAT_SANITIZE_CONFIG:
1665 		name = "Sanitize Config";
1666 		break;
1667 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1668 		name = "Endurance Group Event Configuration";
1669 		break;
1670 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1671 		name = "Software Progress Marker";
1672 		break;
1673 	case NVME_FEAT_HOST_IDENTIFIER:
1674 		name = "Host Identifier";
1675 		break;
1676 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1677 		name = "Reservation Notification Mask";
1678 		break;
1679 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1680 		name = "Reservation Persistence";
1681 		break;
1682 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1683 		name = "Namespace Write Protection Config";
1684 		break;
1685 	default:
1686 		name = "Unknown";
1687 		break;
1688 	}
1689 
1690 	return (name);
1691 }
1692 
1693 static void
1694 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1695     struct nvme_feature_obj *feat __unused,
1696     struct nvme_command *command __unused,
1697     struct nvme_completion *compl)
1698 {
1699 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1700 }
1701 
1702 static void
1703 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1704     struct nvme_feature_obj *feat __unused,
1705     struct nvme_command *command,
1706     struct nvme_completion *compl)
1707 {
1708 	uint32_t i;
1709 	uint32_t cdw11 = command->cdw11;
1710 	uint16_t iv;
1711 	bool cd;
1712 
1713 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1714 
1715 	iv = cdw11 & 0xffff;
1716 	cd = cdw11 & (1 << 16);
1717 
1718 	if (iv > (sc->max_queues + 1)) {
1719 		return;
1720 	}
1721 
1722 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1723 	if ((iv == 0) && !cd)
1724 		return;
1725 
1726 	/* Requested Interrupt Vector must be used by a CQ */
1727 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1728 		if (sc->compl_queues[i].intr_vec == iv) {
1729 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1730 		}
1731 	}
1732 }
1733 
1734 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1735 static void
1736 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1737     struct nvme_feature_obj *feat __unused,
1738     struct nvme_command *command,
1739     struct nvme_completion *compl)
1740 {
1741 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1742 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1743 }
1744 
1745 #define NVME_TEMP_THRESH_OVER	0
1746 #define NVME_TEMP_THRESH_UNDER	1
1747 static void
1748 nvme_feature_temperature(struct pci_nvme_softc *sc,
1749     struct nvme_feature_obj *feat __unused,
1750     struct nvme_command *command,
1751     struct nvme_completion *compl)
1752 {
1753 	uint16_t	tmpth;	/* Temperature Threshold */
1754 	uint8_t		tmpsel; /* Threshold Temperature Select */
1755 	uint8_t		thsel;  /* Threshold Type Select */
1756 	bool		set_crit = false;
1757 
1758 	tmpth  = command->cdw11 & 0xffff;
1759 	tmpsel = (command->cdw11 >> 16) & 0xf;
1760 	thsel  = (command->cdw11 >> 20) & 0x3;
1761 
1762 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1763 
1764 	/* Check for unsupported values */
1765 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1766 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1767 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1768 		return;
1769 	}
1770 
1771 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1772 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1773 		set_crit = true;
1774 
1775 	pthread_mutex_lock(&sc->mtx);
1776 	if (set_crit)
1777 		sc->health_log.critical_warning |=
1778 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1779 	else
1780 		sc->health_log.critical_warning &=
1781 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1782 	pthread_mutex_unlock(&sc->mtx);
1783 
1784 	if (set_crit)
1785 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1786 		    sc->health_log.critical_warning);
1787 
1788 
1789 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1790 }
1791 
1792 static void
1793 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1794     struct nvme_feature_obj *feat __unused,
1795     struct nvme_command *command,
1796     struct nvme_completion *compl)
1797 {
1798 	uint16_t nqr;	/* Number of Queues Requested */
1799 
1800 	if (sc->num_q_is_set) {
1801 		WPRINTF("%s: Number of Queues already set", __func__);
1802 		pci_nvme_status_genc(&compl->status,
1803 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1804 		return;
1805 	}
1806 
1807 	nqr = command->cdw11 & 0xFFFF;
1808 	if (nqr == 0xffff) {
1809 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1810 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1811 		return;
1812 	}
1813 
1814 	sc->num_squeues = ONE_BASED(nqr);
1815 	if (sc->num_squeues > sc->max_queues) {
1816 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1817 					sc->max_queues);
1818 		sc->num_squeues = sc->max_queues;
1819 	}
1820 
1821 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1822 	if (nqr == 0xffff) {
1823 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1824 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1825 		return;
1826 	}
1827 
1828 	sc->num_cqueues = ONE_BASED(nqr);
1829 	if (sc->num_cqueues > sc->max_queues) {
1830 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1831 					sc->max_queues);
1832 		sc->num_cqueues = sc->max_queues;
1833 	}
1834 
1835 	/* Patch the command value which will be saved on callback's return */
1836 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1837 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1838 
1839 	sc->num_q_is_set = true;
1840 }
1841 
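/*
 * Set Features: validate the Feature ID and namespace scope, invoke the
 * per-feature "set" callback if one is registered, and cache CDW11 for
 * later Get Features requests when the command succeeds.
 */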
1842 static int
1843 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1844 	struct nvme_completion *compl)
1845 {
1846 	struct nvme_feature_obj *feat;
1847 	uint32_t nsid = command->nsid;
1848 	uint8_t fid = command->cdw10 & 0xFF;
1849 
1850 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1851 
1852 	if (fid >= NVME_FID_MAX) {
1853 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1854 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1855 		return (1);
1856 	}
1857 	feat = &sc->feat[fid];
1858 
1859 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1860 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1861 		return (1);
1862 	}
1863 
1864 	if (!feat->namespace_specific &&
1865 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1866 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1867 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1868 		return (1);
1869 	}
1870 
1871 	compl->cdw0 = 0;
1872 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1873 
1874 	if (feat->set)
1875 		feat->set(sc, feat, command, compl);
1876 
1877 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1878 	if (compl->status == NVME_SC_SUCCESS) {
1879 		feat->cdw11 = command->cdw11;
1880 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1881 		    (command->cdw11 != 0))
1882 			pci_nvme_aen_notify(sc);
1883 	}
1884 
1885 	return (0);
1886 }
1887 
1888 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1889 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1890 
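/*
 * Get Features: invoke the per-feature "get" callback if one is registered,
 * then return either the previously saved CDW11 in Dword 0 of the completion
 * or the namespace-specific capability bit when the select field requests
 * supported capabilities.
 */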
1891 static int
1892 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1893 	struct nvme_completion* compl)
1894 {
1895 	struct nvme_feature_obj *feat;
1896 	uint8_t fid = command->cdw10 & 0xFF;
1897 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1898 
1899 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1900 
1901 	if (fid >= NVME_FID_MAX) {
1902 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1903 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1904 		return (1);
1905 	}
1906 
1907 	compl->cdw0 = 0;
1908 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1909 
1910 	feat = &sc->feat[fid];
1911 	if (feat->get) {
1912 		feat->get(sc, feat, command, compl);
1913 	}
1914 
1915 	if (compl->status == NVME_SC_SUCCESS) {
1916 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1917 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1918 		else
1919 			compl->cdw0 = feat->cdw11;
1920 	}
1921 
1922 	return (0);
1923 }
1924 
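/*
 * Format NVM: only Secure Erase Settings of none or User Data Erase, LBA
 * Format 0, and no Protection Information are accepted. RAM-backed storage
 * is simply reallocated; blockif-backed storage is erased by issuing a
 * delete (trim) over the whole device, completed asynchronously via
 * pci_nvme_io_done.
 */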
1925 static int
1926 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1927 	struct nvme_completion* compl)
1928 {
1929 	uint8_t	ses, lbaf, pi;
1930 
1931 	/* Only supports Secure Erase Setting - User Data Erase */
1932 	ses = (command->cdw10 >> 9) & 0x7;
1933 	if (ses > 0x1) {
1934 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1935 		return (1);
1936 	}
1937 
1938 	/* Only supports a single LBA Format */
1939 	lbaf = command->cdw10 & 0xf;
1940 	if (lbaf != 0) {
1941 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1942 		    NVME_SC_INVALID_FORMAT);
1943 		return (1);
1944 	}
1945 
1946 	/* Doesn't support Protection Information */
1947 	pi = (command->cdw10 >> 5) & 0x7;
1948 	if (pi != 0) {
1949 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1950 		return (1);
1951 	}
1952 
1953 	if (sc->nvstore.type == NVME_STOR_RAM) {
1954 		if (sc->nvstore.ctx)
1955 			free(sc->nvstore.ctx);
1956 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1957 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1958 	} else {
1959 		struct pci_nvme_ioreq *req;
1960 		int err;
1961 
1962 		req = pci_nvme_get_ioreq(sc);
1963 		if (req == NULL) {
1964 			pci_nvme_status_genc(&compl->status,
1965 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1966 			WPRINTF("%s: unable to allocate IO req", __func__);
1967 			return (1);
1968 		}
1969 		req->nvme_sq = &sc->submit_queues[0];
1970 		req->sqid = 0;
1971 		req->opc = command->opc;
1972 		req->cid = command->cid;
1973 		req->nsid = command->nsid;
1974 
1975 		req->io_req.br_offset = 0;
1976 		req->io_req.br_resid = sc->nvstore.size;
1977 		req->io_req.br_callback = pci_nvme_io_done;
1978 
1979 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1980 		if (err) {
1981 			pci_nvme_status_genc(&compl->status,
1982 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1983 			pci_nvme_release_ioreq(sc, req);
1984 		} else
1985 			compl->status = NVME_NO_STATUS;
1986 	}
1987 
1988 	return (1);
1989 }
1990 
1991 static int
1992 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
1993     struct nvme_completion *compl)
1994 {
1995 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1996 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1997 
1998 	/* TODO: search for the command ID and abort it */
1999 
2000 	compl->cdw0 = 1;
2001 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2002 	return (1);
2003 }
2004 
2005 static int
2006 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2007 	struct nvme_command* command, struct nvme_completion* compl)
2008 {
2009 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2010 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2011 
2012 	/* Don't exceed the Async Event Request Limit (AERL). */
2013 	if (pci_nvme_aer_limit_reached(sc)) {
2014 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2015 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2016 		return (1);
2017 	}
2018 
2019 	if (pci_nvme_aer_add(sc, command->cid)) {
2020 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2021 				NVME_SC_INTERNAL_DEVICE_ERROR);
2022 		return (1);
2023 	}
2024 
2025 	/*
2026 	 * Events are raised asynchronously as they occur, subject to the
2027 	 * Async Event Configuration feature (Set Features). Defer completion
2028 	 * of this command until a matching event is delivered.
2029 	 */
2030 	compl->status = NVME_NO_STATUS;
2031 	pci_nvme_aen_notify(sc);
2032 
2033 	return (0);
2034 }
2035 
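/*
 * Process Admin Submission Queue entries from the current head up to the
 * doorbell-supplied tail, dispatch each opcode, and post completions to
 * Admin Completion Queue 0, generating an MSI-X interrupt when new
 * completions are pending.
 */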
2036 static void
2037 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2038 {
2039 	struct nvme_completion compl;
2040 	struct nvme_command *cmd;
2041 	struct nvme_submission_queue *sq;
2042 	struct nvme_completion_queue *cq;
2043 	uint16_t sqhead;
2044 
2045 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2046 
2047 	sq = &sc->submit_queues[0];
2048 	cq = &sc->compl_queues[0];
2049 
2050 	pthread_mutex_lock(&sq->mtx);
2051 
2052 	sqhead = sq->head;
2053 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2054 
2055 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2056 		cmd = &(sq->qbase)[sqhead];
2057 		compl.cdw0 = 0;
2058 		compl.status = 0;
2059 
2060 		switch (cmd->opc) {
2061 		case NVME_OPC_DELETE_IO_SQ:
2062 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2063 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2064 			break;
2065 		case NVME_OPC_CREATE_IO_SQ:
2066 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2067 			nvme_opc_create_io_sq(sc, cmd, &compl);
2068 			break;
2069 		case NVME_OPC_DELETE_IO_CQ:
2070 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2071 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2072 			break;
2073 		case NVME_OPC_CREATE_IO_CQ:
2074 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2075 			nvme_opc_create_io_cq(sc, cmd, &compl);
2076 			break;
2077 		case NVME_OPC_GET_LOG_PAGE:
2078 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2079 			nvme_opc_get_log_page(sc, cmd, &compl);
2080 			break;
2081 		case NVME_OPC_IDENTIFY:
2082 			DPRINTF("%s command IDENTIFY", __func__);
2083 			nvme_opc_identify(sc, cmd, &compl);
2084 			break;
2085 		case NVME_OPC_ABORT:
2086 			DPRINTF("%s command ABORT", __func__);
2087 			nvme_opc_abort(sc, cmd, &compl);
2088 			break;
2089 		case NVME_OPC_SET_FEATURES:
2090 			DPRINTF("%s command SET_FEATURES", __func__);
2091 			nvme_opc_set_features(sc, cmd, &compl);
2092 			break;
2093 		case NVME_OPC_GET_FEATURES:
2094 			DPRINTF("%s command GET_FEATURES", __func__);
2095 			nvme_opc_get_features(sc, cmd, &compl);
2096 			break;
2097 		case NVME_OPC_FIRMWARE_ACTIVATE:
2098 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2099 			pci_nvme_status_tc(&compl.status,
2100 			    NVME_SCT_COMMAND_SPECIFIC,
2101 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2102 			break;
2103 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2104 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2105 			nvme_opc_async_event_req(sc, cmd, &compl);
2106 			break;
2107 		case NVME_OPC_FORMAT_NVM:
2108 			DPRINTF("%s command FORMAT_NVM", __func__);
2109 			if ((sc->ctrldata.oacs &
2110 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2111 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2112 				break;
2113 			}
2114 			nvme_opc_format_nvm(sc, cmd, &compl);
2115 			break;
2116 		case NVME_OPC_SECURITY_SEND:
2117 		case NVME_OPC_SECURITY_RECEIVE:
2118 		case NVME_OPC_SANITIZE:
2119 		case NVME_OPC_GET_LBA_STATUS:
2120 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2121 			    cmd->opc);
2122 			/* Valid but unsupported opcodes */
2123 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2124 			break;
2125 		default:
2126 			DPRINTF("%s command OPC=%#X (not implemented)",
2127 			    __func__,
2128 			    cmd->opc);
2129 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2130 		}
2131 		sqhead = (sqhead + 1) % sq->size;
2132 
2133 		if (NVME_COMPLETION_VALID(compl)) {
2134 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2135 			    compl.cdw0,
2136 			    cmd->cid,
2137 			    0,		/* SQID */
2138 			    compl.status);
2139 		}
2140 	}
2141 
2142 	DPRINTF("setting sqhead %u", sqhead);
2143 	sq->head = sqhead;
2144 
2145 	if (cq->head != cq->tail)
2146 		pci_generate_msix(sc->nsc_pi, 0);
2147 
2148 	pthread_mutex_unlock(&sq->mtx);
2149 }
2150 
2151 /*
2152  * Update the Write and Read statistics reported in SMART data
2153  *
2154  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2155  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2156  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2157  */
2158 static void
2159 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2160     size_t bytes, uint16_t status)
2161 {
2162 
2163 	pthread_mutex_lock(&sc->mtx);
2164 	switch (opc) {
2165 	case NVME_OPC_WRITE:
2166 		sc->write_commands++;
2167 		if (status != NVME_SC_SUCCESS)
2168 			break;
2169 		sc->write_dunits_remainder += (bytes / 512);
2170 		while (sc->write_dunits_remainder >= 1000) {
2171 			sc->write_data_units++;
2172 			sc->write_dunits_remainder -= 1000;
2173 		}
2174 		break;
2175 	case NVME_OPC_READ:
2176 		sc->read_commands++;
2177 		if (status != NVME_SC_SUCCESS)
2178 			break;
2179 		sc->read_dunits_remainder += (bytes / 512);
2180 		while (sc->read_dunits_remainder >= 1000) {
2181 			sc->read_data_units++;
2182 			sc->read_dunits_remainder -= 1000;
2183 		}
2184 		break;
2185 	default:
2186 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2187 		break;
2188 	}
2189 	pthread_mutex_unlock(&sc->mtx);
2190 }
2191 
2192 /*
2193  * Check if the combination of Starting LBA (slba) and number of blocks
2194  * exceeds the range of the underlying storage.
2195  *
2196  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2197  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2198  * overflow.
2199  */
2200 static bool
2201 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2202     uint32_t nblocks)
2203 {
2204 	size_t	offset, bytes;
2205 
2206 	/* Overflow check of multiplying Starting LBA by the sector size */
2207 	if (slba >> (64 - nvstore->sectsz_bits))
2208 		return (true);
2209 
2210 	offset = slba << nvstore->sectsz_bits;
2211 	bytes = nblocks << nvstore->sectsz_bits;
2212 
2213 	/* Overflow check of Number of Logical Blocks */
2214 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2215 		return (true);
2216 
2217 	return (false);
2218 }
2219 
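/*
 * Add a guest physical address range to the blockif request's I/O vector.
 * Contiguous ranges are merged with the previous entry to minimize the IOV
 * count, which is capped at NVME_MAX_IOVEC.
 */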
2220 static int
2221 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2222 	uint64_t gpaddr, size_t size, int do_write, uint64_t offset)
2223 {
2224 	int iovidx;
2225 	bool range_is_contiguous;
2226 
2227 	if (req == NULL)
2228 		return (-1);
2229 
2230 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2231 		return (-1);
2232 	}
2233 
2234 	/*
2235 	 * Minimize the number of IOVs by concatenating contiguous address
2236 	 * ranges. If the IOV count is zero, there is no previous range to
2237 	 * concatenate.
2238 	 */
2239 	if (req->io_req.br_iovcnt == 0)
2240 		range_is_contiguous = false;
2241 	else
2242 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2243 
2244 	if (range_is_contiguous) {
2245 		iovidx = req->io_req.br_iovcnt - 1;
2246 
2247 		req->io_req.br_iov[iovidx].iov_base =
2248 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2249 				     req->prev_gpaddr, size);
2250 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2251 			return (-1);
2252 
2253 		req->prev_size += size;
2254 		req->io_req.br_resid += size;
2255 
2256 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2257 	} else {
2258 		iovidx = req->io_req.br_iovcnt;
2259 		if (iovidx == 0) {
2260 			req->io_req.br_offset = offset;
2261 			req->io_req.br_resid = 0;
2262 			req->io_req.br_param = req;
2263 		}
2264 
2265 		req->io_req.br_iov[iovidx].iov_base =
2266 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2267 				     gpaddr, size);
2268 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2269 			return (-1);
2270 
2271 		req->io_req.br_iov[iovidx].iov_len = size;
2272 
2273 		req->prev_gpaddr = gpaddr;
2274 		req->prev_size = size;
2275 		req->io_req.br_resid += size;
2276 
2277 		req->io_req.br_iovcnt++;
2278 	}
2279 
2280 	return (0);
2281 }
2282 
2283 static void
2284 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2285     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2286 {
2287 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2288 
2289 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2290 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2291 		 NVME_STATUS_GET_SC(status));
2292 
2293 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2294 
2295 	if (cq->head != cq->tail) {
2296 		if (cq->intr_en & NVME_CQ_INTEN) {
2297 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2298 		} else {
2299 			DPRINTF("%s: CQ%u interrupt disabled",
2300 						__func__, sq->cqid);
2301 		}
2302 	}
2303 }
2304 
2305 static void
2306 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2307 {
2308 	req->sc = NULL;
2309 	req->nvme_sq = NULL;
2310 	req->sqid = 0;
2311 
2312 	pthread_mutex_lock(&sc->mtx);
2313 
2314 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2315 	sc->pending_ios--;
2316 
2317 	/* With no I/O pending, set ready if the device is enabled but not yet ready (e.g. after a reset) */
2318 	if (sc->pending_ios == 0 &&
2319 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2320 		sc->regs.csts |= NVME_CSTS_RDY;
2321 
2322 	pthread_mutex_unlock(&sc->mtx);
2323 
2324 	sem_post(&sc->iosemlock);
2325 }
2326 
2327 static struct pci_nvme_ioreq *
2328 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2329 {
2330 	struct pci_nvme_ioreq *req = NULL;
2331 
2332 	sem_wait(&sc->iosemlock);
2333 	pthread_mutex_lock(&sc->mtx);
2334 
2335 	req = STAILQ_FIRST(&sc->ioreqs_free);
2336 	assert(req != NULL);
2337 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2338 
2339 	req->sc = sc;
2340 
2341 	sc->pending_ios++;
2342 
2343 	pthread_mutex_unlock(&sc->mtx);
2344 
2345 	req->io_req.br_iovcnt = 0;
2346 	req->io_req.br_offset = 0;
2347 	req->io_req.br_resid = 0;
2348 	req->io_req.br_param = req;
2349 	req->prev_gpaddr = 0;
2350 	req->prev_size = 0;
2351 
2352 	return req;
2353 }
2354 
2355 static void
2356 pci_nvme_io_done(struct blockif_req *br, int err)
2357 {
2358 	struct pci_nvme_ioreq *req = br->br_param;
2359 	struct nvme_submission_queue *sq = req->nvme_sq;
2360 	uint16_t code, status;
2361 
2362 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2363 
2364 	/* TODO return correct error */
2365 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2366 	pci_nvme_status_genc(&status, code);
2367 
2368 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2369 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2370 	    req->bytes, status);
2371 	pci_nvme_release_ioreq(req->sc, req);
2372 }
2373 
2374 /*
2375  * Implements the Flush command. The specification states:
2376  *    If a volatile write cache is not present, Flush commands complete
2377  *    successfully and have no effect
2378  * in the description of the Volatile Write Cache (VWC) field of the Identify
2379  * Controller data. Therefore, set status to Success if the command is
2380  * not supported (i.e. RAM or as indicated by the blockif).
2381  */
2382 static bool
2383 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2384     struct nvme_command *cmd __unused,
2385     struct pci_nvme_blockstore *nvstore,
2386     struct pci_nvme_ioreq *req,
2387     uint16_t *status)
2388 {
2389 	bool pending = false;
2390 
2391 	if (nvstore->type == NVME_STOR_RAM) {
2392 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2393 	} else {
2394 		int err;
2395 
2396 		req->io_req.br_callback = pci_nvme_io_done;
2397 
2398 		err = blockif_flush(nvstore->ctx, &req->io_req);
2399 		switch (err) {
2400 		case 0:
2401 			pending = true;
2402 			break;
2403 		case EOPNOTSUPP:
2404 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2405 			break;
2406 		default:
2407 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2408 		}
2409 	}
2410 
2411 	return (pending);
2412 }
2413 
2414 static uint16_t
2415 nvme_write_read_ram(struct pci_nvme_softc *sc,
2416     struct pci_nvme_blockstore *nvstore,
2417     uint64_t prp1, uint64_t prp2,
2418     size_t offset, uint64_t bytes,
2419     bool is_write)
2420 {
2421 	uint8_t *buf = nvstore->ctx;
2422 	enum nvme_copy_dir dir;
2423 	uint16_t status;
2424 
2425 	if (is_write)
2426 		dir = NVME_COPY_TO_PRP;
2427 	else
2428 		dir = NVME_COPY_FROM_PRP;
2429 
2430 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2431 	    buf + offset, bytes, dir))
2432 		pci_nvme_status_genc(&status,
2433 		    NVME_SC_DATA_TRANSFER_ERROR);
2434 	else
2435 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2436 
2437 	return (status);
2438 }
2439 
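/*
 * Build a blockif read/write from the command's PRP entries: PRP1 maps the
 * first (possibly unaligned) page, while PRP2 is either the second page or,
 * for larger transfers, points to a PRP list whose final entry chains to the
 * next list page.
 */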
2440 static uint16_t
2441 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2442     struct pci_nvme_blockstore *nvstore,
2443     struct pci_nvme_ioreq *req,
2444     uint64_t prp1, uint64_t prp2,
2445     size_t offset, uint64_t bytes,
2446     bool is_write)
2447 {
2448 	uint64_t size;
2449 	int err;
2450 	uint16_t status = NVME_NO_STATUS;
2451 
2452 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2453 	if (pci_nvme_append_iov_req(sc, req, prp1,
2454 	    size, is_write, offset)) {
2455 		err = -1;
2456 		goto out;
2457 	}
2458 
2459 	offset += size;
2460 	bytes  -= size;
2461 
2462 	if (bytes == 0) {
2463 		;
2464 	} else if (bytes <= PAGE_SIZE) {
2465 		size = bytes;
2466 		if (pci_nvme_append_iov_req(sc, req, prp2,
2467 		    size, is_write, offset)) {
2468 			err = -1;
2469 			goto out;
2470 		}
2471 	} else {
2472 		void *vmctx = sc->nsc_pi->pi_vmctx;
2473 		uint64_t *prp_list = &prp2;
2474 		uint64_t *last = prp_list;
2475 
2476 		/* PRP2 is pointer to a physical region page list */
2477 		while (bytes) {
2478 			/* Last entry in list points to the next list */
2479 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2480 				uint64_t prp = *prp_list;
2481 
2482 				prp_list = paddr_guest2host(vmctx, prp,
2483 				    PAGE_SIZE - (prp % PAGE_SIZE));
2484 				if (prp_list == NULL) {
2485 					err = -1;
2486 					goto out;
2487 				}
2488 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2489 			}
2490 
2491 			size = MIN(bytes, PAGE_SIZE);
2492 
2493 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2494 			    size, is_write, offset)) {
2495 				err = -1;
2496 				goto out;
2497 			}
2498 
2499 			offset += size;
2500 			bytes  -= size;
2501 
2502 			prp_list++;
2503 		}
2504 	}
2505 	req->io_req.br_callback = pci_nvme_io_done;
2506 	if (is_write)
2507 		err = blockif_write(nvstore->ctx, &req->io_req);
2508 	else
2509 		err = blockif_read(nvstore->ctx, &req->io_req);
2510 out:
2511 	if (err)
2512 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2513 
2514 	return (status);
2515 }
2516 
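/*
 * NVM Read/Write: decode the starting LBA and block count from CDW10-12,
 * reject transfers exceeding MDTS or the namespace capacity, and dispatch to
 * either the RAM copy path or the asynchronous blockif path.
 */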
2517 static bool
2518 nvme_opc_write_read(struct pci_nvme_softc *sc,
2519     struct nvme_command *cmd,
2520     struct pci_nvme_blockstore *nvstore,
2521     struct pci_nvme_ioreq *req,
2522     uint16_t *status)
2523 {
2524 	uint64_t lba, nblocks, bytes;
2525 	size_t offset;
2526 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2527 	bool pending = false;
2528 
2529 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2530 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2531 	bytes = nblocks << nvstore->sectsz_bits;
2532 	if (bytes > NVME_MAX_DATA_SIZE) {
2533 		WPRINTF("%s command would exceed MDTS", __func__);
2534 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2535 		goto out;
2536 	}
2537 
2538 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2539 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2540 		    __func__, lba, nblocks);
2541 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2542 		goto out;
2543 	}
2544 
2545 	offset = lba << nvstore->sectsz_bits;
2546 
2547 	req->bytes = bytes;
2548 	req->io_req.br_offset = lba;
2549 
2550 	/* PRP bits 1:0 must be zero */
2551 	cmd->prp1 &= ~0x3UL;
2552 	cmd->prp2 &= ~0x3UL;
2553 
2554 	if (nvstore->type == NVME_STOR_RAM) {
2555 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2556 		    cmd->prp2, offset, bytes, is_write);
2557 	} else {
2558 		*status = nvme_write_read_blockif(sc, nvstore, req,
2559 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2560 
2561 		if (*status == NVME_NO_STATUS)
2562 			pending = true;
2563 	}
2564 out:
2565 	if (!pending)
2566 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2567 
2568 	return (pending);
2569 }
2570 
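/*
 * Deallocate (trim) state machine: blockif completion callback which issues
 * one blockif_delete per DSM range, using prev_gpaddr as the index of the
 * current range and prev_size as the total number of ranges.
 */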
2571 static void
2572 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2573 {
2574 	struct pci_nvme_ioreq *req = br->br_param;
2575 	struct pci_nvme_softc *sc = req->sc;
2576 	bool done = true;
2577 	uint16_t status;
2578 
2579 	if (err) {
2580 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2581 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2582 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2583 	} else {
2584 		struct iovec *iov = req->io_req.br_iov;
2585 
2586 		req->prev_gpaddr++;
2587 		iov += req->prev_gpaddr;
2588 
2589 		/* The iov_* values already include the sector size */
2590 		req->io_req.br_offset = (off_t)iov->iov_base;
2591 		req->io_req.br_resid = iov->iov_len;
2592 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2593 			pci_nvme_status_genc(&status,
2594 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2595 		} else
2596 			done = false;
2597 	}
2598 
2599 	if (done) {
2600 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2601 		    status);
2602 		pci_nvme_release_ioreq(sc, req);
2603 	}
2604 }
2605 
2606 static bool
2607 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2608     struct nvme_command *cmd,
2609     struct pci_nvme_blockstore *nvstore,
2610     struct pci_nvme_ioreq *req,
2611     uint16_t *status)
2612 {
2613 	struct nvme_dsm_range *range = NULL;
2614 	uint32_t nr, r, non_zero, dr;
2615 	int err;
2616 	bool pending = false;
2617 
2618 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2619 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2620 		goto out;
2621 	}
2622 
2623 	nr = cmd->cdw10 & 0xff;
2624 
2625 	/* copy locally because a range entry could straddle PRPs */
2626 	range = calloc(1, NVME_MAX_DSM_TRIM);
2627 	if (range == NULL) {
2628 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2629 		goto out;
2630 	}
2631 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2632 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2633 
2634 	/* Check for invalid ranges and the number of non-zero lengths */
2635 	non_zero = 0;
2636 	for (r = 0; r <= nr; r++) {
2637 		if (pci_nvme_out_of_range(nvstore,
2638 		    range[r].starting_lba, range[r].length)) {
2639 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2640 			goto out;
2641 		}
2642 		if (range[r].length != 0)
2643 			non_zero++;
2644 	}
2645 
2646 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2647 		size_t offset, bytes;
2648 		int sectsz_bits = sc->nvstore.sectsz_bits;
2649 
2650 		/*
2651 		 * DSM calls are advisory only, and compliant controllers
2652 		 * may choose to take no actions (i.e. return Success).
2653 		 */
2654 		if (!nvstore->deallocate) {
2655 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2656 			goto out;
2657 		}
2658 
2659 		/* If all ranges have a zero length, return Success */
2660 		if (non_zero == 0) {
2661 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2662 			goto out;
2663 		}
2664 
2665 		if (req == NULL) {
2666 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2667 			goto out;
2668 		}
2669 
2670 		offset = range[0].starting_lba << sectsz_bits;
2671 		bytes = range[0].length << sectsz_bits;
2672 
2673 		/*
2674 		 * If the request is for more than a single range, store
2675 		 * the ranges in the br_iov. Optimize for the common case
2676 		 * of a single range.
2677 		 *
2678 		 * Note that NVMe Number of Ranges is a zero based value
2679 		 */
2680 		req->io_req.br_iovcnt = 0;
2681 		req->io_req.br_offset = offset;
2682 		req->io_req.br_resid = bytes;
2683 
2684 		if (nr == 0) {
2685 			req->io_req.br_callback = pci_nvme_io_done;
2686 		} else {
2687 			struct iovec *iov = req->io_req.br_iov;
2688 
2689 			for (r = 0, dr = 0; r <= nr; r++) {
2690 				offset = range[r].starting_lba << sectsz_bits;
2691 				bytes = range[r].length << sectsz_bits;
2692 				if (bytes == 0)
2693 					continue;
2694 
2695 				if ((nvstore->size - offset) < bytes) {
2696 					pci_nvme_status_genc(status,
2697 					    NVME_SC_LBA_OUT_OF_RANGE);
2698 					goto out;
2699 				}
2700 				iov[dr].iov_base = (void *)offset;
2701 				iov[dr].iov_len = bytes;
2702 				dr++;
2703 			}
2704 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2705 
2706 			/*
2707 			 * Use prev_gpaddr to track the current entry and
2708 			 * prev_size to track the number of entries
2709 			 */
2710 			req->prev_gpaddr = 0;
2711 			req->prev_size = dr;
2712 		}
2713 
2714 		err = blockif_delete(nvstore->ctx, &req->io_req);
2715 		if (err)
2716 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2717 		else
2718 			pending = true;
2719 	}
2720 out:
2721 	free(range);
2722 	return (pending);
2723 }
2724 
2725 static void
2726 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2727 {
2728 	struct nvme_submission_queue *sq;
2729 	uint16_t status;
2730 	uint16_t sqhead;
2731 
2732 	/* handle all submissions up to sq->tail index */
2733 	sq = &sc->submit_queues[idx];
2734 
2735 	pthread_mutex_lock(&sq->mtx);
2736 
2737 	sqhead = sq->head;
2738 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2739 	         idx, sqhead, sq->tail, sq->qbase);
2740 
2741 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2742 		struct nvme_command *cmd;
2743 		struct pci_nvme_ioreq *req;
2744 		uint32_t nsid;
2745 		bool pending;
2746 
2747 		pending = false;
2748 		req = NULL;
2749 		status = 0;
2750 
2751 		cmd = &sq->qbase[sqhead];
2752 		sqhead = (sqhead + 1) % sq->size;
2753 
2754 		nsid = le32toh(cmd->nsid);
2755 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2756 			pci_nvme_status_genc(&status,
2757 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2758 			status |=
2759 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2760 			goto complete;
2761 		}
2762 
2763 		req = pci_nvme_get_ioreq(sc);
2764 		if (req == NULL) {
2765 			pci_nvme_status_genc(&status,
2766 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2767 			WPRINTF("%s: unable to allocate IO req", __func__);
2768 			goto complete;
2769 		}
2770 		req->nvme_sq = sq;
2771 		req->sqid = idx;
2772 		req->opc = cmd->opc;
2773 		req->cid = cmd->cid;
2774 		req->nsid = cmd->nsid;
2775 
2776 		switch (cmd->opc) {
2777 		case NVME_OPC_FLUSH:
2778 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2779 			    req, &status);
2780 			break;
2781 		case NVME_OPC_WRITE:
2782 		case NVME_OPC_READ:
2783 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2784 			    req, &status);
2785 			break;
2786 		case NVME_OPC_WRITE_ZEROES:
2787 			/* TODO: write zeroes
2788 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2789 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2790 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2791 			break;
2792 		case NVME_OPC_DATASET_MANAGEMENT:
2793 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2794 			    req, &status);
2795 			break;
2796 		default:
2797 			WPRINTF("%s unhandled io command 0x%x",
2798 			    __func__, cmd->opc);
2799 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2800 		}
2801 complete:
2802 		if (!pending) {
2803 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2804 			if (req != NULL)
2805 				pci_nvme_release_ioreq(sc, req);
2806 		}
2807 	}
2808 
2809 	sq->head = sqhead;
2810 
2811 	pthread_mutex_unlock(&sq->mtx);
2812 }
2813 
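/*
 * Doorbell writes: an SQ doorbell updates the queue's tail and kicks command
 * processing (Admin for queue 0, I/O otherwise); a CQ doorbell only records
 * the guest's new head pointer.
 */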
2814 static void
2815 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc,
2816 	uint64_t idx, int is_sq, uint64_t value)
2817 {
2818 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2819 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2820 
2821 	if (is_sq) {
2822 		if (idx > sc->num_squeues) {
2823 			WPRINTF("%s queue index %lu overflow from "
2824 			         "guest (max %u)",
2825 			         __func__, idx, sc->num_squeues);
2826 			return;
2827 		}
2828 
2829 		atomic_store_short(&sc->submit_queues[idx].tail,
2830 		                   (uint16_t)value);
2831 
2832 		if (idx == 0) {
2833 			pci_nvme_handle_admin_cmd(sc, value);
2834 		} else {
2835 			/* submission queue; handle new entries in SQ */
2836 			if (idx > sc->num_squeues) {
2837 				WPRINTF("%s SQ index %lu overflow from "
2838 				         "guest (max %u)",
2839 				         __func__, idx, sc->num_squeues);
2840 				return;
2841 			}
2842 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2843 		}
2844 	} else {
2845 		if (idx > sc->num_cqueues) {
2846 			WPRINTF("%s queue index %lu overflow from "
2847 			         "guest (max %u)",
2848 			         __func__, idx, sc->num_cqueues);
2849 			return;
2850 		}
2851 
2852 		atomic_store_short(&sc->compl_queues[idx].head,
2853 				(uint16_t)value);
2854 	}
2855 }
2856 
2857 static void
2858 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2859 {
2860 	const char *s = iswrite ? "WRITE" : "READ";
2861 
2862 	switch (offset) {
2863 	case NVME_CR_CAP_LOW:
2864 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2865 		break;
2866 	case NVME_CR_CAP_HI:
2867 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2868 		break;
2869 	case NVME_CR_VS:
2870 		DPRINTF("%s %s NVME_CR_VS", func, s);
2871 		break;
2872 	case NVME_CR_INTMS:
2873 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2874 		break;
2875 	case NVME_CR_INTMC:
2876 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2877 		break;
2878 	case NVME_CR_CC:
2879 		DPRINTF("%s %s NVME_CR_CC", func, s);
2880 		break;
2881 	case NVME_CR_CSTS:
2882 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2883 		break;
2884 	case NVME_CR_NSSR:
2885 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2886 		break;
2887 	case NVME_CR_AQA:
2888 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2889 		break;
2890 	case NVME_CR_ASQ_LOW:
2891 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2892 		break;
2893 	case NVME_CR_ASQ_HI:
2894 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2895 		break;
2896 	case NVME_CR_ACQ_LOW:
2897 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2898 		break;
2899 	case NVME_CR_ACQ_HI:
2900 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2901 		break;
2902 	default:
2903 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2904 	}
2905 
2906 }
2907 
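/*
 * BAR0 writes at or above NVME_DOORBELL_OFFSET are doorbell rings, ignored
 * until the controller reports ready; all other offsets must be 4-byte
 * register writes and are handled under the softc mutex.
 */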
2908 static void
2909 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2910 	uint64_t offset, int size, uint64_t value)
2911 {
2912 	uint32_t ccreg;
2913 
2914 	if (offset >= NVME_DOORBELL_OFFSET) {
2915 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2916 		uint64_t idx = belloffset / 8; /* 4-byte SQ tail + 4-byte CQ head doorbell per queue pair */
2917 		int is_sq = (belloffset % 8) < 4;
2918 
2919 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2920 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2921 			    offset);
2922 			return;
2923 		}
2924 
2925 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2926 			WPRINTF("guest attempted an overflow write offset "
2927 			         "0x%lx, val 0x%lx in %s",
2928 			         offset, value, __func__);
2929 			return;
2930 		}
2931 
2932 		if (is_sq) {
2933 			if (sc->submit_queues[idx].qbase == NULL)
2934 				return;
2935 		} else if (sc->compl_queues[idx].qbase == NULL)
2936 			return;
2937 
2938 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2939 		return;
2940 	}
2941 
2942 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2943 	        offset, size, value);
2944 
2945 	if (size != 4) {
2946 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2947 		         "val 0x%lx) to bar0 in %s",
2948 		         size, offset, value, __func__);
2949 		/* TODO: shutdown device */
2950 		return;
2951 	}
2952 
2953 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2954 
2955 	pthread_mutex_lock(&sc->mtx);
2956 
2957 	switch (offset) {
2958 	case NVME_CR_CAP_LOW:
2959 	case NVME_CR_CAP_HI:
2960 		/* readonly */
2961 		break;
2962 	case NVME_CR_VS:
2963 		/* readonly */
2964 		break;
2965 	case NVME_CR_INTMS:
2966 		/* MSI-X, so ignore */
2967 		break;
2968 	case NVME_CR_INTMC:
2969 		/* MSI-X, so ignore */
2970 		break;
2971 	case NVME_CR_CC:
2972 		ccreg = (uint32_t)value;
2973 
2974 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2975 		         "iocqes %u",
2976 		        __func__,
2977 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2978 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2979 			 NVME_CC_GET_IOCQES(ccreg));
2980 
2981 		if (NVME_CC_GET_SHN(ccreg)) {
2982 			/* perform shutdown - flush out data to backend */
2983 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2984 			    NVME_CSTS_REG_SHST_SHIFT);
2985 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2986 			    NVME_CSTS_REG_SHST_SHIFT;
2987 		}
2988 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2989 			if (NVME_CC_GET_EN(ccreg) == 0)
2990 				/* transition 1->0 causes controller reset */
2991 				pci_nvme_reset_locked(sc);
2992 			else
2993 				pci_nvme_init_controller(ctx, sc);
2994 		}
2995 
2996 		/* Insert the iocqes, iosqes and en bits from the write */
2997 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2998 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2999 		if (NVME_CC_GET_EN(ccreg) == 0) {
3000 			/* Insert the ams, mps and css bit fields */
3001 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3002 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3003 			sc->regs.csts &= ~NVME_CSTS_RDY;
3004 		} else if ((sc->pending_ios == 0) &&
3005 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3006 			sc->regs.csts |= NVME_CSTS_RDY;
3007 		}
3008 		break;
3009 	case NVME_CR_CSTS:
3010 		break;
3011 	case NVME_CR_NSSR:
3012 		/* ignore writes; don't support subsystem reset */
3013 		break;
3014 	case NVME_CR_AQA:
3015 		sc->regs.aqa = (uint32_t)value;
3016 		break;
3017 	case NVME_CR_ASQ_LOW:
3018 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3019 		               (0xFFFFF000 & value);
3020 		break;
3021 	case NVME_CR_ASQ_HI:
3022 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3023 		               (value << 32);
3024 		break;
3025 	case NVME_CR_ACQ_LOW:
3026 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3027 		               (0xFFFFF000 & value);
3028 		break;
3029 	case NVME_CR_ACQ_HI:
3030 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3031 		               (value << 32);
3032 		break;
3033 	default:
3034 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3035 		         __func__, offset, value, size);
3036 	}
3037 	pthread_mutex_unlock(&sc->mtx);
3038 }
3039 
3040 static void
3041 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi,
3042     int baridx, uint64_t offset, int size, uint64_t value)
3043 {
3044 	struct pci_nvme_softc* sc = pi->pi_arg;
3045 
3046 	if (baridx == pci_msix_table_bar(pi) ||
3047 	    baridx == pci_msix_pba_bar(pi)) {
3048 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3049 		         " value 0x%lx", baridx, offset, size, value);
3050 
3051 		pci_emul_msix_twrite(pi, offset, size, value);
3052 		return;
3053 	}
3054 
3055 	switch (baridx) {
3056 	case 0:
3057 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3058 		break;
3059 
3060 	default:
3061 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3062 		         __func__, baridx, value);
3063 	}
3064 }
3065 
3066 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3067 	uint64_t offset, int size)
3068 {
3069 	uint64_t value;
3070 
3071 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3072 
3073 	if (offset < NVME_DOORBELL_OFFSET) {
3074 		void *p = &(sc->regs);
3075 		pthread_mutex_lock(&sc->mtx);
3076 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3077 		pthread_mutex_unlock(&sc->mtx);
3078 	} else {
3079 		value = 0;
3080 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3081 	}
3082 
3083 	switch (size) {
3084 	case 1:
3085 		value &= 0xFF;
3086 		break;
3087 	case 2:
3088 		value &= 0xFFFF;
3089 		break;
3090 	case 4:
3091 		value &= 0xFFFFFFFF;
3092 		break;
3093 	}
3094 
3095 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3096 	         offset, size, (uint32_t)value);
3097 
3098 	return (value);
3099 }
3100 
3101 
3102 
3103 static uint64_t
3104 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused,
3105     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3106 {
3107 	struct pci_nvme_softc* sc = pi->pi_arg;
3108 
3109 	if (baridx == pci_msix_table_bar(pi) ||
3110 	    baridx == pci_msix_pba_bar(pi)) {
3111 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3112 		        baridx, offset, size);
3113 
3114 		return pci_emul_msix_tread(pi, offset, size);
3115 	}
3116 
3117 	switch (baridx) {
3118 	case 0:
3119 		return pci_nvme_read_bar_0(sc, offset, size);
3120 
3121 	default:
3122 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3123 	}
3124 
3125 	return (0);
3126 }
3127 
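/*
 * Parse the device options (maxq, qsz, ioslots, sectsz, ser, eui64, dsm, and
 * either a ram= size or a blockif backing store) and initialize the backing
 * storage geometry.
 */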
3128 static int
3129 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3130 {
3131 	char bident[sizeof("XX:X:X")];
3132 	const char *value;
3133 	uint32_t sectsz;
3134 
3135 	sc->max_queues = NVME_QUEUES;
3136 	sc->max_qentries = NVME_MAX_QENTRIES;
3137 	sc->ioslots = NVME_IOSLOTS;
3138 	sc->num_squeues = sc->max_queues;
3139 	sc->num_cqueues = sc->max_queues;
3140 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3141 	sectsz = 0;
3142 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3143 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3144 
3145 	value = get_config_value_node(nvl, "maxq");
3146 	if (value != NULL)
3147 		sc->max_queues = atoi(value);
3148 	value = get_config_value_node(nvl, "qsz");
3149 	if (value != NULL) {
3150 		sc->max_qentries = atoi(value);
3151 		if (sc->max_qentries <= 0) {
3152 			EPRINTLN("nvme: Invalid qsz option %d",
3153 			    sc->max_qentries);
3154 			return (-1);
3155 		}
3156 	}
3157 	value = get_config_value_node(nvl, "ioslots");
3158 	if (value != NULL) {
3159 		sc->ioslots = atoi(value);
3160 		if (sc->ioslots <= 0) {
3161 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3162 			return (-1);
3163 		}
3164 	}
3165 	value = get_config_value_node(nvl, "sectsz");
3166 	if (value != NULL)
3167 		sectsz = atoi(value);
3168 	value = get_config_value_node(nvl, "ser");
3169 	if (value != NULL) {
3170 		/*
3171 		 * This field indicates the Product Serial Number in
3172 		 * 7-bit ASCII; unused bytes should be space characters.
3173 		 * Ref: NVMe v1.3c.
3174 		 */
3175 		cpywithpad((char *)sc->ctrldata.sn,
3176 		    sizeof(sc->ctrldata.sn), value, ' ');
3177 	}
3178 	value = get_config_value_node(nvl, "eui64");
3179 	if (value != NULL)
3180 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3181 	value = get_config_value_node(nvl, "dsm");
3182 	if (value != NULL) {
3183 		if (strcmp(value, "auto") == 0)
3184 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3185 		else if (strcmp(value, "enable") == 0)
3186 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3187 		else if (strcmp(value, "disable") == 0)
3188 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3189 	}
3190 
3191 	value = get_config_value_node(nvl, "ram");
3192 	if (value != NULL) {
3193 		uint64_t sz = strtoull(value, NULL, 10);
3194 
3195 		sc->nvstore.type = NVME_STOR_RAM;
3196 		sc->nvstore.size = sz * 1024 * 1024;
3197 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3198 		sc->nvstore.sectsz = 4096;
3199 		sc->nvstore.sectsz_bits = 12;
3200 		if (sc->nvstore.ctx == NULL) {
3201 			EPRINTLN("nvme: Unable to allocate RAM");
3202 			return (-1);
3203 		}
3204 	} else {
3205 		snprintf(bident, sizeof(bident), "%d:%d",
3206 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3207 		sc->nvstore.ctx = blockif_open(nvl, bident);
3208 		if (sc->nvstore.ctx == NULL) {
3209 			EPRINTLN("nvme: Could not open backing file: %s",
3210 			    strerror(errno));
3211 			return (-1);
3212 		}
3213 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3214 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3215 	}
3216 
3217 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3218 		sc->nvstore.sectsz = sectsz;
3219 	else if (sc->nvstore.type != NVME_STOR_RAM)
3220 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3221 	for (sc->nvstore.sectsz_bits = 9;
3222 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3223 	     sc->nvstore.sectsz_bits++);
3224 
3225 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3226 		sc->max_queues = NVME_QUEUES;
3227 
3228 	return (0);
3229 }
3230 
3231 static void
3232 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3233     size_t new_size)
3234 {
3235 	struct pci_nvme_softc *sc;
3236 	struct pci_nvme_blockstore *nvstore;
3237 	struct nvme_namespace_data *nd;
3238 
3239 	sc = arg;
3240 	nvstore = &sc->nvstore;
3241 	nd = &sc->nsdata;
3242 
3243 	nvstore->size = new_size;
3244 	pci_nvme_init_nsdata_size(nvstore, nd);
3245 
3246 	/* Add changed NSID to list */
3247 	sc->ns_log.ns[0] = 1;
3248 	sc->ns_log.ns[1] = 0;
3249 
3250 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3251 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3252 }
3253 
3254 static int
3255 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3256 {
3257 	struct pci_nvme_softc *sc;
3258 	uint32_t pci_membar_sz;
3259 	int	error;
3260 
3261 	error = 0;
3262 
3263 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3264 	pi->pi_arg = sc;
3265 	sc->nsc_pi = pi;
3266 
3267 	error = pci_nvme_parse_config(sc, nvl);
3268 	if (error < 0)
3269 		goto done;
3270 	else
3271 		error = 0;
3272 
3273 	STAILQ_INIT(&sc->ioreqs_free);
3274 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3275 	for (int i = 0; i < sc->ioslots; i++) {
3276 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3277 	}
3278 
3279 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3280 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3281 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3282 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3283 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3284 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3285 
3286 	/*
3287 	 * Allocate size of NVMe registers + doorbell space for all queues.
3288 	 *
3289 	 * The specification requires a minimum memory I/O window size of 16K.
3290 	 * The Windows driver will refuse to start a device with a smaller
3291 	 * window.
3292 	 */
3293 	pci_membar_sz = sizeof(struct nvme_registers) +
3294 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3295 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3296 
3297 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3298 
3299 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3300 	if (error) {
3301 		WPRINTF("%s pci alloc mem bar failed", __func__);
3302 		goto done;
3303 	}
3304 
3305 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3306 	if (error) {
3307 		WPRINTF("%s pci add msixcap failed", __func__);
3308 		goto done;
3309 	}
3310 
3311 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3312 	if (error) {
3313 		WPRINTF("%s pci add Express capability failed", __func__);
3314 		goto done;
3315 	}
3316 
3317 	pthread_mutex_init(&sc->mtx, NULL);
3318 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3319 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3320 
3321 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3322 	/*
3323 	 * Controller data depends on Namespace data so initialize Namespace
3324 	 * data first.
3325 	 */
3326 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3327 	pci_nvme_init_ctrldata(sc);
3328 	pci_nvme_init_logpages(sc);
3329 	pci_nvme_init_features(sc);
3330 
3331 	pci_nvme_aer_init(sc);
3332 	pci_nvme_aen_init(sc);
3333 
3334 	pci_nvme_reset(sc);
3335 
3336 	pci_lintr_request(pi);
3337 
3338 done:
3339 	return (error);
3340 }
3341 
3342 static int
3343 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3344 {
3345 	char *cp, *ram;
3346 
3347 	if (opts == NULL)
3348 		return (0);
3349 
3350 	if (strncmp(opts, "ram=", 4) == 0) {
3351 		cp = strchr(opts, ',');
3352 		if (cp == NULL) {
3353 			set_config_value_node(nvl, "ram", opts + 4);
3354 			return (0);
3355 		}
3356 		ram = strndup(opts + 4, cp - opts - 4);
3357 		set_config_value_node(nvl, "ram", ram);
3358 		free(ram);
3359 		return (pci_parse_legacy_config(nvl, cp + 1));
3360 	} else
3361 		return (blockif_legacy_config(nvl, opts));
3362 }
3363 
3364 struct pci_devemu pci_de_nvme = {
3365 	.pe_emu =	"nvme",
3366 	.pe_init =	pci_nvme_init,
3367 	.pe_legacy_config = pci_nvme_legacy_config,
3368 	.pe_barwrite =	pci_nvme_write,
3369 	.pe_barread =	pci_nvme_read
3370 };
3371 PCI_EMUL_SET(pci_de_nvme);
3372