xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 1d9e8a9e60953b148a036b39d1fe7037fdbb40a3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
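/*
 * With the defaults above (MDTS = 9, MPSMIN = 0) the memory page size is
 * 4 KiB, the largest transfer is 2^9 pages = 2 MiB, and a request needs at
 * most 513 page descriptors (512 pages plus one for an unaligned start).
 */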
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQs and CQs for Set/Get Features */
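/*
 * The encoding matches the Number of Queues feature: the low 16 bits hold
 * the number of I/O Submission Queues and the high 16 bits the number of
 * I/O Completion Queues, both as zero-based counts.
 */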
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
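/*
 * For example, if BLOCKIF_IOV_MAX (from block_if.h) were 128, this would
 * reserve 513 - 128 = 385 spare iovec entries; if blockif already provides
 * at least NVME_MAX_IOVEC entries, no padding is needed.
 */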
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281 	PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283 
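/*
 * Notice events map into bits 15:8 of the Asynchronous Event Configuration
 * feature value, e.g. PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED (0) maps to bit 8.
 */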
284 #define PCI_NVME_AEI_NOTICE_SHIFT		8
285 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << ((event) + PCI_NVME_AEI_NOTICE_SHIFT))
286 
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289 	pci_nvme_async_type atype;
290 	uint32_t	event_data;
291 	bool		posted;
292 };
293 
294 /*
295  * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
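 *     (bits 4:0 are the five SMART / Health critical warning bits and
 *      bit 8 is the Namespace Attribute Notice, giving the mask 0x11f)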
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
300 
301 typedef enum {
302 	NVME_CNTRLTYPE_IO = 1,
303 	NVME_CNTRLTYPE_DISCOVERY = 2,
304 	NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306 
307 struct pci_nvme_softc {
308 	struct pci_devinst *nsc_pi;
309 
310 	pthread_mutex_t	mtx;
311 
312 	struct nvme_registers regs;
313 
314 	struct nvme_namespace_data  nsdata;
315 	struct nvme_controller_data ctrldata;
316 	struct nvme_error_information_entry err_log;
317 	struct nvme_health_information_page health_log;
318 	struct nvme_firmware_page fw_log;
319 	struct nvme_ns_list ns_log;
320 
321 	struct pci_nvme_blockstore nvstore;
322 
323 	uint16_t	max_qentries;	/* max entries per queue */
324 	uint32_t	max_queues;	/* max number of IO SQs or CQs */
325 	uint32_t	num_cqueues;
326 	uint32_t	num_squeues;
327 	bool		num_q_is_set; /* Has host set Number of Queues */
328 
329 	struct pci_nvme_ioreq *ioreqs;
330 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331 	uint32_t	pending_ios;
332 	uint32_t	ioslots;
333 	sem_t		iosemlock;
334 
335 	/*
336 	 * Memory mapped Submission and Completion queues
337 	 * Each array includes both Admin and IO queues
338 	 */
339 	struct nvme_completion_queue *compl_queues;
340 	struct nvme_submission_queue *submit_queues;
341 
342 	struct nvme_feature_obj feat[NVME_FID_MAX];
343 
344 	enum nvme_dsm_type dataset_management;
345 
346 	/* Accounting for SMART data */
347 	__uint128_t	read_data_units;
348 	__uint128_t	write_data_units;
349 	__uint128_t	read_commands;
350 	__uint128_t	write_commands;
351 	uint32_t	read_dunits_remainder;
352 	uint32_t	write_dunits_remainder;
353 
354 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
355 	pthread_mutex_t	aer_mtx;
356 	uint32_t	aer_count;
357 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358 	pthread_t	aen_tid;
359 	pthread_mutex_t	aen_mtx;
360 	pthread_cond_t	aen_cond;
361 };
362 
363 
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373 
374 /* Controller Configuration utils */
375 #define	NVME_CC_GET_EN(cc) \
376 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define	NVME_CC_GET_CSS(cc) \
378 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define	NVME_CC_GET_SHN(cc) \
380 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define	NVME_CC_GET_IOSQES(cc) \
382 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define	NVME_CC_GET_IOCQES(cc) \
384 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385 
386 #define	NVME_CC_WRITE_MASK \
387 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390 
391 #define	NVME_CC_NEN_WRITE_MASK \
392 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395 
396 /* Controller Status utils */
397 #define	NVME_CSTS_GET_RDY(sts) \
398 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399 
400 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
401 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
402 
403 /* Completion Queue status word utils */
404 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
405 #define	NVME_STATUS_MASK \
406 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
407 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
408 
409 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
410 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
411 
412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416 static void nvme_feature_temperature(struct pci_nvme_softc *,
417     struct nvme_feature_obj *,
418     struct nvme_command *,
419     struct nvme_completion *);
420 static void nvme_feature_num_queues(struct pci_nvme_softc *,
421     struct nvme_feature_obj *,
422     struct nvme_command *,
423     struct nvme_completion *);
424 static void nvme_feature_iv_config(struct pci_nvme_softc *,
425     struct nvme_feature_obj *,
426     struct nvme_command *,
427     struct nvme_completion *);
428 static void nvme_feature_async_event(struct pci_nvme_softc *,
429     struct nvme_feature_obj *,
430     struct nvme_command *,
431     struct nvme_completion *);
432 
433 static void *aen_thr(void *arg);
434 
435 static __inline void
436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
437 {
438 	size_t len;
439 
440 	len = strnlen(src, dst_size);
441 	memset(dst, pad, dst_size);
442 	memcpy(dst, src, len);
443 }
444 
445 static __inline void
446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
447 {
448 
449 	*status &= ~NVME_STATUS_MASK;
450 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
451 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
452 }
453 
454 static __inline void
455 pci_nvme_status_genc(uint16_t *status, uint16_t code)
456 {
457 
458 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
459 }
460 
461 /*
462  * Initialize the requested number of IO Submission and Completion Queues.
463  * Admin queues are allocated implicitly.
464  */
465 static void
466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
467 {
468 	uint32_t i;
469 
470 	/*
471 	 * Allocate and initialize the Submission Queues
472 	 */
473 	if (nsq > NVME_QUEUES) {
474 		WPRINTF("%s: clamping number of SQ from %u to %u",
475 					__func__, nsq, NVME_QUEUES);
476 		nsq = NVME_QUEUES;
477 	}
478 
479 	sc->num_squeues = nsq;
480 
481 	sc->submit_queues = calloc(sc->num_squeues + 1,
482 				sizeof(struct nvme_submission_queue));
483 	if (sc->submit_queues == NULL) {
484 		WPRINTF("%s: SQ allocation failed", __func__);
485 		sc->num_squeues = 0;
486 	} else {
487 		struct nvme_submission_queue *sq = sc->submit_queues;
488 
489 		for (i = 0; i < sc->num_squeues + 1; i++)
490 			pthread_mutex_init(&sq[i].mtx, NULL);
491 	}
492 
493 	/*
494 	 * Allocate and initialize the Completion Queues
495 	 */
496 	if (ncq > NVME_QUEUES) {
497 		WPRINTF("%s: clamping number of CQ from %u to %u",
498 					__func__, ncq, NVME_QUEUES);
499 		ncq = NVME_QUEUES;
500 	}
501 
502 	sc->num_cqueues = ncq;
503 
504 	sc->compl_queues = calloc(sc->num_cqueues + 1,
505 				sizeof(struct nvme_completion_queue));
506 	if (sc->compl_queues == NULL) {
507 		WPRINTF("%s: CQ allocation failed", __func__);
508 		sc->num_cqueues = 0;
509 	} else {
510 		struct nvme_completion_queue *cq = sc->compl_queues;
511 
512 		for (i = 0; i < sc->num_cqueues + 1; i++)
513 			pthread_mutex_init(&cq[i].mtx, NULL);
514 	}
515 }
516 
517 static void
518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
519 {
520 	struct nvme_controller_data *cd = &sc->ctrldata;
521 
522 	cd->vid = 0xFB5D;
523 	cd->ssvid = 0x0000;
524 
525 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
526 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
527 
528 	/* Recommended Arbitration Burst: number of commands fetched from an SQ at a time (2^rab) */
529 	cd->rab   = 4;
530 
531 	/* FreeBSD OUI */
532 	cd->ieee[0] = 0x58;
533 	cd->ieee[1] = 0x9c;
534 	cd->ieee[2] = 0xfc;
535 
536 	cd->mic = 0;
537 
538 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
539 
540 	cd->ver = NVME_REV(1,4);
541 
542 	cd->cntrltype = NVME_CNTRLTYPE_IO;
543 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
544 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
545 	cd->acl = 2;
546 	cd->aerl = 4;
547 
548 	/* Advertise 1, Read-only firmware slot */
549 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
550 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
551 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
552 	cd->elpe = 0;	/* max error log page entries */
553 	/*
554 	 * Report a single power state (zero-based value)
555 	 * power_state[] values are left as zero to indicate "Not reported"
556 	 */
557 	cd->npss = 0;
558 
559 	/* Warning Composite Temperature Threshold */
560 	cd->wctemp = 0x0157;
561 	cd->cctemp = 0x0157;
562 
563 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
564 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
565 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
566 
567 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
568 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
569 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
570 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
571 	cd->nn = 1;	/* number of namespaces */
572 
573 	cd->oncs = 0;
574 	switch (sc->dataset_management) {
575 	case NVME_DATASET_MANAGEMENT_AUTO:
576 		if (sc->nvstore.deallocate)
577 			cd->oncs |= NVME_ONCS_DSM;
578 		break;
579 	case NVME_DATASET_MANAGEMENT_ENABLE:
580 		cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	default:
583 		break;
584 	}
585 
586 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
587 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
588 
589 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
590 }
591 
592 /*
593  * Calculate the CRC-16 of the given buffer
594  * See copyright attribution at top of file
595  */
596 static uint16_t
597 crc16(uint16_t crc, const void *buffer, unsigned int len)
598 {
599 	const unsigned char *cp = buffer;
600 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
601 	static uint16_t const crc16_table[256] = {
602 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
603 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
604 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
605 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
606 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
607 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
608 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
609 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
610 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
611 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
612 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
613 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
614 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
615 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
616 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
617 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
618 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
619 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
620 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
621 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
622 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
623 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
624 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
625 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
626 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
627 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
628 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
629 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
630 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
631 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
632 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
633 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
634 	};
635 
636 	while (len--)
637 		crc = (((crc >> 8) & 0xffU) ^
638 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
639 	return crc;
640 }
641 
642 static void
643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
644     struct nvme_namespace_data *nd)
645 {
646 
647 	/* Get capacity and block size information from backing store */
648 	nd->nsze = nvstore->size / nvstore->sectsz;
649 	nd->ncap = nd->nsze;
650 	nd->nuse = nd->nsze;
651 }
652 
653 static void
654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
655     struct nvme_namespace_data *nd, uint32_t nsid,
656     struct pci_nvme_blockstore *nvstore)
657 {
658 
659 	pci_nvme_init_nsdata_size(nvstore, nd);
660 
661 	if (nvstore->type == NVME_STOR_BLOCKIF)
662 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
663 
664 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
665 	nd->flbas = 0;
666 
667 	/* Create an EUI-64 if user did not provide one */
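	/*
	 * The generated value combines the FreeBSD OUI, a CRC-16 of the VM
	 * name and PCI bus/slot/function, and the NSID in the low 16 bits.
	 */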
668 	if (nvstore->eui64 == 0) {
669 		char *data = NULL;
670 		uint64_t eui64 = nvstore->eui64;
671 
672 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
673 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
674 		    sc->nsc_pi->pi_func);
675 
676 		if (data != NULL) {
677 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
678 			free(data);
679 		}
680 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
681 	}
682 	be64enc(nd->eui64, nvstore->eui64);
683 
684 	/* LBA data-sz = 2^lbads */
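	/* e.g. 512-byte sectors give lbads = 9, 4 KiB sectors give lbads = 12 */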
685 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
686 }
687 
688 static void
689 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
690 {
691 	__uint128_t power_cycles = 1;
692 
693 	memset(&sc->err_log, 0, sizeof(sc->err_log));
694 	memset(&sc->health_log, 0, sizeof(sc->health_log));
695 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
696 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
697 
698 	/* Set read/write remainder to round up according to spec */
699 	sc->read_dunits_remainder = 999;
700 	sc->write_dunits_remainder = 999;
701 
702 	/* Set nominal Health values checked by implementations */
703 	sc->health_log.temperature = NVME_TEMPERATURE;
704 	sc->health_log.available_spare = 100;
705 	sc->health_log.available_spare_threshold = 10;
706 
707 	/* Set Active Firmware Info to slot 1 */
708 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
709 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
710 	    sizeof(sc->fw_log.revision[0]));
711 
712 	memcpy(&sc->health_log.power_cycles, &power_cycles,
713 	    sizeof(sc->health_log.power_cycles));
714 }
715 
716 static void
717 pci_nvme_init_features(struct pci_nvme_softc *sc)
718 {
719 	enum nvme_feature	fid;
720 
721 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
722 		switch (fid) {
723 		case NVME_FEAT_ARBITRATION:
724 		case NVME_FEAT_POWER_MANAGEMENT:
725 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
726 		case NVME_FEAT_WRITE_ATOMICITY:
727 			/* Mandatory but no special handling required */
728 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
729 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
730 		//		  this returns a data buffer
731 			break;
732 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
733 			sc->feat[fid].set = nvme_feature_temperature;
734 			break;
735 		case NVME_FEAT_ERROR_RECOVERY:
736 			sc->feat[fid].namespace_specific = true;
737 			break;
738 		case NVME_FEAT_NUMBER_OF_QUEUES:
739 			sc->feat[fid].set = nvme_feature_num_queues;
740 			break;
741 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
742 			sc->feat[fid].set = nvme_feature_iv_config;
743 			break;
744 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
745 			sc->feat[fid].set = nvme_feature_async_event;
746 			/* Enable all AENs by default */
747 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
748 			break;
749 		default:
750 			sc->feat[fid].set = nvme_feature_invalid_cb;
751 			sc->feat[fid].get = nvme_feature_invalid_cb;
752 		}
753 	}
754 }
755 
756 static void
757 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
758 {
759 
760 	STAILQ_INIT(&sc->aer_list);
761 	sc->aer_count = 0;
762 }
763 
764 static void
765 pci_nvme_aer_init(struct pci_nvme_softc *sc)
766 {
767 
768 	pthread_mutex_init(&sc->aer_mtx, NULL);
769 	pci_nvme_aer_reset(sc);
770 }
771 
772 static void
773 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
774 {
775 	struct pci_nvme_aer *aer = NULL;
776 
777 	pthread_mutex_lock(&sc->aer_mtx);
778 	while (!STAILQ_EMPTY(&sc->aer_list)) {
779 		aer = STAILQ_FIRST(&sc->aer_list);
780 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
781 		free(aer);
782 	}
783 	pthread_mutex_unlock(&sc->aer_mtx);
784 
785 	pci_nvme_aer_reset(sc);
786 }
787 
788 static bool
789 pci_nvme_aer_available(struct pci_nvme_softc *sc)
790 {
791 
792 	return (sc->aer_count != 0);
793 }
794 
795 static bool
796 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
797 {
798 	struct nvme_controller_data *cd = &sc->ctrldata;
799 
800 	/* AERL is a zero-based value while aer_count is one-based */
801 	return (sc->aer_count == (cd->aerl + 1));
802 }
803 
804 /*
805  * Add an Async Event Request
806  *
807  * Stores an AER to be returned later if the Controller needs to notify the
808  * host of an event.
809  * Note that while the NVMe spec doesn't require Controllers to return AERs
810  * in order, this implementation does preserve the order.
811  */
812 static int
813 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
814 {
815 	struct pci_nvme_aer *aer = NULL;
816 
817 	aer = calloc(1, sizeof(struct pci_nvme_aer));
818 	if (aer == NULL)
819 		return (-1);
820 
821 	/* Save the Command ID for use in the completion message */
822 	aer->cid = cid;
823 
824 	pthread_mutex_lock(&sc->aer_mtx);
825 	sc->aer_count++;
826 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
827 	pthread_mutex_unlock(&sc->aer_mtx);
828 
829 	return (0);
830 }
831 
832 /*
833  * Get an Async Event Request structure
834  *
835  * Returns a pointer to an AER previously submitted by the host or NULL if
836  * no AERs exist. The caller is responsible for freeing the returned struct.
837  */
838 static struct pci_nvme_aer *
839 pci_nvme_aer_get(struct pci_nvme_softc *sc)
840 {
841 	struct pci_nvme_aer *aer = NULL;
842 
843 	pthread_mutex_lock(&sc->aer_mtx);
844 	aer = STAILQ_FIRST(&sc->aer_list);
845 	if (aer != NULL) {
846 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
847 		sc->aer_count--;
848 	}
849 	pthread_mutex_unlock(&sc->aer_mtx);
850 
851 	return (aer);
852 }
853 
854 static void
855 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
856 {
857 	uint32_t	atype;
858 
859 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
860 
861 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
862 		sc->aen[atype].atype = atype;
863 	}
864 }
865 
866 static void
867 pci_nvme_aen_init(struct pci_nvme_softc *sc)
868 {
869 	char nstr[80];
870 
871 	pci_nvme_aen_reset(sc);
872 
873 	pthread_mutex_init(&sc->aen_mtx, NULL);
874 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
875 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
876 	    sc->nsc_pi->pi_func);
877 	pthread_set_name_np(sc->aen_tid, nstr);
878 }
879 
880 static void
881 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
882 {
883 
884 	pci_nvme_aen_reset(sc);
885 }
886 
887 /* Notify the AEN thread of pending work */
888 static void
889 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
890 {
891 
892 	pthread_cond_signal(&sc->aen_cond);
893 }
894 
895 /*
896  * Post an Asynchronous Event Notification
897  */
898 static int32_t
899 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
900 		uint32_t event_data)
901 {
902 	struct pci_nvme_aen *aen;
903 
904 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
905 		return(EINVAL);
906 	}
907 
908 	pthread_mutex_lock(&sc->aen_mtx);
909 	aen = &sc->aen[atype];
910 
911 	/* Has the controller already posted an event of this type? */
912 	if (aen->posted) {
913 		pthread_mutex_unlock(&sc->aen_mtx);
914 		return(EALREADY);
915 	}
916 
917 	aen->event_data = event_data;
918 	aen->posted = true;
919 	pthread_mutex_unlock(&sc->aen_mtx);
920 
921 	pci_nvme_aen_notify(sc);
922 
923 	return(0);
924 }
925 
926 static void
927 pci_nvme_aen_process(struct pci_nvme_softc *sc)
928 {
929 	struct pci_nvme_aer *aer;
930 	struct pci_nvme_aen *aen;
931 	pci_nvme_async_type atype;
932 	uint32_t mask;
933 	uint16_t status;
934 	uint8_t lid;
935 
936 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
937 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
938 		aen = &sc->aen[atype];
939 		/* Previous iterations may have depleted the available AERs */
940 		if (!pci_nvme_aer_available(sc)) {
941 			DPRINTF("%s: no AER", __func__);
942 			break;
943 		}
944 
945 		if (!aen->posted) {
946 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
947 			continue;
948 		}
949 
950 		status = NVME_SC_SUCCESS;
951 
952 		/* Is the event masked? */
953 		mask =
954 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
955 
956 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
957 		switch (atype) {
958 		case PCI_NVME_AE_TYPE_ERROR:
959 			lid = NVME_LOG_ERROR;
960 			break;
961 		case PCI_NVME_AE_TYPE_SMART:
962 			mask &= 0xff;
963 			if ((mask & aen->event_data) == 0)
964 				continue;
965 			lid = NVME_LOG_HEALTH_INFORMATION;
966 			break;
967 		case PCI_NVME_AE_TYPE_NOTICE:
968 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
969 				EPRINTLN("%s unknown AEN notice type %u",
970 				    __func__, aen->event_data);
971 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
972 				break;
973 			}
974 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
975 				continue;
976 			switch (aen->event_data) {
977 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
978 				lid = NVME_LOG_CHANGED_NAMESPACE;
979 				break;
980 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
981 				lid = NVME_LOG_FIRMWARE_SLOT;
982 				break;
983 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
984 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
985 				break;
986 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
987 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
988 				break;
989 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
990 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
991 				break;
992 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
993 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
994 				break;
995 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
996 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
997 				break;
998 			default:
999 				lid = 0;
1000 			}
1001 			break;
1002 		default:
1003 			/* bad type?!? */
1004 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1005 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1006 			break;
1007 		}
1008 
1009 		aer = pci_nvme_aer_get(sc);
1010 		assert(aer != NULL);
1011 
1012 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
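		/*
		 * Completion dword 0 for an AER uses the Asynchronous Event
		 * layout: bits 2:0 event type, bits 15:8 event information,
		 * bits 23:16 associated log page identifier.
		 */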
1013 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1014 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1015 		    aer->cid,
1016 		    0,		/* SQID */
1017 		    status);
1018 
1019 		aen->event_data = 0;
1020 		aen->posted = false;
1021 
1022 		pci_generate_msix(sc->nsc_pi, 0);
1023 	}
1024 }
1025 
1026 static void *
1027 aen_thr(void *arg)
1028 {
1029 	struct pci_nvme_softc *sc;
1030 
1031 	sc = arg;
1032 
1033 	pthread_mutex_lock(&sc->aen_mtx);
1034 	for (;;) {
1035 		pci_nvme_aen_process(sc);
1036 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1037 	}
1038 	pthread_mutex_unlock(&sc->aen_mtx);
1039 
1040 	pthread_exit(NULL);
1041 	return (NULL);
1042 }
1043 
1044 static void
1045 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1046 {
1047 	uint32_t i;
1048 
1049 	DPRINTF("%s", __func__);
1050 
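	/*
	 * CAP advertises MQES = max_qentries - 1, CQR = 1 (queues must be
	 * physically contiguous), TO = 60 (reported in 500 ms units, i.e. a
	 * 30 second timeout), and NVM command set support via CSS.
	 */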
1051 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1052 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1053 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1054 
1055 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1056 
1057 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1058 
1059 	sc->regs.cc = 0;
1060 
1061 	assert(sc->submit_queues != NULL);
1062 
1063 	for (i = 0; i < sc->num_squeues + 1; i++) {
1064 		sc->submit_queues[i].qbase = NULL;
1065 		sc->submit_queues[i].size = 0;
1066 		sc->submit_queues[i].cqid = 0;
1067 		sc->submit_queues[i].tail = 0;
1068 		sc->submit_queues[i].head = 0;
1069 	}
1070 
1071 	assert(sc->compl_queues != NULL);
1072 
1073 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1074 		sc->compl_queues[i].qbase = NULL;
1075 		sc->compl_queues[i].size = 0;
1076 		sc->compl_queues[i].tail = 0;
1077 		sc->compl_queues[i].head = 0;
1078 	}
1079 
1080 	sc->num_q_is_set = false;
1081 
1082 	pci_nvme_aer_destroy(sc);
1083 	pci_nvme_aen_destroy(sc);
1084 
1085 	/*
1086 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1087 	 * before cleanup completes
1088 	 */
1089 	sc->regs.csts = 0;
1090 }
1091 
1092 static void
1093 pci_nvme_reset(struct pci_nvme_softc *sc)
1094 {
1095 	pthread_mutex_lock(&sc->mtx);
1096 	pci_nvme_reset_locked(sc);
1097 	pthread_mutex_unlock(&sc->mtx);
1098 }
1099 
1100 static int
1101 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1102 {
1103 	uint16_t acqs, asqs;
1104 
1105 	DPRINTF("%s", __func__);
1106 
1107 	/*
1108 	 * NVMe 2.0 states that "enabling a controller while this field is
1109 	 * cleared to 0h produces undefined results" for both ACQS and
1110 	 * ASQS. If zero, set CFS and do not become ready.
1111 	 */
1112 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1113 	if (asqs < 2) {
1114 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1115 		    asqs - 1, sc->regs.aqa);
1116 		sc->regs.csts |= NVME_CSTS_CFS;
1117 		return (-1);
1118 	}
1119 	sc->submit_queues[0].size = asqs;
1120 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1121 	            sizeof(struct nvme_command) * asqs);
1122 	if (sc->submit_queues[0].qbase == NULL) {
1123 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1124 		    sc->regs.asq);
1125 		sc->regs.csts |= NVME_CSTS_CFS;
1126 		return (-1);
1127 	}
1128 
1129 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1130 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1131 
1132 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1133 	    NVME_AQA_REG_ACQS_MASK);
1134 	if (acqs < 2) {
1135 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1136 		    acqs - 1, sc->regs.aqa);
1137 		sc->regs.csts |= NVME_CSTS_CFS;
1138 		return (-1);
1139 	}
1140 	sc->compl_queues[0].size = acqs;
1141 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1142 	         sizeof(struct nvme_completion) * acqs);
1143 	if (sc->compl_queues[0].qbase == NULL) {
1144 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1145 		    sc->regs.acq);
1146 		sc->regs.csts |= NVME_CSTS_CFS;
1147 		return (-1);
1148 	}
1149 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1150 
1151 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1152 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1153 
1154 	return (0);
1155 }
1156 
1157 static int
1158 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1159 	size_t len, enum nvme_copy_dir dir)
1160 {
1161 	uint8_t *p;
1162 	size_t bytes;
1163 
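	/*
	 * Only transfers that fit in at most two pages are supported here:
	 * prp2 is treated as a plain page pointer, never as a PRP list, so
	 * cap the length at 8 KiB.
	 */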
1164 	if (len > (8 * 1024)) {
1165 		return (-1);
1166 	}
1167 
1168 	/* Copy from the start of prp1 to the end of the physical page */
1169 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1170 	bytes = MIN(bytes, len);
1171 
1172 	p = vm_map_gpa(ctx, prp1, bytes);
1173 	if (p == NULL) {
1174 		return (-1);
1175 	}
1176 
1177 	if (dir == NVME_COPY_TO_PRP)
1178 		memcpy(p, b, bytes);
1179 	else
1180 		memcpy(b, p, bytes);
1181 
1182 	b += bytes;
1183 
1184 	len -= bytes;
1185 	if (len == 0) {
1186 		return (0);
1187 	}
1188 
1189 	len = MIN(len, PAGE_SIZE);
1190 
1191 	p = vm_map_gpa(ctx, prp2, len);
1192 	if (p == NULL) {
1193 		return (-1);
1194 	}
1195 
1196 	if (dir == NVME_COPY_TO_PRP)
1197 		memcpy(p, b, len);
1198 	else
1199 		memcpy(b, p, len);
1200 
1201 	return (0);
1202 }
1203 
1204 /*
1205  * Write a Completion Queue Entry update
1206  *
1207  * Write the completion entry and advance the Completion Queue tail
1208  */
1209 static void
1210 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1211 		struct nvme_completion_queue *cq,
1212 		uint32_t cdw0,
1213 		uint16_t cid,
1214 		uint16_t sqid,
1215 		uint16_t status)
1216 {
1217 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1218 	struct nvme_completion *cqe;
1219 
1220 	assert(cq->qbase != NULL);
1221 
1222 	pthread_mutex_lock(&cq->mtx);
1223 
1224 	cqe = &cq->qbase[cq->tail];
1225 
1226 	/* Flip the phase bit */
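	/*
	 * The Phase Tag from the stale entry is inverted so that it toggles
	 * on each pass through the queue, letting the host distinguish new
	 * completions from ones it has already consumed.
	 */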
1227 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1228 
1229 	cqe->cdw0 = cdw0;
1230 	cqe->sqhd = sq->head;
1231 	cqe->sqid = sqid;
1232 	cqe->cid = cid;
1233 	cqe->status = status;
1234 
1235 	cq->tail++;
1236 	if (cq->tail >= cq->size) {
1237 		cq->tail = 0;
1238 	}
1239 
1240 	pthread_mutex_unlock(&cq->mtx);
1241 }
1242 
1243 static int
1244 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1245 	struct nvme_completion* compl)
1246 {
1247 	uint16_t qid = command->cdw10 & 0xffff;
1248 
1249 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1250 	if (qid == 0 || qid > sc->num_squeues ||
1251 	    (sc->submit_queues[qid].qbase == NULL)) {
1252 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1253 		        __func__, qid, sc->num_squeues);
1254 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1255 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1256 		return (1);
1257 	}
1258 
1259 	sc->submit_queues[qid].qbase = NULL;
1260 	sc->submit_queues[qid].cqid = 0;
1261 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1262 	return (1);
1263 }
1264 
1265 static int
1266 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1267 	struct nvme_completion* compl)
1268 {
1269 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1270 		uint16_t qid = command->cdw10 & 0xffff;
1271 		struct nvme_submission_queue *nsq;
1272 
1273 		if ((qid == 0) || (qid > sc->num_squeues) ||
1274 		    (sc->submit_queues[qid].qbase != NULL)) {
1275 			WPRINTF("%s queue index %u > num_squeues %u",
1276 			        __func__, qid, sc->num_squeues);
1277 			pci_nvme_status_tc(&compl->status,
1278 			    NVME_SCT_COMMAND_SPECIFIC,
1279 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1280 			return (1);
1281 		}
1282 
1283 		nsq = &sc->submit_queues[qid];
1284 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1285 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1286 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1287 			/*
1288 			 * Queues must specify at least two entries
1289 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1290 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1291 			 */
1292 			pci_nvme_status_tc(&compl->status,
1293 			    NVME_SCT_COMMAND_SPECIFIC,
1294 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1295 			return (1);
1296 		}
1297 		nsq->head = nsq->tail = 0;
1298 
1299 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1300 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1301 			pci_nvme_status_tc(&compl->status,
1302 			    NVME_SCT_COMMAND_SPECIFIC,
1303 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1304 			return (1);
1305 		}
1306 
1307 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1308 			pci_nvme_status_tc(&compl->status,
1309 			    NVME_SCT_COMMAND_SPECIFIC,
1310 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1311 			return (1);
1312 		}
1313 
1314 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1315 
1316 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1317 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1318 
1319 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1320 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1321 
1322 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1323 
1324 		DPRINTF("%s completed creating IOSQ qid %u",
1325 		         __func__, qid);
1326 	} else {
1327 		/*
1328 		 * The guest requested a non-contiguous (PRP list based)
1329 		 * Submission Queue, which this emulation does not support.
1330 		 */
1331 		WPRINTF("%s unsupported non-contig (list-based) "
1332 		         "create i/o submission queue", __func__);
1333 
1334 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1335 	}
1336 	return (1);
1337 }
1338 
1339 static int
1340 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1341 	struct nvme_completion* compl)
1342 {
1343 	uint16_t qid = command->cdw10 & 0xffff;
1344 	uint16_t sqid;
1345 
1346 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1347 	if (qid == 0 || qid > sc->num_cqueues ||
1348 	    (sc->compl_queues[qid].qbase == NULL)) {
1349 		WPRINTF("%s queue index %u / num_cqueues %u",
1350 		        __func__, qid, sc->num_cqueues);
1351 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1352 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1353 		return (1);
1354 	}
1355 
1356 	/* Deleting an Active CQ is an error */
1357 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1358 		if (sc->submit_queues[sqid].cqid == qid) {
1359 			pci_nvme_status_tc(&compl->status,
1360 			    NVME_SCT_COMMAND_SPECIFIC,
1361 			    NVME_SC_INVALID_QUEUE_DELETION);
1362 			return (1);
1363 		}
1364 
1365 	sc->compl_queues[qid].qbase = NULL;
1366 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1367 	return (1);
1368 }
1369 
1370 static int
1371 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1372 	struct nvme_completion* compl)
1373 {
1374 	struct nvme_completion_queue *ncq;
1375 	uint16_t qid = command->cdw10 & 0xffff;
1376 
1377 	/* Only support Physically Contiguous queues */
1378 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1379 		WPRINTF("%s unsupported non-contig (list-based) "
1380 		         "create i/o completion queue",
1381 		         __func__);
1382 
1383 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1384 		return (1);
1385 	}
1386 
1387 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1388 	    (sc->compl_queues[qid].qbase != NULL)) {
1389 		WPRINTF("%s queue index %u > num_cqueues %u",
1390 			__func__, qid, sc->num_cqueues);
1391 		pci_nvme_status_tc(&compl->status,
1392 		    NVME_SCT_COMMAND_SPECIFIC,
1393 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1394 		return (1);
1395  	}
1396 
1397 	ncq = &sc->compl_queues[qid];
1398 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1399 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1400 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1401 		pci_nvme_status_tc(&compl->status,
1402 		    NVME_SCT_COMMAND_SPECIFIC,
1403 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1404 		return (1);
1405 	}
1406 
1407 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1408 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1409 		/*
1410 		 * Queues must specify at least two entries
1411 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1412 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1413 		 */
1414 		pci_nvme_status_tc(&compl->status,
1415 		    NVME_SCT_COMMAND_SPECIFIC,
1416 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1417 		return (1);
1418 	}
1419 	ncq->head = ncq->tail = 0;
1420 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1421 		     command->prp1,
1422 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1423 		     sizeof(struct nvme_completion) * (size_t)ncq->size);
1424 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1425 
1426 
1427 	return (1);
1428 }
1429 
1430 static int
1431 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1432 	struct nvme_completion* compl)
1433 {
1434 	uint64_t logoff;
1435 	uint32_t logsize;
1436 	uint8_t logpage;
1437 
1438 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1439 
1440 	/*
1441 	 * Command specifies the number of dwords to return in fields NUMDU
1442 	 * and NUMDL. This is a zero-based value.
1443 	 */
1444 	logpage = command->cdw10 & 0xFF;
1445 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1446 	logsize *= sizeof(uint32_t);
1447 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
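	/* The offset into the log page comes from LPOL (cdw12) and LPOU (cdw13) */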
1448 
1449 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1450 
1451 	switch (logpage) {
1452 	case NVME_LOG_ERROR:
1453 		if (logoff >= sizeof(sc->err_log)) {
1454 			pci_nvme_status_genc(&compl->status,
1455 			    NVME_SC_INVALID_FIELD);
1456 			break;
1457 		}
1458 
1459 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1460 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1461 		    MIN(logsize, sizeof(sc->err_log) - logoff),
1462 		    NVME_COPY_TO_PRP);
1463 		break;
1464 	case NVME_LOG_HEALTH_INFORMATION:
1465 		if (logoff >= sizeof(sc->health_log)) {
1466 			pci_nvme_status_genc(&compl->status,
1467 			    NVME_SC_INVALID_FIELD);
1468 			break;
1469 		}
1470 
1471 		pthread_mutex_lock(&sc->mtx);
1472 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1473 		    sizeof(sc->health_log.data_units_read));
1474 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1475 		    sizeof(sc->health_log.data_units_written));
1476 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1477 		    sizeof(sc->health_log.host_read_commands));
1478 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1479 		    sizeof(sc->health_log.host_write_commands));
1480 		pthread_mutex_unlock(&sc->mtx);
1481 
1482 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1483 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1484 		    MIN(logsize, sizeof(sc->health_log) - logoff),
1485 		    NVME_COPY_TO_PRP);
1486 		break;
1487 	case NVME_LOG_FIRMWARE_SLOT:
1488 		if (logoff >= sizeof(sc->fw_log)) {
1489 			pci_nvme_status_genc(&compl->status,
1490 			    NVME_SC_INVALID_FIELD);
1491 			break;
1492 		}
1493 
1494 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1495 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1496 		    MIN(logsize, sizeof(sc->fw_log) - logoff),
1497 		    NVME_COPY_TO_PRP);
1498 		break;
1499 	case NVME_LOG_CHANGED_NAMESPACE:
1500 		if (logoff >= sizeof(sc->ns_log)) {
1501 			pci_nvme_status_genc(&compl->status,
1502 			    NVME_SC_INVALID_FIELD);
1503 			break;
1504 		}
1505 
1506 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1507 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1508 		    MIN(logsize, sizeof(sc->ns_log) - logoff),
1509 		    NVME_COPY_TO_PRP);
1510 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1511 		break;
1512 	default:
1513 		DPRINTF("%s get log page %x command not supported",
1514 		        __func__, logpage);
1515 
1516 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1517 		    NVME_SC_INVALID_LOG_PAGE);
1518 	}
1519 
1520 	return (1);
1521 }
1522 
1523 static int
1524 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1525 	struct nvme_completion* compl)
1526 {
1527 	void *dest;
1528 	uint16_t status;
1529 
1530 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1531 	        command->cdw10 & 0xFF, command->nsid);
1532 
1533 	status = 0;
1534 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1535 
1536 	switch (command->cdw10 & 0xFF) {
1537 	case 0x00: /* return Identify Namespace data structure */
1538 		/* Global NS only valid with NS Management */
1539 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1540 			pci_nvme_status_genc(&status,
1541 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1542 			break;
1543 		}
1544 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1545 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1546 		    NVME_COPY_TO_PRP);
1547 		break;
1548 	case 0x01: /* return Identify Controller data structure */
1549 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1550 		    command->prp2, (uint8_t *)&sc->ctrldata,
1551 		    sizeof(sc->ctrldata),
1552 		    NVME_COPY_TO_PRP);
1553 		break;
1554 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1555 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1556 		                  sizeof(uint32_t) * 1024);
1557 		/* All unused entries shall be zero */
1558 		memset(dest, 0, sizeof(uint32_t) * 1024);
1559 		((uint32_t *)dest)[0] = 1;
1560 		break;
1561 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1562 		if (command->nsid != 1) {
1563 			pci_nvme_status_genc(&status,
1564 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1565 			break;
1566 		}
1567 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1568 		                  sizeof(uint32_t) * 1024);
1569 		/* All bytes after the descriptor shall be zero */
1570 		memset(dest, 0, sizeof(uint32_t) * 1024);
1571 
1572 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1573 		((uint8_t *)dest)[0] = 1;
1574 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1575 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1576 		break;
1577 	case 0x13:
1578 		/*
1579 		 * Controller list is optional but used by UNH tests. Return
1580 		 * a valid but empty list.
1581 		 */
1582 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1583 		                  sizeof(uint16_t) * 2048);
1584 		memset(dest, 0, sizeof(uint16_t) * 2048);
1585 		break;
1586 	default:
1587 		DPRINTF("%s unsupported identify command requested 0x%x",
1588 		         __func__, command->cdw10 & 0xFF);
1589 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1590 		break;
1591 	}
1592 
1593 	compl->status = status;
1594 	return (1);
1595 }
1596 
1597 static const char *
1598 nvme_fid_to_name(uint8_t fid)
1599 {
1600 	const char *name;
1601 
1602 	switch (fid) {
1603 	case NVME_FEAT_ARBITRATION:
1604 		name = "Arbitration";
1605 		break;
1606 	case NVME_FEAT_POWER_MANAGEMENT:
1607 		name = "Power Management";
1608 		break;
1609 	case NVME_FEAT_LBA_RANGE_TYPE:
1610 		name = "LBA Range Type";
1611 		break;
1612 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1613 		name = "Temperature Threshold";
1614 		break;
1615 	case NVME_FEAT_ERROR_RECOVERY:
1616 		name = "Error Recovery";
1617 		break;
1618 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1619 		name = "Volatile Write Cache";
1620 		break;
1621 	case NVME_FEAT_NUMBER_OF_QUEUES:
1622 		name = "Number of Queues";
1623 		break;
1624 	case NVME_FEAT_INTERRUPT_COALESCING:
1625 		name = "Interrupt Coalescing";
1626 		break;
1627 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1628 		name = "Interrupt Vector Configuration";
1629 		break;
1630 	case NVME_FEAT_WRITE_ATOMICITY:
1631 		name = "Write Atomicity Normal";
1632 		break;
1633 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1634 		name = "Asynchronous Event Configuration";
1635 		break;
1636 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1637 		name = "Autonomous Power State Transition";
1638 		break;
1639 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1640 		name = "Host Memory Buffer";
1641 		break;
1642 	case NVME_FEAT_TIMESTAMP:
1643 		name = "Timestamp";
1644 		break;
1645 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1646 		name = "Keep Alive Timer";
1647 		break;
1648 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1649 		name = "Host Controlled Thermal Management";
1650 		break;
1651 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1652 		name = "Non-Operational Power State Config";
1653 		break;
1654 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1655 		name = "Read Recovery Level Config";
1656 		break;
1657 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1658 		name = "Predictable Latency Mode Config";
1659 		break;
1660 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1661 		name = "Predictable Latency Mode Window";
1662 		break;
1663 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1664 		name = "LBA Status Information Report Interval";
1665 		break;
1666 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1667 		name = "Host Behavior Support";
1668 		break;
1669 	case NVME_FEAT_SANITIZE_CONFIG:
1670 		name = "Sanitize Config";
1671 		break;
1672 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1673 		name = "Endurance Group Event Configuration";
1674 		break;
1675 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1676 		name = "Software Progress Marker";
1677 		break;
1678 	case NVME_FEAT_HOST_IDENTIFIER:
1679 		name = "Host Identifier";
1680 		break;
1681 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1682 		name = "Reservation Notification Mask";
1683 		break;
1684 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1685 		name = "Reservation Persistence";
1686 		break;
1687 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1688 		name = "Namespace Write Protection Config";
1689 		break;
1690 	default:
1691 		name = "Unknown";
1692 		break;
1693 	}
1694 
1695 	return (name);
1696 }
1697 
1698 static void
1699 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1700     struct nvme_feature_obj *feat __unused,
1701     struct nvme_command *command __unused,
1702     struct nvme_completion *compl)
1703 {
1704 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1705 }
1706 
1707 static void
1708 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1709     struct nvme_feature_obj *feat __unused,
1710     struct nvme_command *command,
1711     struct nvme_completion *compl)
1712 {
1713 	uint32_t i;
1714 	uint32_t cdw11 = command->cdw11;
1715 	uint16_t iv;
1716 	bool cd;
1717 
1718 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1719 
1720 	iv = cdw11 & 0xffff;
1721 	cd = cdw11 & (1 << 16);
1722 
1723 	if (iv > (sc->max_queues + 1)) {
1724 		return;
1725 	}
1726 
1727 	/* IV 0 (the Admin queue's vector) does not support Interrupt Coalescing, so CD must be set for it */
1728 	if ((iv == 0) && !cd)
1729 		return;
1730 
1731 	/* Requested Interrupt Vector must be used by a CQ */
1732 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1733 		if (sc->compl_queues[i].intr_vec == iv) {
1734 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1735 		}
1736 	}
1737 }
1738 
1739 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1740 static void
1741 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1742     struct nvme_feature_obj *feat __unused,
1743     struct nvme_command *command,
1744     struct nvme_completion *compl)
1745 {
1746 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1747 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1748 }
1749 
1750 #define NVME_TEMP_THRESH_OVER	0
1751 #define NVME_TEMP_THRESH_UNDER	1
1752 static void
1753 nvme_feature_temperature(struct pci_nvme_softc *sc,
1754     struct nvme_feature_obj *feat __unused,
1755     struct nvme_command *command,
1756     struct nvme_completion *compl)
1757 {
1758 	uint16_t	tmpth;	/* Temperature Threshold */
1759 	uint8_t		tmpsel; /* Threshold Temperature Select */
1760 	uint8_t		thsel;  /* Threshold Type Select */
1761 	bool		set_crit = false;
1762 	bool		report_crit;
1763 
1764 	tmpth  = command->cdw11 & 0xffff;
1765 	tmpsel = (command->cdw11 >> 16) & 0xf;
1766 	thsel  = (command->cdw11 >> 20) & 0x3;
1767 
1768 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1769 
1770 	/* Check for unsupported values */
1771 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1772 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1773 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1774 		return;
1775 	}
1776 
1777 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1778 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1779 		set_crit = true;
1780 
1781 	pthread_mutex_lock(&sc->mtx);
1782 	if (set_crit)
1783 		sc->health_log.critical_warning |=
1784 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1785 	else
1786 		sc->health_log.critical_warning &=
1787 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1788 	pthread_mutex_unlock(&sc->mtx);
1789 
1790 	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
1791 	    NVME_CRIT_WARN_ST_TEMPERATURE;
1792 
1793 	if (set_crit && report_crit)
1794 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1795 		    sc->health_log.critical_warning);
1796 
1797 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1798 }
1799 
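/*
 * Number of Queues: CDW11 bits 15:0 are the Number of I/O Submission
 * Queues Requested (NSQR) and bits 31:16 the Number of I/O Completion
 * Queues Requested (NCQR), both zero-based; 0xffff is illegal. The
 * feature may only be set once between controller resets.
 */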
1800 static void
1801 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1802     struct nvme_feature_obj *feat __unused,
1803     struct nvme_command *command,
1804     struct nvme_completion *compl)
1805 {
1806 	uint16_t nqr;	/* Number of Queues Requested */
1807 
1808 	if (sc->num_q_is_set) {
1809 		WPRINTF("%s: Number of Queues already set", __func__);
1810 		pci_nvme_status_genc(&compl->status,
1811 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1812 		return;
1813 	}
1814 
1815 	nqr = command->cdw11 & 0xFFFF;
1816 	if (nqr == 0xffff) {
1817 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1818 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1819 		return;
1820 	}
1821 
1822 	sc->num_squeues = ONE_BASED(nqr);
1823 	if (sc->num_squeues > sc->max_queues) {
1824 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1825 					sc->max_queues);
1826 		sc->num_squeues = sc->max_queues;
1827 	}
1828 
1829 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1830 	if (nqr == 0xffff) {
1831 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1832 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1833 		return;
1834 	}
1835 
1836 	sc->num_cqueues = ONE_BASED(nqr);
1837 	if (sc->num_cqueues > sc->max_queues) {
1838 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1839 					sc->max_queues);
1840 		sc->num_cqueues = sc->max_queues;
1841 	}
1842 
1843 	/* Patch the command value which will be saved on callback's return */
1844 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1845 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1846 
1847 	sc->num_q_is_set = true;
1848 }
1849 
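/*
 * Set Features: validate the Feature ID and namespace scope, reject the
 * Save bit (no feature is saveable), and dispatch to the per-feature set
 * callback. On success the new CDW11 value is cached in sc->feat[].
 */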
1850 static int
1851 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1852 	struct nvme_completion *compl)
1853 {
1854 	struct nvme_feature_obj *feat;
1855 	uint32_t nsid = command->nsid;
1856 	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
1857 	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
1858 
1859 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1860 
1861 	if (fid >= NVME_FID_MAX) {
1862 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1863 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1864 		return (1);
1865 	}
1866 
1867 	if (sv) {
1868 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1869 		    NVME_SC_FEATURE_NOT_SAVEABLE);
1870 		return (1);
1871 	}
1872 
1873 	feat = &sc->feat[fid];
1874 
1875 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1876 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1877 		return (1);
1878 	}
1879 
1880 	if (!feat->namespace_specific &&
1881 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1882 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1883 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1884 		return (1);
1885 	}
1886 
1887 	compl->cdw0 = 0;
1888 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1889 
1890 	if (feat->set)
1891 		feat->set(sc, feat, command, compl);
1892 	else {
1893 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1894 		    NVME_SC_FEATURE_NOT_CHANGEABLE);
1895 		return (1);
1896 	}
1897 
1898 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1899 	if (compl->status == NVME_SC_SUCCESS) {
1900 		feat->cdw11 = command->cdw11;
1901 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1902 		    (command->cdw11 != 0))
1903 			pci_nvme_aen_notify(sc);
1904 	}
1905 
1906 	return (0);
1907 }
1908 
1909 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1910 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1911 
1912 static int
1913 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1914 	struct nvme_completion* compl)
1915 {
1916 	struct nvme_feature_obj *feat;
1917 	uint8_t fid = command->cdw10 & 0xFF;
1918 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1919 
1920 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1921 
1922 	if (fid >= NVME_FID_MAX) {
1923 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1924 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1925 		return (1);
1926 	}
1927 
1928 	compl->cdw0 = 0;
1929 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1930 
1931 	feat = &sc->feat[fid];
1932 	if (feat->get) {
1933 		feat->get(sc, feat, command, compl);
1934 	}
1935 
1936 	if (compl->status == NVME_SC_SUCCESS) {
1937 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1938 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1939 		else
1940 			compl->cdw0 = feat->cdw11;
1941 	}
1942 
1943 	return (0);
1944 }
1945 
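/*
 * Format NVM: as decoded below, CDW10 bits 3:0 select the LBA Format
 * (LBAF), bits 7:5 the Protection Information (PI) setting, and bits 11:9
 * the Secure Erase Settings (SES). Only LBAF 0, PI disabled, and SES of
 * none (0) or User Data Erase (1) are accepted.
 */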
1946 static int
1947 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1948 	struct nvme_completion* compl)
1949 {
1950 	uint8_t	ses, lbaf, pi;
1951 
1952 	/* Only supports Secure Erase Setting - User Data Erase */
1953 	ses = (command->cdw10 >> 9) & 0x7;
1954 	if (ses > 0x1) {
1955 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1956 		return (1);
1957 	}
1958 
1959 	/* Only supports a single LBA Format */
1960 	lbaf = command->cdw10 & 0xf;
1961 	if (lbaf != 0) {
1962 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1963 		    NVME_SC_INVALID_FORMAT);
1964 		return (1);
1965 	}
1966 
1967 	/* Doesn't support Protection Information */
1968 	pi = (command->cdw10 >> 5) & 0x7;
1969 	if (pi != 0) {
1970 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1971 		return (1);
1972 	}
1973 
1974 	if (sc->nvstore.type == NVME_STOR_RAM) {
1975 		if (sc->nvstore.ctx)
1976 			free(sc->nvstore.ctx);
1977 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1978 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1979 	} else {
1980 		struct pci_nvme_ioreq *req;
1981 		int err;
1982 
1983 		req = pci_nvme_get_ioreq(sc);
1984 		if (req == NULL) {
1985 			pci_nvme_status_genc(&compl->status,
1986 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1987 			WPRINTF("%s: unable to allocate IO req", __func__);
1988 			return (1);
1989 		}
1990 		req->nvme_sq = &sc->submit_queues[0];
1991 		req->sqid = 0;
1992 		req->opc = command->opc;
1993 		req->cid = command->cid;
1994 		req->nsid = command->nsid;
1995 
1996 		req->io_req.br_offset = 0;
1997 		req->io_req.br_resid = sc->nvstore.size;
1998 		req->io_req.br_callback = pci_nvme_io_done;
1999 
2000 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2001 		if (err) {
2002 			pci_nvme_status_genc(&compl->status,
2003 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2004 			pci_nvme_release_ioreq(sc, req);
2005 		} else
2006 			compl->status = NVME_NO_STATUS;
2007 	}
2008 
2009 	return (1);
2010 }
2011 
2012 static int
2013 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2014     struct nvme_completion *compl)
2015 {
2016 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2017 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2018 
2019 	/* TODO: search for the command ID and abort it */
2020 
2021 	compl->cdw0 = 1;
2022 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2023 	return (1);
2024 }
2025 
2026 static int
2027 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2028 	struct nvme_command* command, struct nvme_completion* compl)
2029 {
2030 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2031 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2032 
2033 	/* Don't exceed the Async Event Request Limit (AERL). */
2034 	if (pci_nvme_aer_limit_reached(sc)) {
2035 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2036 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2037 		return (1);
2038 	}
2039 
2040 	if (pci_nvme_aer_add(sc, command->cid)) {
2041 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2042 				NVME_SC_INTERNAL_DEVICE_ERROR);
2043 		return (1);
2044 	}
2045 
2046 	/*
2047 	 * Raise events as they occur, based on the Async Event Configuration
2048 	 * set via Set Features. Events are asynchronous, so do not complete
2049 	 * this command now; it completes when a matching event is posted.
2050 	 */
2051 	compl->status = NVME_NO_STATUS;
2052 	pci_nvme_aen_notify(sc);
2053 
2054 	return (0);
2055 }
2056 
2057 static void
2058 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2059 {
2060 	struct nvme_completion compl;
2061 	struct nvme_command *cmd;
2062 	struct nvme_submission_queue *sq;
2063 	struct nvme_completion_queue *cq;
2064 	uint16_t sqhead;
2065 
2066 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2067 
2068 	sq = &sc->submit_queues[0];
2069 	cq = &sc->compl_queues[0];
2070 
2071 	pthread_mutex_lock(&sq->mtx);
2072 
2073 	sqhead = sq->head;
2074 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2075 
2076 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2077 		cmd = &(sq->qbase)[sqhead];
2078 		compl.cdw0 = 0;
2079 		compl.status = 0;
2080 
2081 		switch (cmd->opc) {
2082 		case NVME_OPC_DELETE_IO_SQ:
2083 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2084 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2085 			break;
2086 		case NVME_OPC_CREATE_IO_SQ:
2087 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2088 			nvme_opc_create_io_sq(sc, cmd, &compl);
2089 			break;
2090 		case NVME_OPC_DELETE_IO_CQ:
2091 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2092 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2093 			break;
2094 		case NVME_OPC_CREATE_IO_CQ:
2095 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2096 			nvme_opc_create_io_cq(sc, cmd, &compl);
2097 			break;
2098 		case NVME_OPC_GET_LOG_PAGE:
2099 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2100 			nvme_opc_get_log_page(sc, cmd, &compl);
2101 			break;
2102 		case NVME_OPC_IDENTIFY:
2103 			DPRINTF("%s command IDENTIFY", __func__);
2104 			nvme_opc_identify(sc, cmd, &compl);
2105 			break;
2106 		case NVME_OPC_ABORT:
2107 			DPRINTF("%s command ABORT", __func__);
2108 			nvme_opc_abort(sc, cmd, &compl);
2109 			break;
2110 		case NVME_OPC_SET_FEATURES:
2111 			DPRINTF("%s command SET_FEATURES", __func__);
2112 			nvme_opc_set_features(sc, cmd, &compl);
2113 			break;
2114 		case NVME_OPC_GET_FEATURES:
2115 			DPRINTF("%s command GET_FEATURES", __func__);
2116 			nvme_opc_get_features(sc, cmd, &compl);
2117 			break;
2118 		case NVME_OPC_FIRMWARE_ACTIVATE:
2119 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2120 			pci_nvme_status_tc(&compl.status,
2121 			    NVME_SCT_COMMAND_SPECIFIC,
2122 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2123 			break;
2124 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2125 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2126 			nvme_opc_async_event_req(sc, cmd, &compl);
2127 			break;
2128 		case NVME_OPC_FORMAT_NVM:
2129 			DPRINTF("%s command FORMAT_NVM", __func__);
2130 			if ((sc->ctrldata.oacs &
2131 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2132 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2133 				break;
2134 			}
2135 			nvme_opc_format_nvm(sc, cmd, &compl);
2136 			break;
2137 		case NVME_OPC_SECURITY_SEND:
2138 		case NVME_OPC_SECURITY_RECEIVE:
2139 		case NVME_OPC_SANITIZE:
2140 		case NVME_OPC_GET_LBA_STATUS:
2141 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2142 			    cmd->opc);
2143 			/* Valid but unsupported opcodes */
2144 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2145 			break;
2146 		default:
2147 			DPRINTF("%s command OPC=%#X (not implemented)",
2148 			    __func__,
2149 			    cmd->opc);
2150 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2151 		}
2152 		sqhead = (sqhead + 1) % sq->size;
2153 
2154 		if (NVME_COMPLETION_VALID(compl)) {
2155 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2156 			    compl.cdw0,
2157 			    cmd->cid,
2158 			    0,		/* SQID */
2159 			    compl.status);
2160 		}
2161 	}
2162 
2163 	DPRINTF("setting sqhead %u", sqhead);
2164 	sq->head = sqhead;
2165 
2166 	if (cq->head != cq->tail)
2167 		pci_generate_msix(sc->nsc_pi, 0);
2168 
2169 	pthread_mutex_unlock(&sq->mtx);
2170 }
2171 
2172 /*
2173  * Update the Write and Read statistics reported in SMART data
2174  *
2175  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2176  * E.g. 1 data unit is 1 - 1,000 512 byte blocks; 3 data units are 2,001 -
2177  * 3,000 blocks. Rounding up is achieved by initializing the remainder to 999.
2178  */
2179 static void
2180 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2181     size_t bytes, uint16_t status)
2182 {
2183 
2184 	pthread_mutex_lock(&sc->mtx);
2185 	switch (opc) {
2186 	case NVME_OPC_WRITE:
2187 		sc->write_commands++;
2188 		if (status != NVME_SC_SUCCESS)
2189 			break;
2190 		sc->write_dunits_remainder += (bytes / 512);
2191 		while (sc->write_dunits_remainder >= 1000) {
2192 			sc->write_data_units++;
2193 			sc->write_dunits_remainder -= 1000;
2194 		}
2195 		break;
2196 	case NVME_OPC_READ:
2197 		sc->read_commands++;
2198 		if (status != NVME_SC_SUCCESS)
2199 			break;
2200 		sc->read_dunits_remainder += (bytes / 512);
2201 		while (sc->read_dunits_remainder >= 1000) {
2202 			sc->read_data_units++;
2203 			sc->read_dunits_remainder -= 1000;
2204 		}
2205 		break;
2206 	default:
2207 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2208 		break;
2209 	}
2210 	pthread_mutex_unlock(&sc->mtx);
2211 }
2212 
2213 /*
2214  * Check if the combination of Starting LBA (slba) and number of blocks
2215  * exceeds the range of the underlying storage.
2216  *
2217  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2218  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2219  * overflow.
2220  */
2221 static bool
2222 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2223     uint32_t nblocks)
2224 {
2225 	size_t	offset, bytes;
2226 
2227 	/* Overflow check of multiplying Starting LBA by the sector size */
2228 	if (slba >> (64 - nvstore->sectsz_bits))
2229 		return (true);
2230 
2231 	offset = slba << nvstore->sectsz_bits;
2232 	bytes = nblocks << nvstore->sectsz_bits;
2233 
2234 	/* Overflow check of Number of Logical Blocks */
2235 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2236 		return (true);
2237 
2238 	return (false);
2239 }
2240 
2241 static int
2242 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2243     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2244 {
2245 	int iovidx;
2246 	bool range_is_contiguous;
2247 
2248 	if (req == NULL)
2249 		return (-1);
2250 
2251 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2252 		return (-1);
2253 	}
2254 
2255 	/*
2256 	 * Minimize the number of IOVs by concatenating contiguous address
2257 	 * ranges. If the IOV count is zero, there is no previous range to
2258 	 * concatenate.
2259 	 */
2260 	if (req->io_req.br_iovcnt == 0)
2261 		range_is_contiguous = false;
2262 	else
2263 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2264 
2265 	if (range_is_contiguous) {
2266 		iovidx = req->io_req.br_iovcnt - 1;
2267 
2268 		req->io_req.br_iov[iovidx].iov_base =
2269 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2270 				     req->prev_gpaddr, size);
2271 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2272 			return (-1);
2273 
2274 		req->prev_size += size;
2275 		req->io_req.br_resid += size;
2276 
2277 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2278 	} else {
2279 		iovidx = req->io_req.br_iovcnt;
2280 		if (iovidx == 0) {
2281 			req->io_req.br_offset = offset;
2282 			req->io_req.br_resid = 0;
2283 			req->io_req.br_param = req;
2284 		}
2285 
2286 		req->io_req.br_iov[iovidx].iov_base =
2287 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2288 				     gpaddr, size);
2289 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2290 			return (-1);
2291 
2292 		req->io_req.br_iov[iovidx].iov_len = size;
2293 
2294 		req->prev_gpaddr = gpaddr;
2295 		req->prev_size = size;
2296 		req->io_req.br_resid += size;
2297 
2298 		req->io_req.br_iovcnt++;
2299 	}
2300 
2301 	return (0);
2302 }
2303 
2304 static void
2305 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2306     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2307 {
2308 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2309 
2310 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2311 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2312 		 NVME_STATUS_GET_SC(status));
2313 
2314 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2315 
2316 	if (cq->head != cq->tail) {
2317 		if (cq->intr_en & NVME_CQ_INTEN) {
2318 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2319 		} else {
2320 			DPRINTF("%s: CQ%u interrupt disabled",
2321 						__func__, sq->cqid);
2322 		}
2323 	}
2324 }
2325 
2326 static void
2327 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2328 {
2329 	req->sc = NULL;
2330 	req->nvme_sq = NULL;
2331 	req->sqid = 0;
2332 
2333 	pthread_mutex_lock(&sc->mtx);
2334 
2335 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2336 	sc->pending_ios--;
2337 
2338 	/* When no more I/O is pending, set RDY if the controller is enabled */
2339 	if (sc->pending_ios == 0 &&
2340 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2341 		sc->regs.csts |= NVME_CSTS_RDY;
2342 
2343 	pthread_mutex_unlock(&sc->mtx);
2344 
2345 	sem_post(&sc->iosemlock);
2346 }
2347 
2348 static struct pci_nvme_ioreq *
2349 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2350 {
2351 	struct pci_nvme_ioreq *req = NULL;
2352 
2353 	sem_wait(&sc->iosemlock);
2354 	pthread_mutex_lock(&sc->mtx);
2355 
2356 	req = STAILQ_FIRST(&sc->ioreqs_free);
2357 	assert(req != NULL);
2358 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2359 
2360 	req->sc = sc;
2361 
2362 	sc->pending_ios++;
2363 
2364 	pthread_mutex_unlock(&sc->mtx);
2365 
2366 	req->io_req.br_iovcnt = 0;
2367 	req->io_req.br_offset = 0;
2368 	req->io_req.br_resid = 0;
2369 	req->io_req.br_param = req;
2370 	req->prev_gpaddr = 0;
2371 	req->prev_size = 0;
2372 
2373 	return req;
2374 }
2375 
2376 static void
2377 pci_nvme_io_done(struct blockif_req *br, int err)
2378 {
2379 	struct pci_nvme_ioreq *req = br->br_param;
2380 	struct nvme_submission_queue *sq = req->nvme_sq;
2381 	uint16_t code, status;
2382 
2383 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2384 
2385 	/* TODO return correct error */
2386 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2387 	status = 0;
2388 	pci_nvme_status_genc(&status, code);
2389 
2390 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2391 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2392 	    req->bytes, status);
2393 	pci_nvme_release_ioreq(req->sc, req);
2394 }
2395 
2396 /*
2397  * Implements the Flush command. The specification states:
2398  *    If a volatile write cache is not present, Flush commands complete
2399  *    successfully and have no effect
2400  * in the description of the Volatile Write Cache (VWC) field of the Identify
2401  * Controller data. Therefore, set status to Success if the command is
2402  * not supported (i.e. RAM or as indicated by the blockif).
2403  */
2404 static bool
2405 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2406     struct nvme_command *cmd __unused,
2407     struct pci_nvme_blockstore *nvstore,
2408     struct pci_nvme_ioreq *req,
2409     uint16_t *status)
2410 {
2411 	bool pending = false;
2412 
2413 	if (nvstore->type == NVME_STOR_RAM) {
2414 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2415 	} else {
2416 		int err;
2417 
2418 		req->io_req.br_callback = pci_nvme_io_done;
2419 
2420 		err = blockif_flush(nvstore->ctx, &req->io_req);
2421 		switch (err) {
2422 		case 0:
2423 			pending = true;
2424 			break;
2425 		case EOPNOTSUPP:
2426 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2427 			break;
2428 		default:
2429 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2430 		}
2431 	}
2432 
2433 	return (pending);
2434 }
2435 
2436 static uint16_t
2437 nvme_write_read_ram(struct pci_nvme_softc *sc,
2438     struct pci_nvme_blockstore *nvstore,
2439     uint64_t prp1, uint64_t prp2,
2440     size_t offset, uint64_t bytes,
2441     bool is_write)
2442 {
2443 	uint8_t *buf = nvstore->ctx;
2444 	enum nvme_copy_dir dir;
2445 	uint16_t status;
2446 
2447 	if (is_write)
2448 		dir = NVME_COPY_TO_PRP;
2449 	else
2450 		dir = NVME_COPY_FROM_PRP;
2451 
2452 	status = 0;
2453 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2454 	    buf + offset, bytes, dir))
2455 		pci_nvme_status_genc(&status,
2456 		    NVME_SC_DATA_TRANSFER_ERROR);
2457 	else
2458 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2459 
2460 	return (status);
2461 }
2462 
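/*
 * Build the I/O vector for a blockif backed namespace from the command
 * PRPs. PRP1 covers the first, possibly page-unaligned, chunk. If the
 * remainder fits in a single page, PRP2 is a second data pointer;
 * otherwise PRP2 points to a PRP list whose last entry chains to the
 * next list page.
 */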
2463 static uint16_t
2464 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2465     struct pci_nvme_blockstore *nvstore,
2466     struct pci_nvme_ioreq *req,
2467     uint64_t prp1, uint64_t prp2,
2468     size_t offset, uint64_t bytes,
2469     bool is_write)
2470 {
2471 	uint64_t size;
2472 	int err;
2473 	uint16_t status = NVME_NO_STATUS;
2474 
2475 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2476 	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2477 		err = -1;
2478 		goto out;
2479 	}
2480 
2481 	offset += size;
2482 	bytes  -= size;
2483 
2484 	if (bytes == 0) {
2485 		;
2486 	} else if (bytes <= PAGE_SIZE) {
2487 		size = bytes;
2488 		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2489 			err = -1;
2490 			goto out;
2491 		}
2492 	} else {
2493 		void *vmctx = sc->nsc_pi->pi_vmctx;
2494 		uint64_t *prp_list = &prp2;
2495 		uint64_t *last = prp_list;
2496 
2497 		/* PRP2 is pointer to a physical region page list */
2498 		while (bytes) {
2499 			/* Last entry in list points to the next list */
2500 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2501 				uint64_t prp = *prp_list;
2502 
2503 				prp_list = paddr_guest2host(vmctx, prp,
2504 				    PAGE_SIZE - (prp % PAGE_SIZE));
2505 				if (prp_list == NULL) {
2506 					err = -1;
2507 					goto out;
2508 				}
2509 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2510 			}
2511 
2512 			size = MIN(bytes, PAGE_SIZE);
2513 
2514 			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2515 			    offset)) {
2516 				err = -1;
2517 				goto out;
2518 			}
2519 
2520 			offset += size;
2521 			bytes  -= size;
2522 
2523 			prp_list++;
2524 		}
2525 	}
2526 	req->io_req.br_callback = pci_nvme_io_done;
2527 	if (is_write)
2528 		err = blockif_write(nvstore->ctx, &req->io_req);
2529 	else
2530 		err = blockif_read(nvstore->ctx, &req->io_req);
2531 out:
2532 	if (err)
2533 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2534 
2535 	return (status);
2536 }
2537 
2538 static bool
2539 nvme_opc_write_read(struct pci_nvme_softc *sc,
2540     struct nvme_command *cmd,
2541     struct pci_nvme_blockstore *nvstore,
2542     struct pci_nvme_ioreq *req,
2543     uint16_t *status)
2544 {
2545 	uint64_t lba, nblocks, bytes;
2546 	size_t offset;
2547 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2548 	bool pending = false;
2549 
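	/*
	 * CDW10/CDW11 form the 64-bit Starting LBA (SLBA) and CDW12 bits
	 * 15:0 are the zero-based Number of Logical Blocks (NLB).
	 */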
2550 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2551 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2552 	bytes = nblocks << nvstore->sectsz_bits;
2553 	if (bytes > NVME_MAX_DATA_SIZE) {
2554 		WPRINTF("%s command would exceed MDTS", __func__);
2555 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2556 		goto out;
2557 	}
2558 
2559 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2560 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2561 		    __func__, lba, nblocks);
2562 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2563 		goto out;
2564 	}
2565 
2566 	offset = lba << nvstore->sectsz_bits;
2567 
2568 	req->bytes = bytes;
2569 	req->io_req.br_offset = lba;
2570 
2571 	/* PRP bits 1:0 must be zero */
2572 	cmd->prp1 &= ~0x3UL;
2573 	cmd->prp2 &= ~0x3UL;
2574 
2575 	if (nvstore->type == NVME_STOR_RAM) {
2576 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2577 		    cmd->prp2, offset, bytes, is_write);
2578 	} else {
2579 		*status = nvme_write_read_blockif(sc, nvstore, req,
2580 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2581 
2582 		if (*status == NVME_NO_STATUS)
2583 			pending = true;
2584 	}
2585 out:
2586 	if (!pending)
2587 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2588 
2589 	return (pending);
2590 }
2591 
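/*
 * Completion callback for the Dataset Management deallocate state machine.
 * Ranges beyond the first are stored in br_iov as (offset, length) pairs;
 * prev_gpaddr indexes the current range and prev_size holds the range
 * count, so each completion issues the next blockif_delete() until all
 * ranges are done or an error occurs.
 */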
2592 static void
2593 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2594 {
2595 	struct pci_nvme_ioreq *req = br->br_param;
2596 	struct pci_nvme_softc *sc = req->sc;
2597 	bool done = true;
2598 	uint16_t status;
2599 
2600 	status = 0;
2601 	if (err) {
2602 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2603 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2604 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2605 	} else {
2606 		struct iovec *iov = req->io_req.br_iov;
2607 
2608 		req->prev_gpaddr++;
2609 		iov += req->prev_gpaddr;
2610 
2611 		/* The iov_* values already include the sector size */
2612 		req->io_req.br_offset = (off_t)iov->iov_base;
2613 		req->io_req.br_resid = iov->iov_len;
2614 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2615 			pci_nvme_status_genc(&status,
2616 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2617 		} else
2618 			done = false;
2619 	}
2620 
2621 	if (done) {
2622 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2623 		    status);
2624 		pci_nvme_release_ioreq(sc, req);
2625 	}
2626 }
2627 
2628 static bool
2629 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2630     struct nvme_command *cmd,
2631     struct pci_nvme_blockstore *nvstore,
2632     struct pci_nvme_ioreq *req,
2633     uint16_t *status)
2634 {
2635 	struct nvme_dsm_range *range = NULL;
2636 	uint32_t nr, r, non_zero, dr;
2637 	int err;
2638 	bool pending = false;
2639 
2640 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2641 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2642 		goto out;
2643 	}
2644 
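	/* CDW10 bits 7:0 hold the zero-based Number of Ranges (NR) */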
2645 	nr = cmd->cdw10 & 0xff;
2646 
2647 	/* copy locally because a range entry could straddle PRPs */
2648 	range = calloc(1, NVME_MAX_DSM_TRIM);
2649 	if (range == NULL) {
2650 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2651 		goto out;
2652 	}
2653 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2654 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2655 
2656 	/* Check for invalid ranges and the number of non-zero lengths */
2657 	non_zero = 0;
2658 	for (r = 0; r <= nr; r++) {
2659 		if (pci_nvme_out_of_range(nvstore,
2660 		    range[r].starting_lba, range[r].length)) {
2661 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2662 			goto out;
2663 		}
2664 		if (range[r].length != 0)
2665 			non_zero++;
2666 	}
2667 
2668 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2669 		size_t offset, bytes;
2670 		int sectsz_bits = sc->nvstore.sectsz_bits;
2671 
2672 		/*
2673 		 * DSM calls are advisory only, and compliant controllers
2674 		 * may choose to take no actions (i.e. return Success).
2675 		 */
2676 		if (!nvstore->deallocate) {
2677 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2678 			goto out;
2679 		}
2680 
2681 		/* If all ranges have a zero length, return Success */
2682 		if (non_zero == 0) {
2683 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2684 			goto out;
2685 		}
2686 
2687 		if (req == NULL) {
2688 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2689 			goto out;
2690 		}
2691 
2692 		offset = range[0].starting_lba << sectsz_bits;
2693 		bytes = range[0].length << sectsz_bits;
2694 
2695 		/*
2696 		 * If the request is for more than a single range, store
2697 		 * the ranges in the br_iov. Optimize for the common case
2698 		 * of a single range.
2699 		 *
2700 		 * Note that NVMe Number of Ranges is a zero based value
2701 		 */
2702 		req->io_req.br_iovcnt = 0;
2703 		req->io_req.br_offset = offset;
2704 		req->io_req.br_resid = bytes;
2705 
2706 		if (nr == 0) {
2707 			req->io_req.br_callback = pci_nvme_io_done;
2708 		} else {
2709 			struct iovec *iov = req->io_req.br_iov;
2710 
2711 			for (r = 0, dr = 0; r <= nr; r++) {
2712 				offset = range[r].starting_lba << sectsz_bits;
2713 				bytes = range[r].length << sectsz_bits;
2714 				if (bytes == 0)
2715 					continue;
2716 
2717 				if ((nvstore->size - offset) < bytes) {
2718 					pci_nvme_status_genc(status,
2719 					    NVME_SC_LBA_OUT_OF_RANGE);
2720 					goto out;
2721 				}
2722 				iov[dr].iov_base = (void *)offset;
2723 				iov[dr].iov_len = bytes;
2724 				dr++;
2725 			}
2726 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2727 
2728 			/*
2729 			 * Use prev_gpaddr to track the current entry and
2730 			 * prev_size to track the number of entries
2731 			 */
2732 			req->prev_gpaddr = 0;
2733 			req->prev_size = dr;
2734 		}
2735 
2736 		err = blockif_delete(nvstore->ctx, &req->io_req);
2737 		if (err)
2738 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2739 		else
2740 			pending = true;
2741 	}
2742 out:
2743 	free(range);
2744 	return (pending);
2745 }
2746 
2747 static void
2748 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2749 {
2750 	struct nvme_submission_queue *sq;
2751 	uint16_t status;
2752 	uint16_t sqhead;
2753 
2754 	/* handle all submissions up to sq->tail index */
2755 	sq = &sc->submit_queues[idx];
2756 
2757 	pthread_mutex_lock(&sq->mtx);
2758 
2759 	sqhead = sq->head;
2760 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2761 	         idx, sqhead, sq->tail, sq->qbase);
2762 
2763 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2764 		struct nvme_command *cmd;
2765 		struct pci_nvme_ioreq *req;
2766 		uint32_t nsid;
2767 		bool pending;
2768 
2769 		pending = false;
2770 		req = NULL;
2771 		status = 0;
2772 
2773 		cmd = &sq->qbase[sqhead];
2774 		sqhead = (sqhead + 1) % sq->size;
2775 
2776 		nsid = le32toh(cmd->nsid);
2777 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2778 			pci_nvme_status_genc(&status,
2779 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2780 			status |=
2781 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2782 			goto complete;
2783 		}
2784 
2785 		req = pci_nvme_get_ioreq(sc);
2786 		if (req == NULL) {
2787 			pci_nvme_status_genc(&status,
2788 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2789 			WPRINTF("%s: unable to allocate IO req", __func__);
2790 			goto complete;
2791 		}
2792 		req->nvme_sq = sq;
2793 		req->sqid = idx;
2794 		req->opc = cmd->opc;
2795 		req->cid = cmd->cid;
2796 		req->nsid = cmd->nsid;
2797 
2798 		switch (cmd->opc) {
2799 		case NVME_OPC_FLUSH:
2800 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2801 			    req, &status);
2802 			break;
2803 		case NVME_OPC_WRITE:
2804 		case NVME_OPC_READ:
2805 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2806 			    req, &status);
2807 			break;
2808 		case NVME_OPC_WRITE_ZEROES:
2809 			/* TODO: write zeroes
2810 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2811 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2812 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2813 			break;
2814 		case NVME_OPC_DATASET_MANAGEMENT:
2815 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2816 			    req, &status);
2817 			break;
2818 		default:
2819 			WPRINTF("%s unhandled io command 0x%x",
2820 			    __func__, cmd->opc);
2821 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2822 		}
2823 complete:
2824 		if (!pending) {
2825 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2826 			if (req != NULL)
2827 				pci_nvme_release_ioreq(sc, req);
2828 		}
2829 	}
2830 
2831 	sq->head = sqhead;
2832 
2833 	pthread_mutex_unlock(&sq->mtx);
2834 }
2835 
2836 static void
2837 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc,
2838 	uint64_t idx, int is_sq, uint64_t value)
2839 {
2840 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2841 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2842 
2843 	if (is_sq) {
2844 		if (idx > sc->num_squeues) {
2845 			WPRINTF("%s queue index %lu overflow from "
2846 			         "guest (max %u)",
2847 			         __func__, idx, sc->num_squeues);
2848 			return;
2849 		}
2850 
2851 		atomic_store_short(&sc->submit_queues[idx].tail,
2852 		                   (uint16_t)value);
2853 
2854 		if (idx == 0) {
2855 			pci_nvme_handle_admin_cmd(sc, value);
2856 		} else {
2857 			/* submission queue; handle new entries in SQ */
2858 			if (idx > sc->num_squeues) {
2859 				WPRINTF("%s SQ index %lu overflow from "
2860 				         "guest (max %u)",
2861 				         __func__, idx, sc->num_squeues);
2862 				return;
2863 			}
2864 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2865 		}
2866 	} else {
2867 		if (idx > sc->num_cqueues) {
2868 			WPRINTF("%s queue index %lu overflow from "
2869 			         "guest (max %u)",
2870 			         __func__, idx, sc->num_cqueues);
2871 			return;
2872 		}
2873 
2874 		atomic_store_short(&sc->compl_queues[idx].head,
2875 				(uint16_t)value);
2876 	}
2877 }
2878 
2879 static void
2880 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2881 {
2882 	const char *s = iswrite ? "WRITE" : "READ";
2883 
2884 	switch (offset) {
2885 	case NVME_CR_CAP_LOW:
2886 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2887 		break;
2888 	case NVME_CR_CAP_HI:
2889 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2890 		break;
2891 	case NVME_CR_VS:
2892 		DPRINTF("%s %s NVME_CR_VS", func, s);
2893 		break;
2894 	case NVME_CR_INTMS:
2895 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2896 		break;
2897 	case NVME_CR_INTMC:
2898 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2899 		break;
2900 	case NVME_CR_CC:
2901 		DPRINTF("%s %s NVME_CR_CC", func, s);
2902 		break;
2903 	case NVME_CR_CSTS:
2904 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2905 		break;
2906 	case NVME_CR_NSSR:
2907 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2908 		break;
2909 	case NVME_CR_AQA:
2910 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2911 		break;
2912 	case NVME_CR_ASQ_LOW:
2913 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2914 		break;
2915 	case NVME_CR_ASQ_HI:
2916 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2917 		break;
2918 	case NVME_CR_ACQ_LOW:
2919 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2920 		break;
2921 	case NVME_CR_ACQ_HI:
2922 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2923 		break;
2924 	default:
2925 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2926 	}
2927 
2928 }
2929 
2930 static void
2931 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2932 	uint64_t offset, int size, uint64_t value)
2933 {
2934 	uint32_t ccreg;
2935 
2936 	if (offset >= NVME_DOORBELL_OFFSET) {
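		/*
		 * Doorbell layout (stride 0): each queue pair occupies 8
		 * bytes, the SQ Tail doorbell at (idx * 8) followed by the
		 * CQ Head doorbell at (idx * 8) + 4.
		 */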
2937 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2938 		uint64_t idx = belloffset / 8; /* two 32-bit doorbells per queue */
2939 		int is_sq = (belloffset % 8) < 4;
2940 
2941 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2942 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2943 			    offset);
2944 			return;
2945 		}
2946 
2947 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2948 			WPRINTF("guest attempted an overflow write offset "
2949 			         "0x%lx, val 0x%lx in %s",
2950 			         offset, value, __func__);
2951 			return;
2952 		}
2953 
2954 		if (is_sq) {
2955 			if (sc->submit_queues[idx].qbase == NULL)
2956 				return;
2957 		} else if (sc->compl_queues[idx].qbase == NULL)
2958 			return;
2959 
2960 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2961 		return;
2962 	}
2963 
2964 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2965 	        offset, size, value);
2966 
2967 	if (size != 4) {
2968 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2969 		         "val 0x%lx) to bar0 in %s",
2970 		         size, offset, value, __func__);
2971 		/* TODO: shutdown device */
2972 		return;
2973 	}
2974 
2975 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2976 
2977 	pthread_mutex_lock(&sc->mtx);
2978 
2979 	switch (offset) {
2980 	case NVME_CR_CAP_LOW:
2981 	case NVME_CR_CAP_HI:
2982 		/* readonly */
2983 		break;
2984 	case NVME_CR_VS:
2985 		/* readonly */
2986 		break;
2987 	case NVME_CR_INTMS:
2988 		/* MSI-X, so ignore */
2989 		break;
2990 	case NVME_CR_INTMC:
2991 		/* MSI-X, so ignore */
2992 		break;
2993 	case NVME_CR_CC:
2994 		ccreg = (uint32_t)value;
2995 
2996 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2997 		         "iocqes %u",
2998 		        __func__,
2999 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3000 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3001 			 NVME_CC_GET_IOCQES(ccreg));
3002 
3003 		if (NVME_CC_GET_SHN(ccreg)) {
3004 			/* perform shutdown - flush out data to backend */
3005 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3006 			    NVME_CSTS_REG_SHST_SHIFT);
3007 			sc->regs.csts |= NVME_SHST_COMPLETE <<
3008 			    NVME_CSTS_REG_SHST_SHIFT;
3009 		}
3010 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3011 			if (NVME_CC_GET_EN(ccreg) == 0)
3012 				/* transition 1->0 causes controller reset */
3013 				pci_nvme_reset_locked(sc);
3014 			else
3015 				pci_nvme_init_controller(ctx, sc);
3016 		}
3017 
3018 		/* Insert the iocqes, iosqes and en bits from the write */
3019 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3020 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3021 		if (NVME_CC_GET_EN(ccreg) == 0) {
3022 			/* Insert the ams, mps and css bit fields */
3023 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3024 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3025 			sc->regs.csts &= ~NVME_CSTS_RDY;
3026 		} else if ((sc->pending_ios == 0) &&
3027 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3028 			sc->regs.csts |= NVME_CSTS_RDY;
3029 		}
3030 		break;
3031 	case NVME_CR_CSTS:
3032 		break;
3033 	case NVME_CR_NSSR:
3034 		/* ignore writes; don't support subsystem reset */
3035 		break;
3036 	case NVME_CR_AQA:
3037 		sc->regs.aqa = (uint32_t)value;
3038 		break;
3039 	case NVME_CR_ASQ_LOW:
3040 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3041 		               (0xFFFFF000 & value);
3042 		break;
3043 	case NVME_CR_ASQ_HI:
3044 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3045 		               (value << 32);
3046 		break;
3047 	case NVME_CR_ACQ_LOW:
3048 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3049 		               (0xFFFFF000 & value);
3050 		break;
3051 	case NVME_CR_ACQ_HI:
3052 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3053 		               (value << 32);
3054 		break;
3055 	default:
3056 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3057 		         __func__, offset, value, size);
3058 	}
3059 	pthread_mutex_unlock(&sc->mtx);
3060 }
3061 
3062 static void
3063 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi,
3064     int baridx, uint64_t offset, int size, uint64_t value)
3065 {
3066 	struct pci_nvme_softc* sc = pi->pi_arg;
3067 
3068 	if (baridx == pci_msix_table_bar(pi) ||
3069 	    baridx == pci_msix_pba_bar(pi)) {
3070 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3071 		         " value 0x%lx", baridx, offset, size, value);
3072 
3073 		pci_emul_msix_twrite(pi, offset, size, value);
3074 		return;
3075 	}
3076 
3077 	switch (baridx) {
3078 	case 0:
3079 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3080 		break;
3081 
3082 	default:
3083 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3084 		         __func__, baridx, value);
3085 	}
3086 }
3087 
3088 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3089 	uint64_t offset, int size)
3090 {
3091 	uint64_t value;
3092 
3093 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3094 
3095 	if (offset < NVME_DOORBELL_OFFSET) {
3096 		void *p = &(sc->regs);
3097 		pthread_mutex_lock(&sc->mtx);
3098 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3099 		pthread_mutex_unlock(&sc->mtx);
3100 	} else {
3101 		value = 0;
3102 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3103 	}
3104 
3105 	switch (size) {
3106 	case 1:
3107 		value &= 0xFF;
3108 		break;
3109 	case 2:
3110 		value &= 0xFFFF;
3111 		break;
3112 	case 4:
3113 		value &= 0xFFFFFFFF;
3114 		break;
3115 	}
3116 
3117 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3118 	         offset, size, (uint32_t)value);
3119 
3120 	return (value);
3121 }
3122 
3123 
3124 
3125 static uint64_t
3126 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused,
3127     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3128 {
3129 	struct pci_nvme_softc* sc = pi->pi_arg;
3130 
3131 	if (baridx == pci_msix_table_bar(pi) ||
3132 	    baridx == pci_msix_pba_bar(pi)) {
3133 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3134 		        baridx, offset, size);
3135 
3136 		return pci_emul_msix_tread(pi, offset, size);
3137 	}
3138 
3139 	switch (baridx) {
3140 	case 0:
3141 		return pci_nvme_read_bar_0(sc, offset, size);
3142 
3143 	default:
3144 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3145 	}
3146 
3147 	return (0);
3148 }
3149 
3150 static int
3151 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3152 {
3153 	char bident[sizeof("XX:X:X")];
3154 	const char *value;
3155 	uint32_t sectsz;
3156 
3157 	sc->max_queues = NVME_QUEUES;
3158 	sc->max_qentries = NVME_MAX_QENTRIES;
3159 	sc->ioslots = NVME_IOSLOTS;
3160 	sc->num_squeues = sc->max_queues;
3161 	sc->num_cqueues = sc->max_queues;
3162 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3163 	sectsz = 0;
3164 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3165 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3166 
3167 	value = get_config_value_node(nvl, "maxq");
3168 	if (value != NULL)
3169 		sc->max_queues = atoi(value);
3170 	value = get_config_value_node(nvl, "qsz");
3171 	if (value != NULL) {
3172 		sc->max_qentries = atoi(value);
3173 		if (sc->max_qentries <= 0) {
3174 			EPRINTLN("nvme: Invalid qsz option %d",
3175 			    sc->max_qentries);
3176 			return (-1);
3177 		}
3178 	}
3179 	value = get_config_value_node(nvl, "ioslots");
3180 	if (value != NULL) {
3181 		sc->ioslots = atoi(value);
3182 		if (sc->ioslots <= 0) {
3183 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3184 			return (-1);
3185 		}
3186 	}
3187 	value = get_config_value_node(nvl, "sectsz");
3188 	if (value != NULL)
3189 		sectsz = atoi(value);
3190 	value = get_config_value_node(nvl, "ser");
3191 	if (value != NULL) {
3192 		/*
3193 		 * This field indicates the Product Serial Number in
3194 		 * 7-bit ASCII, unused bytes should be space characters.
3195 		 * Ref: NVMe v1.3c.
3196 		 */
3197 		cpywithpad((char *)sc->ctrldata.sn,
3198 		    sizeof(sc->ctrldata.sn), value, ' ');
3199 	}
3200 	value = get_config_value_node(nvl, "eui64");
3201 	if (value != NULL)
3202 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3203 	value = get_config_value_node(nvl, "dsm");
3204 	if (value != NULL) {
3205 		if (strcmp(value, "auto") == 0)
3206 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3207 		else if (strcmp(value, "enable") == 0)
3208 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3209 		else if (strcmp(value, "disable") == 0)
3210 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3211 	}
3212 
3213 	value = get_config_value_node(nvl, "ram");
3214 	if (value != NULL) {
3215 		uint64_t sz = strtoull(value, NULL, 10);
3216 
3217 		sc->nvstore.type = NVME_STOR_RAM;
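		/* ram= gives the size in MiB; convert to bytes */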
3218 		sc->nvstore.size = sz * 1024 * 1024;
3219 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3220 		sc->nvstore.sectsz = 4096;
3221 		sc->nvstore.sectsz_bits = 12;
3222 		if (sc->nvstore.ctx == NULL) {
3223 			EPRINTLN("nvme: Unable to allocate RAM");
3224 			return (-1);
3225 		}
3226 	} else {
3227 		snprintf(bident, sizeof(bident), "%d:%d",
3228 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3229 		sc->nvstore.ctx = blockif_open(nvl, bident);
3230 		if (sc->nvstore.ctx == NULL) {
3231 			EPRINTLN("nvme: Could not open backing file: %s",
3232 			    strerror(errno));
3233 			return (-1);
3234 		}
3235 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3236 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3237 	}
3238 
3239 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3240 		sc->nvstore.sectsz = sectsz;
3241 	else if (sc->nvstore.type != NVME_STOR_RAM)
3242 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
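	/* Derive sectsz_bits as log2 of the (power of two) sector size */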
3243 	for (sc->nvstore.sectsz_bits = 9;
3244 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3245 	     sc->nvstore.sectsz_bits++);
3246 
3247 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3248 		sc->max_queues = NVME_QUEUES;
3249 
3250 	return (0);
3251 }
3252 
3253 static void
3254 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3255     size_t new_size)
3256 {
3257 	struct pci_nvme_softc *sc;
3258 	struct pci_nvme_blockstore *nvstore;
3259 	struct nvme_namespace_data *nd;
3260 
3261 	sc = arg;
3262 	nvstore = &sc->nvstore;
3263 	nd = &sc->nsdata;
3264 
3265 	nvstore->size = new_size;
3266 	pci_nvme_init_nsdata_size(nvstore, nd);
3267 
3268 	/* Add changed NSID to list */
3269 	sc->ns_log.ns[0] = 1;
3270 	sc->ns_log.ns[1] = 0;
3271 
3272 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3273 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3274 }
3275 
3276 static int
3277 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3278 {
3279 	struct pci_nvme_softc *sc;
3280 	uint32_t pci_membar_sz;
3281 	int	error;
3282 
3283 	error = 0;
3284 
3285 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3286 	pi->pi_arg = sc;
3287 	sc->nsc_pi = pi;
3288 
3289 	error = pci_nvme_parse_config(sc, nvl);
3290 	if (error < 0)
3291 		goto done;
3292 	else
3293 		error = 0;
3294 
3295 	STAILQ_INIT(&sc->ioreqs_free);
3296 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3297 	for (uint32_t i = 0; i < sc->ioslots; i++) {
3298 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3299 	}
3300 
3301 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3302 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3303 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3304 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3305 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3306 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3307 
3308 	/*
3309 	 * Allocate size of NVMe registers + doorbell space for all queues.
3310 	 *
3311 	 * The specification requires a minimum memory I/O window size of 16K.
3312 	 * The Windows driver will refuse to start a device with a smaller
3313 	 * window.
3314 	 */
3315 	pci_membar_sz = sizeof(struct nvme_registers) +
3316 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3317 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3318 
3319 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3320 
3321 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3322 	if (error) {
3323 		WPRINTF("%s pci alloc mem bar failed", __func__);
3324 		goto done;
3325 	}
3326 
3327 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3328 	if (error) {
3329 		WPRINTF("%s pci add msixcap failed", __func__);
3330 		goto done;
3331 	}
3332 
3333 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3334 	if (error) {
3335 		WPRINTF("%s pci add Express capability failed", __func__);
3336 		goto done;
3337 	}
3338 
3339 	pthread_mutex_init(&sc->mtx, NULL);
3340 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3341 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3342 
3343 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3344 	/*
3345 	 * Controller data depends on Namespace data so initialize Namespace
3346 	 * data first.
3347 	 */
3348 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3349 	pci_nvme_init_ctrldata(sc);
3350 	pci_nvme_init_logpages(sc);
3351 	pci_nvme_init_features(sc);
3352 
3353 	pci_nvme_aer_init(sc);
3354 	pci_nvme_aen_init(sc);
3355 
3356 	pci_nvme_reset(sc);
3357 
3358 	pci_lintr_request(pi);
3359 
3360 done:
3361 	return (error);
3362 }
3363 
3364 static int
3365 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3366 {
3367 	char *cp, *ram;
3368 
3369 	if (opts == NULL)
3370 		return (0);
3371 
3372 	if (strncmp(opts, "ram=", 4) == 0) {
3373 		cp = strchr(opts, ',');
3374 		if (cp == NULL) {
3375 			set_config_value_node(nvl, "ram", opts + 4);
3376 			return (0);
3377 		}
3378 		ram = strndup(opts + 4, cp - opts - 4);
3379 		set_config_value_node(nvl, "ram", ram);
3380 		free(ram);
3381 		return (pci_parse_legacy_config(nvl, cp + 1));
3382 	} else
3383 		return (blockif_legacy_config(nvl, opts));
3384 }
3385 
3386 static const struct pci_devemu pci_de_nvme = {
3387 	.pe_emu =	"nvme",
3388 	.pe_init =	pci_nvme_init,
3389 	.pe_legacy_config = pci_nvme_legacy_config,
3390 	.pe_barwrite =	pci_nvme_write,
3391 	.pe_barread =	pci_nvme_read
3392 };
3393 PCI_EMUL_SET(pci_de_nvme);
3394