xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 35c87c070a2d04f06c56578b0a4b2e9c13f62be5)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
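/*
 * Example invocation (illustrative only; the slot number, sizes, and serial
 * below are arbitrary values chosen for this sketch):
 *
 *  -s 4,nvme,ram=4096,maxq=8,qsz=1024,ioslots=16,sectsz=4096,ser=NVME0001
 *
 * attaches a 4 GiB RAM-backed namespace with up to 8 I/O queue pairs of
 * 1024 entries each and 4096-byte sectors.
 */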
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
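/*
 * Worked example with the defaults above: MPSMIN = 0 means 4 KiB memory
 * pages, so MDTS = 9 advertises transfers up to 2^9 * 4 KiB = 2 MiB, which
 * needs at most 512 page-sized segments plus one extra iovec for an
 * unaligned first page (hence NVME_MAX_IOVEC = 513).
 */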
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
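/*
 * For example, num_squeues = 4 and num_cqueues = 4 (one-based counts) encode
 * as 0x00030003: the zero-based SQ count in the low word and the zero-based
 * CQ count in the high word.
 */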
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Maximum Data Transfer Size (MDTS) and the number of
204  * default iovecs in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
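/*
 * For instance, if BLOCKIF_IOV_MAX were 128 (its actual value lives in
 * block_if.h and is not shown here), MDTS_PAD_SIZE would evaluate to
 * 513 - 128 = 385 additional iovec entries per request.
 */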
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 /** Asynchronous Event Information - Notice */
273 typedef enum {
274 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
275 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
276 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
277 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
278 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
279 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
280 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
281 	PCI_NVME_AEI_NOTICE_MAX,
282 } pci_nvme_async_event_info_notice;
283 
284 #define PCI_NVME_AEI_NOTICE_SHIFT		8
285 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
286 
287 /* Asynchronous Event Notifications */
288 struct pci_nvme_aen {
289 	pci_nvme_async_type atype;
290 	uint32_t	event_data;
291 	bool		posted;
292 };
293 
294 /*
295  * By default, enable all Asynchronous Event Notifications:
296  *     SMART / Health Critical Warnings
297  *     Namespace Attribute Notices
298  */
299 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
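/*
 * 0x11f sets bits 0-4 (the SMART / Health Critical Warning bits) plus bit 8,
 * which is PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED shifted by
 * PCI_NVME_AEI_NOTICE_SHIFT (i.e. Namespace Attribute Notices).
 */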
300 
301 typedef enum {
302 	NVME_CNTRLTYPE_IO = 1,
303 	NVME_CNTRLTYPE_DISCOVERY = 2,
304 	NVME_CNTRLTYPE_ADMIN = 3,
305 } pci_nvme_cntrl_type;
306 
307 struct pci_nvme_softc {
308 	struct pci_devinst *nsc_pi;
309 
310 	pthread_mutex_t	mtx;
311 
312 	struct nvme_registers regs;
313 
314 	struct nvme_namespace_data  nsdata;
315 	struct nvme_controller_data ctrldata;
316 	struct nvme_error_information_entry err_log;
317 	struct nvme_health_information_page health_log;
318 	struct nvme_firmware_page fw_log;
319 	struct nvme_ns_list ns_log;
320 
321 	struct pci_nvme_blockstore nvstore;
322 
323 	uint16_t	max_qentries;	/* max entries per queue */
324 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
325 	uint32_t	num_cqueues;
326 	uint32_t	num_squeues;
327 	bool		num_q_is_set; /* Has host set Number of Queues */
328 
329 	struct pci_nvme_ioreq *ioreqs;
330 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
331 	uint32_t	pending_ios;
332 	uint32_t	ioslots;
333 	sem_t		iosemlock;
334 
335 	/*
336 	 * Memory mapped Submission and Completion queues
337 	 * Each array includes both Admin and IO queues
338 	 */
339 	struct nvme_completion_queue *compl_queues;
340 	struct nvme_submission_queue *submit_queues;
341 
342 	struct nvme_feature_obj feat[NVME_FID_MAX];
343 
344 	enum nvme_dsm_type dataset_management;
345 
346 	/* Accounting for SMART data */
347 	__uint128_t	read_data_units;
348 	__uint128_t	write_data_units;
349 	__uint128_t	read_commands;
350 	__uint128_t	write_commands;
351 	uint32_t	read_dunits_remainder;
352 	uint32_t	write_dunits_remainder;
353 
354 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
355 	pthread_mutex_t	aer_mtx;
356 	uint32_t	aer_count;
357 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
358 	pthread_t	aen_tid;
359 	pthread_mutex_t	aen_mtx;
360 	pthread_cond_t	aen_cond;
361 };
362 
363 
364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
365     struct nvme_completion_queue *cq,
366     uint32_t cdw0,
367     uint16_t cid,
368     uint16_t sqid,
369     uint16_t status);
370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
372 static void pci_nvme_io_done(struct blockif_req *, int);
373 
374 /* Controller Configuration utils */
375 #define	NVME_CC_GET_EN(cc) \
376 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
377 #define	NVME_CC_GET_CSS(cc) \
378 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
379 #define	NVME_CC_GET_SHN(cc) \
380 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
381 #define	NVME_CC_GET_IOSQES(cc) \
382 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
383 #define	NVME_CC_GET_IOCQES(cc) \
384 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
385 
386 #define	NVME_CC_WRITE_MASK \
387 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
388 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
389 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
390 
391 #define	NVME_CC_NEN_WRITE_MASK \
392 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
393 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
394 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
395 
396 /* Controller Status utils */
397 #define	NVME_CSTS_GET_RDY(sts) \
398 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
399 
400 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
401 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
402 
403 /* Completion Queue status word utils */
404 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
405 #define	NVME_STATUS_MASK \
406 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
407 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
408 
409 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
410 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
411 
412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416 static void nvme_feature_temperature(struct pci_nvme_softc *,
417     struct nvme_feature_obj *,
418     struct nvme_command *,
419     struct nvme_completion *);
420 static void nvme_feature_num_queues(struct pci_nvme_softc *,
421     struct nvme_feature_obj *,
422     struct nvme_command *,
423     struct nvme_completion *);
424 static void nvme_feature_iv_config(struct pci_nvme_softc *,
425     struct nvme_feature_obj *,
426     struct nvme_command *,
427     struct nvme_completion *);
428 static void nvme_feature_async_event(struct pci_nvme_softc *,
429     struct nvme_feature_obj *,
430     struct nvme_command *,
431     struct nvme_completion *);
432 
433 static void *aen_thr(void *arg);
434 
435 static __inline void
436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
437 {
438 	size_t len;
439 
440 	len = strnlen(src, dst_size);
441 	memset(dst, pad, dst_size);
442 	memcpy(dst, src, len);
443 }
444 
445 static __inline void
446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
447 {
448 
449 	*status &= ~NVME_STATUS_MASK;
450 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
451 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
452 }
453 
454 static __inline void
455 pci_nvme_status_genc(uint16_t *status, uint16_t code)
456 {
457 
458 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
459 }
460 
461 /*
462  * Initialize the requested number of IO Submission and Completion Queues.
463  * Admin queues are allocated implicitly.
464  */
465 static void
466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
467 {
468 	uint32_t i;
469 
470 	/*
471 	 * Allocate and initialize the Submission Queues
472 	 */
473 	if (nsq > NVME_QUEUES) {
474 		WPRINTF("%s: clamping number of SQ from %u to %u",
475 					__func__, nsq, NVME_QUEUES);
476 		nsq = NVME_QUEUES;
477 	}
478 
479 	sc->num_squeues = nsq;
480 
481 	sc->submit_queues = calloc(sc->num_squeues + 1,
482 				sizeof(struct nvme_submission_queue));
483 	if (sc->submit_queues == NULL) {
484 		WPRINTF("%s: SQ allocation failed", __func__);
485 		sc->num_squeues = 0;
486 	} else {
487 		struct nvme_submission_queue *sq = sc->submit_queues;
488 
489 		for (i = 0; i < sc->num_squeues + 1; i++)
490 			pthread_mutex_init(&sq[i].mtx, NULL);
491 	}
492 
493 	/*
494 	 * Allocate and initialize the Completion Queues
495 	 */
496 	if (ncq > NVME_QUEUES) {
497 		WPRINTF("%s: clamping number of CQ from %u to %u",
498 					__func__, ncq, NVME_QUEUES);
499 		ncq = NVME_QUEUES;
500 	}
501 
502 	sc->num_cqueues = ncq;
503 
504 	sc->compl_queues = calloc(sc->num_cqueues + 1,
505 				sizeof(struct nvme_completion_queue));
506 	if (sc->compl_queues == NULL) {
507 		WPRINTF("%s: CQ allocation failed", __func__);
508 		sc->num_cqueues = 0;
509 	} else {
510 		struct nvme_completion_queue *cq = sc->compl_queues;
511 
512 		for (i = 0; i < sc->num_cqueues + 1; i++)
513 			pthread_mutex_init(&cq[i].mtx, NULL);
514 	}
515 }
516 
517 static void
518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
519 {
520 	struct nvme_controller_data *cd = &sc->ctrldata;
521 
522 	cd->vid = 0xFB5D;
523 	cd->ssvid = 0x0000;
524 
525 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
526 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
527 
528 	/* Recommended Arbitration Burst (i.e. 2^rab commands) */
529 	cd->rab   = 4;
530 
531 	/* FreeBSD OUI */
532 	cd->ieee[0] = 0x58;
533 	cd->ieee[1] = 0x9c;
534 	cd->ieee[2] = 0xfc;
535 
536 	cd->mic = 0;
537 
538 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
539 
540 	cd->ver = NVME_REV(1,4);
541 
542 	cd->cntrltype = NVME_CNTRLTYPE_IO;
543 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
544 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
545 	cd->acl = 2;
546 	cd->aerl = 4;
547 
548 	/* Advertise a single, read-only firmware slot */
549 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
550 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
551 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
552 	cd->elpe = 0;	/* max error log page entries */
553 	/*
554 	 * Report a single power state (zero-based value)
555 	 * power_state[] values are left as zero to indicate "Not reported"
556 	 */
557 	cd->npss = 0;
558 
559 	/* Warning Composite Temperature Threshold */
560 	cd->wctemp = 0x0157;
561 	cd->cctemp = 0x0157;
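	/* 0x0157 is 343 Kelvin, i.e. roughly 70 degrees Celsius */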
562 
563 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
564 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
565 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
566 
567 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
568 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
569 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
570 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
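	/* SQES/CQES are log2 encodings: 2^6 = 64-byte SQ entries, 2^4 = 16-byte CQ entries */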
571 	cd->nn = 1;	/* number of namespaces */
572 
573 	cd->oncs = 0;
574 	switch (sc->dataset_management) {
575 	case NVME_DATASET_MANAGEMENT_AUTO:
576 		if (sc->nvstore.deallocate)
577 			cd->oncs |= NVME_ONCS_DSM;
578 		break;
579 	case NVME_DATASET_MANAGEMENT_ENABLE:
580 		cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	default:
583 		break;
584 	}
585 
586 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
587 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
588 
589 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
590 }
591 
592 /*
593  * Calculate the CRC-16 of the given buffer
594  * See copyright attribution at top of file
595  */
596 static uint16_t
597 crc16(uint16_t crc, const void *buffer, unsigned int len)
598 {
599 	const unsigned char *cp = buffer;
600 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
601 	static uint16_t const crc16_table[256] = {
602 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
603 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
604 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
605 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
606 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
607 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
608 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
609 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
610 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
611 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
612 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
613 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
614 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
615 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
616 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
617 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
618 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
619 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
620 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
621 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
622 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
623 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
624 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
625 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
626 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
627 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
628 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
629 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
630 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
631 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
632 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
633 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
634 	};
635 
636 	while (len--)
637 		crc = (((crc >> 8) & 0xffU) ^
638 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
639 	return crc;
640 }
641 
642 static void
643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
644     struct nvme_namespace_data *nd)
645 {
646 
647 	/* Get capacity and block size information from backing store */
648 	nd->nsze = nvstore->size / nvstore->sectsz;
649 	nd->ncap = nd->nsze;
650 	nd->nuse = nd->nsze;
651 }
652 
653 static void
654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
655     struct nvme_namespace_data *nd, uint32_t nsid,
656     struct pci_nvme_blockstore *nvstore)
657 {
658 
659 	pci_nvme_init_nsdata_size(nvstore, nd);
660 
661 	if (nvstore->type == NVME_STOR_BLOCKIF)
662 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
663 
664 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
665 	nd->flbas = 0;
666 
667 	/* Create an EUI-64 if user did not provide one */
668 	if (nvstore->eui64 == 0) {
669 		char *data = NULL;
670 		uint64_t eui64 = nvstore->eui64;
671 
672 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
673 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
674 		    sc->nsc_pi->pi_func);
675 
676 		if (data != NULL) {
677 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
678 			free(data);
679 		}
680 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
681 	}
682 	be64enc(nd->eui64, nvstore->eui64);
683 
684 	/* LBA data-sz = 2^lbads */
685 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
686 }
687 
688 static void
689 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
690 {
691 
692 	memset(&sc->err_log, 0, sizeof(sc->err_log));
693 	memset(&sc->health_log, 0, sizeof(sc->health_log));
694 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
695 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
696 
697 	/* Set read/write remainder to round up according to spec */
698 	sc->read_dunits_remainder = 999;
699 	sc->write_dunits_remainder = 999;
700 
701 	/* Set nominal Health values checked by implementations */
702 	sc->health_log.temperature = NVME_TEMPERATURE;
703 	sc->health_log.available_spare = 100;
704 	sc->health_log.available_spare_threshold = 10;
705 
706 	/* Set Active Firmware Info to slot 1 */
707 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
708 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
709 	    sizeof(sc->fw_log.revision[0]));
710 }
711 
712 static void
713 pci_nvme_init_features(struct pci_nvme_softc *sc)
714 {
715 	enum nvme_feature	fid;
716 
717 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
718 		switch (fid) {
719 		case NVME_FEAT_ARBITRATION:
720 		case NVME_FEAT_POWER_MANAGEMENT:
721 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
722 		case NVME_FEAT_WRITE_ATOMICITY:
723 			/* Mandatory but no special handling required */
724 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
725 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
726 		//		  this returns a data buffer
727 			break;
728 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
729 			sc->feat[fid].set = nvme_feature_temperature;
730 			break;
731 		case NVME_FEAT_ERROR_RECOVERY:
732 			sc->feat[fid].namespace_specific = true;
733 			break;
734 		case NVME_FEAT_NUMBER_OF_QUEUES:
735 			sc->feat[fid].set = nvme_feature_num_queues;
736 			break;
737 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
738 			sc->feat[fid].set = nvme_feature_iv_config;
739 			break;
740 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
741 			sc->feat[fid].set = nvme_feature_async_event;
742 			/* Enable all AENs by default */
743 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
744 			break;
745 		default:
746 			sc->feat[fid].set = nvme_feature_invalid_cb;
747 			sc->feat[fid].get = nvme_feature_invalid_cb;
748 		}
749 	}
750 }
751 
752 static void
753 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
754 {
755 
756 	STAILQ_INIT(&sc->aer_list);
757 	sc->aer_count = 0;
758 }
759 
760 static void
761 pci_nvme_aer_init(struct pci_nvme_softc *sc)
762 {
763 
764 	pthread_mutex_init(&sc->aer_mtx, NULL);
765 	pci_nvme_aer_reset(sc);
766 }
767 
768 static void
769 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
770 {
771 	struct pci_nvme_aer *aer = NULL;
772 
773 	pthread_mutex_lock(&sc->aer_mtx);
774 	while (!STAILQ_EMPTY(&sc->aer_list)) {
775 		aer = STAILQ_FIRST(&sc->aer_list);
776 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
777 		free(aer);
778 	}
779 	pthread_mutex_unlock(&sc->aer_mtx);
780 
781 	pci_nvme_aer_reset(sc);
782 }
783 
784 static bool
785 pci_nvme_aer_available(struct pci_nvme_softc *sc)
786 {
787 
788 	return (sc->aer_count != 0);
789 }
790 
791 static bool
792 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
793 {
794 	struct nvme_controller_data *cd = &sc->ctrldata;
795 
796 	/* AERL is a zero-based value while aer_count is one-based */
797 	return (sc->aer_count == (cd->aerl + 1));
798 }
799 
800 /*
801  * Add an Async Event Request
802  *
803  * Stores an AER to be returned later if the Controller needs to notify the
804  * host of an event.
805  * Note that while the NVMe spec doesn't require Controllers to return AER's
806  * in order, this implementation does preserve the order.
807  */
808 static int
809 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
810 {
811 	struct pci_nvme_aer *aer = NULL;
812 
813 	aer = calloc(1, sizeof(struct pci_nvme_aer));
814 	if (aer == NULL)
815 		return (-1);
816 
817 	/* Save the Command ID for use in the completion message */
818 	aer->cid = cid;
819 
820 	pthread_mutex_lock(&sc->aer_mtx);
821 	sc->aer_count++;
822 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
823 	pthread_mutex_unlock(&sc->aer_mtx);
824 
825 	return (0);
826 }
827 
828 /*
829  * Get an Async Event Request structure
830  *
831  * Returns a pointer to an AER previously submitted by the host or NULL if
832  * no AER's exist. Caller is responsible for freeing the returned struct.
833  */
834 static struct pci_nvme_aer *
835 pci_nvme_aer_get(struct pci_nvme_softc *sc)
836 {
837 	struct pci_nvme_aer *aer = NULL;
838 
839 	pthread_mutex_lock(&sc->aer_mtx);
840 	aer = STAILQ_FIRST(&sc->aer_list);
841 	if (aer != NULL) {
842 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
843 		sc->aer_count--;
844 	}
845 	pthread_mutex_unlock(&sc->aer_mtx);
846 
847 	return (aer);
848 }
849 
850 static void
851 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
852 {
853 	uint32_t	atype;
854 
855 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
856 
857 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
858 		sc->aen[atype].atype = atype;
859 	}
860 }
861 
862 static void
863 pci_nvme_aen_init(struct pci_nvme_softc *sc)
864 {
865 	char nstr[80];
866 
867 	pci_nvme_aen_reset(sc);
868 
869 	pthread_mutex_init(&sc->aen_mtx, NULL);
870 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
871 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
872 	    sc->nsc_pi->pi_func);
873 	pthread_set_name_np(sc->aen_tid, nstr);
874 }
875 
876 static void
877 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
878 {
879 
880 	pci_nvme_aen_reset(sc);
881 }
882 
883 /* Notify the AEN thread of pending work */
884 static void
885 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
886 {
887 
888 	pthread_cond_signal(&sc->aen_cond);
889 }
890 
891 /*
892  * Post an Asynchronous Event Notification
893  */
894 static int32_t
895 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
896 		uint32_t event_data)
897 {
898 	struct pci_nvme_aen *aen;
899 
900 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
901 		return(EINVAL);
902 	}
903 
904 	pthread_mutex_lock(&sc->aen_mtx);
905 	aen = &sc->aen[atype];
906 
907 	/* Has the controller already posted an event of this type? */
908 	if (aen->posted) {
909 		pthread_mutex_unlock(&sc->aen_mtx);
910 		return(EALREADY);
911 	}
912 
913 	aen->event_data = event_data;
914 	aen->posted = true;
915 	pthread_mutex_unlock(&sc->aen_mtx);
916 
917 	pci_nvme_aen_notify(sc);
918 
919 	return(0);
920 }
921 
922 static void
923 pci_nvme_aen_process(struct pci_nvme_softc *sc)
924 {
925 	struct pci_nvme_aer *aer;
926 	struct pci_nvme_aen *aen;
927 	pci_nvme_async_type atype;
928 	uint32_t mask;
929 	uint16_t status;
930 	uint8_t lid;
931 
932 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
933 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
934 		aen = &sc->aen[atype];
935 		/* Previous iterations may have depleted the available AER's */
936 		if (!pci_nvme_aer_available(sc)) {
937 			DPRINTF("%s: no AER", __func__);
938 			break;
939 		}
940 
941 		if (!aen->posted) {
942 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
943 			continue;
944 		}
945 
946 		status = NVME_SC_SUCCESS;
947 
948 		/* Is the event masked? */
949 		mask =
950 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
951 
952 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
953 		switch (atype) {
954 		case PCI_NVME_AE_TYPE_ERROR:
955 			lid = NVME_LOG_ERROR;
956 			break;
957 		case PCI_NVME_AE_TYPE_SMART:
958 			mask &= 0xff;
959 			if ((mask & aen->event_data) == 0)
960 				continue;
961 			lid = NVME_LOG_HEALTH_INFORMATION;
962 			break;
963 		case PCI_NVME_AE_TYPE_NOTICE:
964 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
965 				EPRINTLN("%s unknown AEN notice type %u",
966 				    __func__, aen->event_data);
967 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
968 				break;
969 			}
970 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
971 				continue;
972 			switch (aen->event_data) {
973 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
974 				lid = NVME_LOG_CHANGED_NAMESPACE;
975 				break;
976 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
977 				lid = NVME_LOG_FIRMWARE_SLOT;
978 				break;
979 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
980 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
981 				break;
982 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
983 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
984 				break;
985 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
986 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
987 				break;
988 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
989 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
990 				break;
991 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
992 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
993 				break;
994 			default:
995 				lid = 0;
996 			}
997 			break;
998 		default:
999 			/* bad type?!? */
1000 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1001 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1002 			break;
1003 		}
1004 
1005 		aer = pci_nvme_aer_get(sc);
1006 		assert(aer != NULL);
1007 
1008 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1009 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1010 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1011 		    aer->cid,
1012 		    0,		/* SQID */
1013 		    status);
1014 
1015 		aen->event_data = 0;
1016 		aen->posted = false;
1017 
1018 		pci_generate_msix(sc->nsc_pi, 0);
1019 	}
1020 }
1021 
1022 static void *
1023 aen_thr(void *arg)
1024 {
1025 	struct pci_nvme_softc *sc;
1026 
1027 	sc = arg;
1028 
1029 	pthread_mutex_lock(&sc->aen_mtx);
1030 	for (;;) {
1031 		pci_nvme_aen_process(sc);
1032 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1033 	}
1034 	pthread_mutex_unlock(&sc->aen_mtx);
1035 
1036 	pthread_exit(NULL);
1037 	return (NULL);
1038 }
1039 
1040 static void
1041 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1042 {
1043 	uint32_t i;
1044 
1045 	DPRINTF("%s", __func__);
1046 
1047 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1048 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1049 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1050 
1051 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1052 
1053 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1054 
1055 	sc->regs.cc = 0;
1056 
1057 	assert(sc->submit_queues != NULL);
1058 
1059 	for (i = 0; i < sc->num_squeues + 1; i++) {
1060 		sc->submit_queues[i].qbase = NULL;
1061 		sc->submit_queues[i].size = 0;
1062 		sc->submit_queues[i].cqid = 0;
1063 		sc->submit_queues[i].tail = 0;
1064 		sc->submit_queues[i].head = 0;
1065 	}
1066 
1067 	assert(sc->compl_queues != NULL);
1068 
1069 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1070 		sc->compl_queues[i].qbase = NULL;
1071 		sc->compl_queues[i].size = 0;
1072 		sc->compl_queues[i].tail = 0;
1073 		sc->compl_queues[i].head = 0;
1074 	}
1075 
1076 	sc->num_q_is_set = false;
1077 
1078 	pci_nvme_aer_destroy(sc);
1079 	pci_nvme_aen_destroy(sc);
1080 
1081 	/*
1082 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1083 	 * before cleanup completes
1084 	 */
1085 	sc->regs.csts = 0;
1086 }
1087 
1088 static void
1089 pci_nvme_reset(struct pci_nvme_softc *sc)
1090 {
1091 	pthread_mutex_lock(&sc->mtx);
1092 	pci_nvme_reset_locked(sc);
1093 	pthread_mutex_unlock(&sc->mtx);
1094 }
1095 
1096 static int
1097 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1098 {
1099 	uint16_t acqs, asqs;
1100 
1101 	DPRINTF("%s", __func__);
1102 
1103 	/*
1104 	 * NVMe 2.0 states that "enabling a controller while this field is
1105 	 * cleared to 0h produces undefined results" for both ACQS and
1106 	 * ASQS. If zero, set CFS and do not become ready.
1107 	 */
1108 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1109 	if (asqs < 2) {
1110 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1111 		    asqs - 1, sc->regs.aqa);
1112 		sc->regs.csts |= NVME_CSTS_CFS;
1113 		return (-1);
1114 	}
1115 	sc->submit_queues[0].size = asqs;
1116 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1117 	            sizeof(struct nvme_command) * asqs);
1118 	if (sc->submit_queues[0].qbase == NULL) {
1119 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1120 		    sc->regs.asq);
1121 		sc->regs.csts |= NVME_CSTS_CFS;
1122 		return (-1);
1123 	}
1124 
1125 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1126 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1127 
1128 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1129 	    NVME_AQA_REG_ACQS_MASK);
1130 	if (acqs < 2) {
1131 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1132 		    acqs - 1, sc->regs.aqa);
1133 		sc->regs.csts |= NVME_CSTS_CFS;
1134 		return (-1);
1135 	}
1136 	sc->compl_queues[0].size = acqs;
1137 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1138 	         sizeof(struct nvme_completion) * acqs);
1139 	if (sc->compl_queues[0].qbase == NULL) {
1140 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1141 		    sc->regs.acq);
1142 		sc->regs.csts |= NVME_CSTS_CFS;
1143 		return (-1);
1144 	}
1145 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1146 
1147 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1148 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1149 
1150 	return (0);
1151 }
1152 
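/*
 * Copy data between a host buffer and a guest buffer described by a pair of
 * PRP entries.  PRP2 is treated as a plain second page pointer rather than a
 * PRP list, so transfers are limited to two pages (8 KiB), which is enough
 * for the admin data structures this helper is used for.
 */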
1153 static int
1154 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1155 	size_t len, enum nvme_copy_dir dir)
1156 {
1157 	uint8_t *p;
1158 	size_t bytes;
1159 
1160 	if (len > (8 * 1024)) {
1161 		return (-1);
1162 	}
1163 
1164 	/* Copy from the start of prp1 to the end of the physical page */
1165 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1166 	bytes = MIN(bytes, len);
1167 
1168 	p = vm_map_gpa(ctx, prp1, bytes);
1169 	if (p == NULL) {
1170 		return (-1);
1171 	}
1172 
1173 	if (dir == NVME_COPY_TO_PRP)
1174 		memcpy(p, b, bytes);
1175 	else
1176 		memcpy(b, p, bytes);
1177 
1178 	b += bytes;
1179 
1180 	len -= bytes;
1181 	if (len == 0) {
1182 		return (0);
1183 	}
1184 
1185 	len = MIN(len, PAGE_SIZE);
1186 
1187 	p = vm_map_gpa(ctx, prp2, len);
1188 	if (p == NULL) {
1189 		return (-1);
1190 	}
1191 
1192 	if (dir == NVME_COPY_TO_PRP)
1193 		memcpy(p, b, len);
1194 	else
1195 		memcpy(b, p, len);
1196 
1197 	return (0);
1198 }
1199 
1200 /*
1201  * Write a Completion Queue Entry update
1202  *
1203  * Write the completion entry and advance the queue's tail pointer
1204  */
1205 static void
1206 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1207 		struct nvme_completion_queue *cq,
1208 		uint32_t cdw0,
1209 		uint16_t cid,
1210 		uint16_t sqid,
1211 		uint16_t status)
1212 {
1213 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1214 	struct nvme_completion *cqe;
1215 
1216 	assert(cq->qbase != NULL);
1217 
1218 	pthread_mutex_lock(&cq->mtx);
1219 
1220 	cqe = &cq->qbase[cq->tail];
1221 
1222 	/* Flip the phase bit */
1223 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1224 
1225 	cqe->cdw0 = cdw0;
1226 	cqe->sqhd = sq->head;
1227 	cqe->sqid = sqid;
1228 	cqe->cid = cid;
1229 	cqe->status = status;
1230 
1231 	cq->tail++;
1232 	if (cq->tail >= cq->size) {
1233 		cq->tail = 0;
1234 	}
1235 
1236 	pthread_mutex_unlock(&cq->mtx);
1237 }
1238 
1239 static int
1240 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1241 	struct nvme_completion* compl)
1242 {
1243 	uint16_t qid = command->cdw10 & 0xffff;
1244 
1245 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1246 	if (qid == 0 || qid > sc->num_squeues ||
1247 	    (sc->submit_queues[qid].qbase == NULL)) {
1248 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1249 		        __func__, qid, sc->num_squeues);
1250 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1251 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1252 		return (1);
1253 	}
1254 
1255 	sc->submit_queues[qid].qbase = NULL;
1256 	sc->submit_queues[qid].cqid = 0;
1257 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1258 	return (1);
1259 }
1260 
1261 static int
1262 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1263 	struct nvme_completion* compl)
1264 {
1265 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1266 		uint16_t qid = command->cdw10 & 0xffff;
1267 		struct nvme_submission_queue *nsq;
1268 
1269 		if ((qid == 0) || (qid > sc->num_squeues) ||
1270 		    (sc->submit_queues[qid].qbase != NULL)) {
1271 			WPRINTF("%s queue index %u > num_squeues %u",
1272 			        __func__, qid, sc->num_squeues);
1273 			pci_nvme_status_tc(&compl->status,
1274 			    NVME_SCT_COMMAND_SPECIFIC,
1275 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1276 			return (1);
1277 		}
1278 
1279 		nsq = &sc->submit_queues[qid];
1280 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1281 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1282 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1283 			/*
1284 			 * Queues must specify at least two entries
1285 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1286 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1287 			 */
1288 			pci_nvme_status_tc(&compl->status,
1289 			    NVME_SCT_COMMAND_SPECIFIC,
1290 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1291 			return (1);
1292 		}
1293 		nsq->head = nsq->tail = 0;
1294 
1295 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1296 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1297 			pci_nvme_status_tc(&compl->status,
1298 			    NVME_SCT_COMMAND_SPECIFIC,
1299 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1300 			return (1);
1301 		}
1302 
1303 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1304 			pci_nvme_status_tc(&compl->status,
1305 			    NVME_SCT_COMMAND_SPECIFIC,
1306 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1307 			return (1);
1308 		}
1309 
1310 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1311 
1312 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1313 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1314 
1315 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1316 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1317 
1318 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1319 
1320 		DPRINTF("%s completed creating IOSQ qid %u",
1321 		         __func__, qid);
1322 	} else {
1323 		/*
1324 		 * Guest sent a non-contiguous submission queue request.
1325 		 * This setting is unsupported by this emulation.
1326 		 */
1327 		WPRINTF("%s unsupported non-contig (list-based) "
1328 		         "create i/o submission queue", __func__);
1329 
1330 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1331 	}
1332 	return (1);
1333 }
1334 
1335 static int
1336 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1337 	struct nvme_completion* compl)
1338 {
1339 	uint16_t qid = command->cdw10 & 0xffff;
1340 	uint16_t sqid;
1341 
1342 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1343 	if (qid == 0 || qid > sc->num_cqueues ||
1344 	    (sc->compl_queues[qid].qbase == NULL)) {
1345 		WPRINTF("%s queue index %u / num_cqueues %u",
1346 		        __func__, qid, sc->num_cqueues);
1347 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1348 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1349 		return (1);
1350 	}
1351 
1352 	/* Deleting an Active CQ is an error */
1353 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1354 		if (sc->submit_queues[sqid].cqid == qid) {
1355 			pci_nvme_status_tc(&compl->status,
1356 			    NVME_SCT_COMMAND_SPECIFIC,
1357 			    NVME_SC_INVALID_QUEUE_DELETION);
1358 			return (1);
1359 		}
1360 
1361 	sc->compl_queues[qid].qbase = NULL;
1362 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1363 	return (1);
1364 }
1365 
1366 static int
1367 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1368 	struct nvme_completion* compl)
1369 {
1370 	struct nvme_completion_queue *ncq;
1371 	uint16_t qid = command->cdw10 & 0xffff;
1372 
1373 	/* Only support Physically Contiguous queues */
1374 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1375 		WPRINTF("%s unsupported non-contig (list-based) "
1376 		         "create i/o completion queue",
1377 		         __func__);
1378 
1379 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1380 		return (1);
1381 	}
1382 
1383 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1384 	    (sc->compl_queues[qid].qbase != NULL)) {
1385 		WPRINTF("%s queue index %u > num_cqueues %u",
1386 			__func__, qid, sc->num_cqueues);
1387 		pci_nvme_status_tc(&compl->status,
1388 		    NVME_SCT_COMMAND_SPECIFIC,
1389 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1390 		return (1);
1391  	}
1392 
1393 	ncq = &sc->compl_queues[qid];
1394 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1395 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1396 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1397 		pci_nvme_status_tc(&compl->status,
1398 		    NVME_SCT_COMMAND_SPECIFIC,
1399 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1400 		return (1);
1401 	}
1402 
1403 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1404 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1405 		/*
1406 		 * Queues must specify at least two entries
1407 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1408 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1409 		 */
1410 		pci_nvme_status_tc(&compl->status,
1411 		    NVME_SCT_COMMAND_SPECIFIC,
1412 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1413 		return (1);
1414 	}
1415 	ncq->head = ncq->tail = 0;
1416 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1417 		     command->prp1,
1418 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1419 
1420 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1421 
1422 
1423 	return (1);
1424 }
1425 
1426 static int
1427 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1428 	struct nvme_completion* compl)
1429 {
1430 	uint64_t logoff;
1431 	uint32_t logsize;
1432 	uint8_t logpage;
1433 
1434 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1435 
1436 	/*
1437 	 * Command specifies the number of dwords to return in fields NUMDU
1438 	 * and NUMDL. This is a zero-based value.
1439 	 */
1440 	logpage = command->cdw10 & 0xFF;
1441 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1442 	logsize *= sizeof(uint32_t);
1443 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1444 
1445 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1446 
1447 	switch (logpage) {
1448 	case NVME_LOG_ERROR:
1449 		if (logoff >= sizeof(sc->err_log)) {
1450 			pci_nvme_status_genc(&compl->status,
1451 			    NVME_SC_INVALID_FIELD);
1452 			break;
1453 		}
1454 
1455 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1456 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1457 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1458 		    NVME_COPY_TO_PRP);
1459 		break;
1460 	case NVME_LOG_HEALTH_INFORMATION:
1461 		if (logoff >= sizeof(sc->health_log)) {
1462 			pci_nvme_status_genc(&compl->status,
1463 			    NVME_SC_INVALID_FIELD);
1464 			break;
1465 		}
1466 
1467 		pthread_mutex_lock(&sc->mtx);
1468 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1469 		    sizeof(sc->health_log.data_units_read));
1470 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1471 		    sizeof(sc->health_log.data_units_written));
1472 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1473 		    sizeof(sc->health_log.host_read_commands));
1474 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1475 		    sizeof(sc->health_log.host_write_commands));
1476 		pthread_mutex_unlock(&sc->mtx);
1477 
1478 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1479 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1480 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1481 		    NVME_COPY_TO_PRP);
1482 		break;
1483 	case NVME_LOG_FIRMWARE_SLOT:
1484 		if (logoff >= sizeof(sc->fw_log)) {
1485 			pci_nvme_status_genc(&compl->status,
1486 			    NVME_SC_INVALID_FIELD);
1487 			break;
1488 		}
1489 
1490 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1491 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1492 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1493 		    NVME_COPY_TO_PRP);
1494 		break;
1495 	case NVME_LOG_CHANGED_NAMESPACE:
1496 		if (logoff >= sizeof(sc->ns_log)) {
1497 			pci_nvme_status_genc(&compl->status,
1498 			    NVME_SC_INVALID_FIELD);
1499 			break;
1500 		}
1501 
1502 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1503 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1504 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1505 		    NVME_COPY_TO_PRP);
1506 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1507 		break;
1508 	default:
1509 		DPRINTF("%s get log page %x command not supported",
1510 		        __func__, logpage);
1511 
1512 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1513 		    NVME_SC_INVALID_LOG_PAGE);
1514 	}
1515 
1516 	return (1);
1517 }
1518 
1519 static int
1520 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1521 	struct nvme_completion* compl)
1522 {
1523 	void *dest;
1524 	uint16_t status;
1525 
1526 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1527 	        command->cdw10 & 0xFF, command->nsid);
1528 
1529 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1530 
1531 	switch (command->cdw10 & 0xFF) {
1532 	case 0x00: /* return Identify Namespace data structure */
1533 		/* Global NS only valid with NS Management */
1534 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1535 			pci_nvme_status_genc(&status,
1536 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1537 			break;
1538 		}
1539 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1540 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1541 		    NVME_COPY_TO_PRP);
1542 		break;
1543 	case 0x01: /* return Identify Controller data structure */
1544 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1545 		    command->prp2, (uint8_t *)&sc->ctrldata,
1546 		    sizeof(sc->ctrldata),
1547 		    NVME_COPY_TO_PRP);
1548 		break;
1549 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1550 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1551 		                  sizeof(uint32_t) * 1024);
1552 		/* All unused entries shall be zero */
1553 		memset(dest, 0, sizeof(uint32_t) * 1024);
1554 		((uint32_t *)dest)[0] = 1;
1555 		break;
1556 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1557 		if (command->nsid != 1) {
1558 			pci_nvme_status_genc(&status,
1559 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1560 			break;
1561 		}
1562 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1563 		                  sizeof(uint32_t) * 1024);
1564 		/* All bytes after the descriptor shall be zero */
1565 		memset(dest, 0, sizeof(uint32_t) * 1024);
1566 
1567 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1568 		((uint8_t *)dest)[0] = 1;
1569 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1570 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1571 		break;
1572 	case 0x13:
1573 		/*
1574 		 * Controller list is optional but used by UNH tests. Return
1575 		 * a valid but empty list.
1576 		 */
1577 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1578 		                  sizeof(uint16_t) * 2048);
1579 		memset(dest, 0, sizeof(uint16_t) * 2048);
1580 		break;
1581 	default:
1582 		DPRINTF("%s unsupported identify command requested 0x%x",
1583 		         __func__, command->cdw10 & 0xFF);
1584 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1585 		break;
1586 	}
1587 
1588 	compl->status = status;
1589 	return (1);
1590 }
1591 
1592 static const char *
1593 nvme_fid_to_name(uint8_t fid)
1594 {
1595 	const char *name;
1596 
1597 	switch (fid) {
1598 	case NVME_FEAT_ARBITRATION:
1599 		name = "Arbitration";
1600 		break;
1601 	case NVME_FEAT_POWER_MANAGEMENT:
1602 		name = "Power Management";
1603 		break;
1604 	case NVME_FEAT_LBA_RANGE_TYPE:
1605 		name = "LBA Range Type";
1606 		break;
1607 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1608 		name = "Temperature Threshold";
1609 		break;
1610 	case NVME_FEAT_ERROR_RECOVERY:
1611 		name = "Error Recovery";
1612 		break;
1613 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1614 		name = "Volatile Write Cache";
1615 		break;
1616 	case NVME_FEAT_NUMBER_OF_QUEUES:
1617 		name = "Number of Queues";
1618 		break;
1619 	case NVME_FEAT_INTERRUPT_COALESCING:
1620 		name = "Interrupt Coalescing";
1621 		break;
1622 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1623 		name = "Interrupt Vector Configuration";
1624 		break;
1625 	case NVME_FEAT_WRITE_ATOMICITY:
1626 		name = "Write Atomicity Normal";
1627 		break;
1628 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1629 		name = "Asynchronous Event Configuration";
1630 		break;
1631 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1632 		name = "Autonomous Power State Transition";
1633 		break;
1634 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1635 		name = "Host Memory Buffer";
1636 		break;
1637 	case NVME_FEAT_TIMESTAMP:
1638 		name = "Timestamp";
1639 		break;
1640 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1641 		name = "Keep Alive Timer";
1642 		break;
1643 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1644 		name = "Host Controlled Thermal Management";
1645 		break;
1646 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1647 		name = "Non-Operation Power State Config";
1648 		break;
1649 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1650 		name = "Read Recovery Level Config";
1651 		break;
1652 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1653 		name = "Predictable Latency Mode Config";
1654 		break;
1655 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1656 		name = "Predictable Latency Mode Window";
1657 		break;
1658 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1659 		name = "LBA Status Information Report Interval";
1660 		break;
1661 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1662 		name = "Host Behavior Support";
1663 		break;
1664 	case NVME_FEAT_SANITIZE_CONFIG:
1665 		name = "Sanitize Config";
1666 		break;
1667 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1668 		name = "Endurance Group Event Configuration";
1669 		break;
1670 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1671 		name = "Software Progress Marker";
1672 		break;
1673 	case NVME_FEAT_HOST_IDENTIFIER:
1674 		name = "Host Identifier";
1675 		break;
1676 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1677 		name = "Reservation Notification Mask";
1678 		break;
1679 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1680 		name = "Reservation Persistence";
1681 		break;
1682 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1683 		name = "Namespace Write Protection Config";
1684 		break;
1685 	default:
1686 		name = "Unknown";
1687 		break;
1688 	}
1689 
1690 	return (name);
1691 }
1692 
1693 static void
1694 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1695     struct nvme_feature_obj *feat __unused,
1696     struct nvme_command *command __unused,
1697     struct nvme_completion *compl)
1698 {
1699 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1700 }
1701 
1702 static void
1703 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1704     struct nvme_feature_obj *feat __unused,
1705     struct nvme_command *command,
1706     struct nvme_completion *compl)
1707 {
1708 	uint32_t i;
1709 	uint32_t cdw11 = command->cdw11;
1710 	uint16_t iv;
1711 	bool cd;
1712 
1713 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1714 
1715 	iv = cdw11 & 0xffff;
1716 	cd = cdw11 & (1 << 16);
1717 
1718 	if (iv > (sc->max_queues + 1)) {
1719 		return;
1720 	}
1721 
1722 	/* Interrupt Coalescing cannot be enabled for the Admin CQ, so CD must be set for IV 0 */
1723 	if ((iv == 0) && !cd)
1724 		return;
1725 
1726 	/* Requested Interrupt Vector must be used by a CQ */
1727 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1728 		if (sc->compl_queues[i].intr_vec == iv) {
1729 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1730 		}
1731 	}
1732 }
1733 
1734 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1735 static void
1736 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1737     struct nvme_feature_obj *feat __unused,
1738     struct nvme_command *command,
1739     struct nvme_completion *compl)
1740 {
1741 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1742 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1743 }
1744 
1745 #define NVME_TEMP_THRESH_OVER	0
1746 #define NVME_TEMP_THRESH_UNDER	1
1747 static void
1748 nvme_feature_temperature(struct pci_nvme_softc *sc,
1749     struct nvme_feature_obj *feat __unused,
1750     struct nvme_command *command,
1751     struct nvme_completion *compl)
1752 {
1753 	uint16_t	tmpth;	/* Temperature Threshold */
1754 	uint8_t		tmpsel; /* Threshold Temperature Select */
1755 	uint8_t		thsel;  /* Threshold Type Select */
1756 	bool		set_crit = false;
1757 
1758 	tmpth  = command->cdw11 & 0xffff;
1759 	tmpsel = (command->cdw11 >> 16) & 0xf;
1760 	thsel  = (command->cdw11 >> 20) & 0x3;
1761 
1762 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1763 
1764 	/* Check for unsupported values */
1765 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1766 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1767 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1768 		return;
1769 	}
1770 
1771 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1772 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1773 		set_crit = true;
1774 
1775 	pthread_mutex_lock(&sc->mtx);
1776 	if (set_crit)
1777 		sc->health_log.critical_warning |=
1778 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1779 	else
1780 		sc->health_log.critical_warning &=
1781 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1782 	pthread_mutex_unlock(&sc->mtx);
1783 
1784 	if (set_crit)
1785 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1786 		    sc->health_log.critical_warning);
1787 
1788 
1789 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1790 }
1791 
1792 static void
1793 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1794     struct nvme_feature_obj *feat __unused,
1795     struct nvme_command *command,
1796     struct nvme_completion *compl)
1797 {
1798 	uint16_t nqr;	/* Number of Queues Requested */
1799 
1800 	if (sc->num_q_is_set) {
1801 		WPRINTF("%s: Number of Queues already set", __func__);
1802 		pci_nvme_status_genc(&compl->status,
1803 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1804 		return;
1805 	}
1806 
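	/*
	 * cdw11 encodes the zero-based Number of IO Submission Queues
	 * Requested (NSQR) in bits 15:0 and the zero-based Number of IO
	 * Completion Queues Requested (NCQR) in bits 31:16, e.g.
	 * cdw11 == 0x00030003 requests four queues of each type.
	 */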
1807 	nqr = command->cdw11 & 0xFFFF;
1808 	if (nqr == 0xffff) {
1809 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1810 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1811 		return;
1812 	}
1813 
1814 	sc->num_squeues = ONE_BASED(nqr);
1815 	if (sc->num_squeues > sc->max_queues) {
1816 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1817 					sc->max_queues);
1818 		sc->num_squeues = sc->max_queues;
1819 	}
1820 
1821 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1822 	if (nqr == 0xffff) {
1823 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1824 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1825 		return;
1826 	}
1827 
1828 	sc->num_cqueues = ONE_BASED(nqr);
1829 	if (sc->num_cqueues > sc->max_queues) {
1830 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1831 					sc->max_queues);
1832 		sc->num_cqueues = sc->max_queues;
1833 	}
1834 
1835 	/* Patch the command value which will be saved on callback's return */
1836 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1837 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1838 
1839 	sc->num_q_is_set = true;
1840 }
1841 
1842 static int
1843 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1844 	struct nvme_completion *compl)
1845 {
1846 	struct nvme_feature_obj *feat;
1847 	uint32_t nsid = command->nsid;
1848 	uint8_t fid = command->cdw10 & 0xFF;
1849 
1850 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1851 
1852 	if (fid >= NVME_FID_MAX) {
1853 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1854 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1855 		return (1);
1856 	}
1857 	feat = &sc->feat[fid];
1858 
1859 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1860 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1861 		return (1);
1862 	}
1863 
1864 	if (!feat->namespace_specific &&
1865 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1866 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1867 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1868 		return (1);
1869 	}
1870 
1871 	compl->cdw0 = 0;
1872 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1873 
1874 	if (feat->set)
1875 		feat->set(sc, feat, command, compl);
1876 
1877 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1878 	if (compl->status == NVME_SC_SUCCESS) {
1879 		feat->cdw11 = command->cdw11;
1880 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1881 		    (command->cdw11 != 0))
1882 			pci_nvme_aen_notify(sc);
1883 	}
1884 
1885 	return (0);
1886 }
1887 
1888 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1889 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1890 
1891 static int
1892 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1893 	struct nvme_completion* compl)
1894 {
1895 	struct nvme_feature_obj *feat;
1896 	uint8_t fid = command->cdw10 & 0xFF;
1897 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
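	/*
	 * SEL (cdw10 bits 10:8) chooses which value to return: 0h current,
	 * 1h default, 2h saved, 3h supported capabilities.
	 */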
1898 
1899 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1900 
1901 	if (fid >= NVME_FID_MAX) {
1902 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1903 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1904 		return (1);
1905 	}
1906 
1907 	compl->cdw0 = 0;
1908 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1909 
1910 	feat = &sc->feat[fid];
1911 	if (feat->get) {
1912 		feat->get(sc, feat, command, compl);
1913 	}
1914 
1915 	if (compl->status == NVME_SC_SUCCESS) {
1916 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1917 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1918 		else
1919 			compl->cdw0 = feat->cdw11;
1920 	}
1921 
1922 	return (0);
1923 }
1924 
1925 static int
1926 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1927 	struct nvme_completion* compl)
1928 {
1929 	uint8_t	ses, lbaf, pi;
1930 
1931 	/* Only supports Secure Erase Setting - User Data Erase */
1932 	ses = (command->cdw10 >> 9) & 0x7;
1933 	if (ses > 0x1) {
1934 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1935 		return (1);
1936 	}
1937 
1938 	/* Only supports a single LBA Format */
1939 	lbaf = command->cdw10 & 0xf;
1940 	if (lbaf != 0) {
1941 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1942 		    NVME_SC_INVALID_FORMAT);
1943 		return (1);
1944 	}
1945 
1946 	/* Doesn't support Protection Information */
1947 	pi = (command->cdw10 >> 5) & 0x7;
1948 	if (pi != 0) {
1949 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1950 		return (1);
1951 	}
1952 
1953 	if (sc->nvstore.type == NVME_STOR_RAM) {
1954 		if (sc->nvstore.ctx)
1955 			free(sc->nvstore.ctx);
1956 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1957 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1958 	} else {
1959 		struct pci_nvme_ioreq *req;
1960 		int err;
1961 
1962 		req = pci_nvme_get_ioreq(sc);
1963 		if (req == NULL) {
1964 			pci_nvme_status_genc(&compl->status,
1965 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1966 			WPRINTF("%s: unable to allocate IO req", __func__);
1967 			return (1);
1968 		}
1969 		req->nvme_sq = &sc->submit_queues[0];
1970 		req->sqid = 0;
1971 		req->opc = command->opc;
1972 		req->cid = command->cid;
1973 		req->nsid = command->nsid;
1974 
1975 		req->io_req.br_offset = 0;
1976 		req->io_req.br_resid = sc->nvstore.size;
1977 		req->io_req.br_callback = pci_nvme_io_done;
1978 
1979 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1980 		if (err) {
1981 			pci_nvme_status_genc(&compl->status,
1982 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1983 			pci_nvme_release_ioreq(sc, req);
1984 		} else
1985 			compl->status = NVME_NO_STATUS;
1986 	}
1987 
1988 	return (1);
1989 }
1990 
1991 static int
1992 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
1993     struct nvme_completion *compl)
1994 {
1995 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1996 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1997 
1998 	/* TODO: search for the command ID and abort it */
1999 
2000 	compl->cdw0 = 1;
2001 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2002 	return (1);
2003 }
2004 
2005 static int
2006 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2007 	struct nvme_command* command, struct nvme_completion* compl)
2008 {
2009 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2010 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2011 
2012 	/* Don't exceed the Async Event Request Limit (AERL). */
2013 	if (pci_nvme_aer_limit_reached(sc)) {
2014 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2015 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2016 		return (1);
2017 	}
2018 
2019 	if (pci_nvme_aer_add(sc, command->cid)) {
2020 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2021 				NVME_SC_INTERNAL_DEVICE_ERROR);
2022 		return (1);
2023 	}
2024 
2025 	/*
2026 	 * Events are raised as they occur, as configured by Set Features.
2027 	 * Because they are asynchronous, do not post a completion now; one is
2028 	 * posted only when an event matching this request actually occurs.
2029 	 */
2030 	compl->status = NVME_NO_STATUS;
2031 	pci_nvme_aen_notify(sc);
2032 
2033 	return (0);
2034 }
2035 
2036 static void
2037 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2038 {
2039 	struct nvme_completion compl;
2040 	struct nvme_command *cmd;
2041 	struct nvme_submission_queue *sq;
2042 	struct nvme_completion_queue *cq;
2043 	uint16_t sqhead;
2044 
2045 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2046 
2047 	sq = &sc->submit_queues[0];
2048 	cq = &sc->compl_queues[0];
2049 
2050 	pthread_mutex_lock(&sq->mtx);
2051 
2052 	sqhead = sq->head;
2053 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2054 
2055 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2056 		cmd = &(sq->qbase)[sqhead];
2057 		compl.cdw0 = 0;
2058 		compl.status = 0;
2059 
2060 		switch (cmd->opc) {
2061 		case NVME_OPC_DELETE_IO_SQ:
2062 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2063 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2064 			break;
2065 		case NVME_OPC_CREATE_IO_SQ:
2066 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2067 			nvme_opc_create_io_sq(sc, cmd, &compl);
2068 			break;
2069 		case NVME_OPC_DELETE_IO_CQ:
2070 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2071 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2072 			break;
2073 		case NVME_OPC_CREATE_IO_CQ:
2074 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2075 			nvme_opc_create_io_cq(sc, cmd, &compl);
2076 			break;
2077 		case NVME_OPC_GET_LOG_PAGE:
2078 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2079 			nvme_opc_get_log_page(sc, cmd, &compl);
2080 			break;
2081 		case NVME_OPC_IDENTIFY:
2082 			DPRINTF("%s command IDENTIFY", __func__);
2083 			nvme_opc_identify(sc, cmd, &compl);
2084 			break;
2085 		case NVME_OPC_ABORT:
2086 			DPRINTF("%s command ABORT", __func__);
2087 			nvme_opc_abort(sc, cmd, &compl);
2088 			break;
2089 		case NVME_OPC_SET_FEATURES:
2090 			DPRINTF("%s command SET_FEATURES", __func__);
2091 			nvme_opc_set_features(sc, cmd, &compl);
2092 			break;
2093 		case NVME_OPC_GET_FEATURES:
2094 			DPRINTF("%s command GET_FEATURES", __func__);
2095 			nvme_opc_get_features(sc, cmd, &compl);
2096 			break;
2097 		case NVME_OPC_FIRMWARE_ACTIVATE:
2098 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2099 			pci_nvme_status_tc(&compl.status,
2100 			    NVME_SCT_COMMAND_SPECIFIC,
2101 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2102 			break;
2103 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2104 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2105 			nvme_opc_async_event_req(sc, cmd, &compl);
2106 			break;
2107 		case NVME_OPC_FORMAT_NVM:
2108 			DPRINTF("%s command FORMAT_NVM", __func__);
2109 			if ((sc->ctrldata.oacs &
2110 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2111 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2112 				break;
2113 			}
2114 			nvme_opc_format_nvm(sc, cmd, &compl);
2115 			break;
2116 		case NVME_OPC_SECURITY_SEND:
2117 		case NVME_OPC_SECURITY_RECEIVE:
2118 		case NVME_OPC_SANITIZE:
2119 		case NVME_OPC_GET_LBA_STATUS:
2120 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2121 			    cmd->opc);
2122 			/* Valid but unsupported opcodes */
2123 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2124 			break;
2125 		default:
2126 			DPRINTF("%s command OPC=%#X (not implemented)",
2127 			    __func__,
2128 			    cmd->opc);
2129 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2130 		}
2131 		sqhead = (sqhead + 1) % sq->size;
2132 
2133 		if (NVME_COMPLETION_VALID(compl)) {
2134 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2135 			    compl.cdw0,
2136 			    cmd->cid,
2137 			    0,		/* SQID */
2138 			    compl.status);
2139 		}
2140 	}
2141 
2142 	DPRINTF("setting sqhead %u", sqhead);
2143 	sq->head = sqhead;
2144 
2145 	if (cq->head != cq->tail)
2146 		pci_generate_msix(sc->nsc_pi, 0);
2147 
2148 	pthread_mutex_unlock(&sq->mtx);
2149 }
2150 
2151 /*
2152  * Update the Write and Read statistics reported in SMART data
2153  *
2154  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2155  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2156  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
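 * For example, a single 512 byte write adds 1 to the remainder, carrying the
 * 999 seed to 1,000 and crediting one data unit; a 1 MiB write adds 2048.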
2157  */
2158 static void
2159 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2160     size_t bytes, uint16_t status)
2161 {
2162 
2163 	pthread_mutex_lock(&sc->mtx);
2164 	switch (opc) {
2165 	case NVME_OPC_WRITE:
2166 		sc->write_commands++;
2167 		if (status != NVME_SC_SUCCESS)
2168 			break;
2169 		sc->write_dunits_remainder += (bytes / 512);
2170 		while (sc->write_dunits_remainder >= 1000) {
2171 			sc->write_data_units++;
2172 			sc->write_dunits_remainder -= 1000;
2173 		}
2174 		break;
2175 	case NVME_OPC_READ:
2176 		sc->read_commands++;
2177 		if (status != NVME_SC_SUCCESS)
2178 			break;
2179 		sc->read_dunits_remainder += (bytes / 512);
2180 		while (sc->read_dunits_remainder >= 1000) {
2181 			sc->read_data_units++;
2182 			sc->read_dunits_remainder -= 1000;
2183 		}
2184 		break;
2185 	default:
2186 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2187 		break;
2188 	}
2189 	pthread_mutex_unlock(&sc->mtx);
2190 }
2191 
2192 /*
2193  * Check if the combination of Starting LBA (slba) and number of blocks
2194  * exceeds the range of the underlying storage.
2195  *
2196  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2197  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2198  * overflow.
2199  */
2200 static bool
2201 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2202     uint32_t nblocks)
2203 {
2204 	size_t	offset, bytes;
2205 
2206 	/* Overflow check of multiplying Starting LBA by the sector size */
2207 	if (slba >> (64 - nvstore->sectsz_bits))
2208 		return (true);
2209 
2210 	offset = slba << nvstore->sectsz_bits;
2211 	bytes = nblocks << nvstore->sectsz_bits;
2212 
2213 	/* Overflow check of Number of Logical Blocks */
2214 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2215 		return (true);
2216 
2217 	return (false);
2218 }
2219 
2220 static int
2221 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2222     struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2223 {
2224 	int iovidx;
2225 	bool range_is_contiguous;
2226 
2227 	if (req == NULL)
2228 		return (-1);
2229 
2230 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2231 		return (-1);
2232 	}
2233 
2234 	/*
2235 	 * Minimize the number of IOVs by concatenating contiguous address
2236 	 * ranges. If the IOV count is zero, there is no previous range to
2237 	 * concatenate.
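	 * For example, two 4 KiB PRP entries covering guest-contiguous pages
	 * (e.g. 0x10000 and 0x11000) are folded into a single 8 KiB iov entry.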
2238 	 */
2239 	if (req->io_req.br_iovcnt == 0)
2240 		range_is_contiguous = false;
2241 	else
2242 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2243 
2244 	if (range_is_contiguous) {
2245 		iovidx = req->io_req.br_iovcnt - 1;
2246 
2247 		req->io_req.br_iov[iovidx].iov_base =
2248 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2249 				     req->prev_gpaddr, size);
2250 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2251 			return (-1);
2252 
2253 		req->prev_size += size;
2254 		req->io_req.br_resid += size;
2255 
2256 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2257 	} else {
2258 		iovidx = req->io_req.br_iovcnt;
2259 		if (iovidx == 0) {
2260 			req->io_req.br_offset = offset;
2261 			req->io_req.br_resid = 0;
2262 			req->io_req.br_param = req;
2263 		}
2264 
2265 		req->io_req.br_iov[iovidx].iov_base =
2266 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2267 				     gpaddr, size);
2268 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2269 			return (-1);
2270 
2271 		req->io_req.br_iov[iovidx].iov_len = size;
2272 
2273 		req->prev_gpaddr = gpaddr;
2274 		req->prev_size = size;
2275 		req->io_req.br_resid += size;
2276 
2277 		req->io_req.br_iovcnt++;
2278 	}
2279 
2280 	return (0);
2281 }
2282 
2283 static void
2284 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2285     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2286 {
2287 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2288 
2289 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2290 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2291 		 NVME_STATUS_GET_SC(status));
2292 
2293 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2294 
2295 	if (cq->head != cq->tail) {
2296 		if (cq->intr_en & NVME_CQ_INTEN) {
2297 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2298 		} else {
2299 			DPRINTF("%s: CQ%u interrupt disabled",
2300 						__func__, sq->cqid);
2301 		}
2302 	}
2303 }
2304 
2305 static void
2306 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2307 {
2308 	req->sc = NULL;
2309 	req->nvme_sq = NULL;
2310 	req->sqid = 0;
2311 
2312 	pthread_mutex_lock(&sc->mtx);
2313 
2314 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2315 	sc->pending_ios--;
2316 
2317 	/* When no more IO is pending, set RDY if the device has been reset/enabled */
2318 	if (sc->pending_ios == 0 &&
2319 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2320 		sc->regs.csts |= NVME_CSTS_RDY;
2321 
2322 	pthread_mutex_unlock(&sc->mtx);
2323 
2324 	sem_post(&sc->iosemlock);
2325 }
2326 
2327 static struct pci_nvme_ioreq *
2328 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2329 {
2330 	struct pci_nvme_ioreq *req = NULL;
2331 
2332 	sem_wait(&sc->iosemlock);
2333 	pthread_mutex_lock(&sc->mtx);
2334 
2335 	req = STAILQ_FIRST(&sc->ioreqs_free);
2336 	assert(req != NULL);
2337 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2338 
2339 	req->sc = sc;
2340 
2341 	sc->pending_ios++;
2342 
2343 	pthread_mutex_unlock(&sc->mtx);
2344 
2345 	req->io_req.br_iovcnt = 0;
2346 	req->io_req.br_offset = 0;
2347 	req->io_req.br_resid = 0;
2348 	req->io_req.br_param = req;
2349 	req->prev_gpaddr = 0;
2350 	req->prev_size = 0;
2351 
2352 	return req;
2353 }
2354 
2355 static void
2356 pci_nvme_io_done(struct blockif_req *br, int err)
2357 {
2358 	struct pci_nvme_ioreq *req = br->br_param;
2359 	struct nvme_submission_queue *sq = req->nvme_sq;
2360 	uint16_t code, status;
2361 
2362 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2363 
2364 	/* TODO return correct error */
2365 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2366 	pci_nvme_status_genc(&status, code);
2367 
2368 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2369 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2370 	    req->bytes, status);
2371 	pci_nvme_release_ioreq(req->sc, req);
2372 }
2373 
2374 /*
2375  * Implements the Flush command. The specification states:
2376  *    If a volatile write cache is not present, Flush commands complete
2377  *    successfully and have no effect
2378  * in the description of the Volatile Write Cache (VWC) field of the Identify
2379  * Controller data. Therefore, set status to Success if the command is
2380  * not supported (i.e. RAM or as indicated by the blockif).
2381  */
2382 static bool
2383 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2384     struct nvme_command *cmd __unused,
2385     struct pci_nvme_blockstore *nvstore,
2386     struct pci_nvme_ioreq *req,
2387     uint16_t *status)
2388 {
2389 	bool pending = false;
2390 
2391 	if (nvstore->type == NVME_STOR_RAM) {
2392 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2393 	} else {
2394 		int err;
2395 
2396 		req->io_req.br_callback = pci_nvme_io_done;
2397 
2398 		err = blockif_flush(nvstore->ctx, &req->io_req);
2399 		switch (err) {
2400 		case 0:
2401 			pending = true;
2402 			break;
2403 		case EOPNOTSUPP:
2404 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2405 			break;
2406 		default:
2407 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2408 		}
2409 	}
2410 
2411 	return (pending);
2412 }
2413 
2414 static uint16_t
2415 nvme_write_read_ram(struct pci_nvme_softc *sc,
2416     struct pci_nvme_blockstore *nvstore,
2417     uint64_t prp1, uint64_t prp2,
2418     size_t offset, uint64_t bytes,
2419     bool is_write)
2420 {
2421 	uint8_t *buf = nvstore->ctx;
2422 	enum nvme_copy_dir dir;
2423 	uint16_t status;
2424 
2425 	if (is_write)
2426 		dir = NVME_COPY_TO_PRP;
2427 	else
2428 		dir = NVME_COPY_FROM_PRP;
2429 
2430 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2431 	    buf + offset, bytes, dir))
2432 		pci_nvme_status_genc(&status,
2433 		    NVME_SC_DATA_TRANSFER_ERROR);
2434 	else
2435 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2436 
2437 	return (status);
2438 }
2439 
2440 static uint16_t
2441 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2442     struct pci_nvme_blockstore *nvstore,
2443     struct pci_nvme_ioreq *req,
2444     uint64_t prp1, uint64_t prp2,
2445     size_t offset, uint64_t bytes,
2446     bool is_write)
2447 {
2448 	uint64_t size;
2449 	int err;
2450 	uint16_t status = NVME_NO_STATUS;
2451 
2452 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2453 	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2454 		err = -1;
2455 		goto out;
2456 	}
2457 
2458 	offset += size;
2459 	bytes  -= size;
2460 
2461 	if (bytes == 0) {
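	/*
	 * PRP1 always maps the first, possibly page-unaligned, chunk. If at
	 * most one more page is needed, PRP2 maps that page directly;
	 * otherwise PRP2 points to a PRP list which is walked below.
	 */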
2462 		;
2463 	} else if (bytes <= PAGE_SIZE) {
2464 		size = bytes;
2465 		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2466 			err = -1;
2467 			goto out;
2468 		}
2469 	} else {
2470 		void *vmctx = sc->nsc_pi->pi_vmctx;
2471 		uint64_t *prp_list = &prp2;
2472 		uint64_t *last = prp_list;
2473 
2474 		/* PRP2 is pointer to a physical region page list */
2475 		while (bytes) {
2476 			/* Last entry in list points to the next list */
2477 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2478 				uint64_t prp = *prp_list;
2479 
2480 				prp_list = paddr_guest2host(vmctx, prp,
2481 				    PAGE_SIZE - (prp % PAGE_SIZE));
2482 				if (prp_list == NULL) {
2483 					err = -1;
2484 					goto out;
2485 				}
2486 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2487 			}
2488 
2489 			size = MIN(bytes, PAGE_SIZE);
2490 
2491 			if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2492 			    offset)) {
2493 				err = -1;
2494 				goto out;
2495 			}
2496 
2497 			offset += size;
2498 			bytes  -= size;
2499 
2500 			prp_list++;
2501 		}
2502 	}
2503 	req->io_req.br_callback = pci_nvme_io_done;
2504 	if (is_write)
2505 		err = blockif_write(nvstore->ctx, &req->io_req);
2506 	else
2507 		err = blockif_read(nvstore->ctx, &req->io_req);
2508 out:
2509 	if (err)
2510 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2511 
2512 	return (status);
2513 }
2514 
2515 static bool
2516 nvme_opc_write_read(struct pci_nvme_softc *sc,
2517     struct nvme_command *cmd,
2518     struct pci_nvme_blockstore *nvstore,
2519     struct pci_nvme_ioreq *req,
2520     uint16_t *status)
2521 {
2522 	uint64_t lba, nblocks, bytes;
2523 	size_t offset;
2524 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2525 	bool pending = false;
2526 
2527 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2528 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2529 	bytes = nblocks << nvstore->sectsz_bits;
2530 	if (bytes > NVME_MAX_DATA_SIZE) {
2531 		WPRINTF("%s command would exceed MDTS", __func__);
2532 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2533 		goto out;
2534 	}
2535 
2536 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2537 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2538 		    __func__, lba, nblocks);
2539 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2540 		goto out;
2541 	}
2542 
2543 	offset = lba << nvstore->sectsz_bits;
2544 
2545 	req->bytes = bytes;
2546 	req->io_req.br_offset = lba;
2547 
2548 	/* PRP bits 1:0 must be zero */
2549 	cmd->prp1 &= ~0x3UL;
2550 	cmd->prp2 &= ~0x3UL;
2551 
2552 	if (nvstore->type == NVME_STOR_RAM) {
2553 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2554 		    cmd->prp2, offset, bytes, is_write);
2555 	} else {
2556 		*status = nvme_write_read_blockif(sc, nvstore, req,
2557 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2558 
2559 		if (*status == NVME_NO_STATUS)
2560 			pending = true;
2561 	}
2562 out:
2563 	if (!pending)
2564 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2565 
2566 	return (pending);
2567 }
2568 
2569 static void
2570 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2571 {
2572 	struct pci_nvme_ioreq *req = br->br_param;
2573 	struct pci_nvme_softc *sc = req->sc;
2574 	bool done = true;
2575 	uint16_t status;
2576 
2577 	if (err) {
2578 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2579 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2580 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2581 	} else {
2582 		struct iovec *iov = req->io_req.br_iov;
2583 
2584 		req->prev_gpaddr++;
2585 		iov += req->prev_gpaddr;
2586 
2587 		/* The iov_* values already include the sector size */
2588 		req->io_req.br_offset = (off_t)iov->iov_base;
2589 		req->io_req.br_resid = iov->iov_len;
2590 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2591 			pci_nvme_status_genc(&status,
2592 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2593 		} else
2594 			done = false;
2595 	}
2596 
2597 	if (done) {
2598 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2599 		    status);
2600 		pci_nvme_release_ioreq(sc, req);
2601 	}
2602 }
2603 
2604 static bool
2605 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2606     struct nvme_command *cmd,
2607     struct pci_nvme_blockstore *nvstore,
2608     struct pci_nvme_ioreq *req,
2609     uint16_t *status)
2610 {
2611 	struct nvme_dsm_range *range = NULL;
2612 	uint32_t nr, r, non_zero, dr;
2613 	int err;
2614 	bool pending = false;
2615 
2616 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2617 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2618 		goto out;
2619 	}
2620 
2621 	nr = cmd->cdw10 & 0xff;
2622 
2623 	/* copy locally because a range entry could straddle PRPs */
2624 	range = calloc(1, NVME_MAX_DSM_TRIM);
2625 	if (range == NULL) {
2626 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2627 		goto out;
2628 	}
2629 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2630 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2631 
2632 	/* Check for invalid ranges and the number of non-zero lengths */
2633 	non_zero = 0;
2634 	for (r = 0; r <= nr; r++) {
2635 		if (pci_nvme_out_of_range(nvstore,
2636 		    range[r].starting_lba, range[r].length)) {
2637 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2638 			goto out;
2639 		}
2640 		if (range[r].length != 0)
2641 			non_zero++;
2642 	}
2643 
2644 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2645 		size_t offset, bytes;
2646 		int sectsz_bits = sc->nvstore.sectsz_bits;
2647 
2648 		/*
2649 		 * DSM calls are advisory only, and compliant controllers
2650 		 * may choose to take no actions (i.e. return Success).
2651 		 */
2652 		if (!nvstore->deallocate) {
2653 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2654 			goto out;
2655 		}
2656 
2657 		/* If all ranges have a zero length, return Success */
2658 		if (non_zero == 0) {
2659 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2660 			goto out;
2661 		}
2662 
2663 		if (req == NULL) {
2664 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2665 			goto out;
2666 		}
2667 
2668 		offset = range[0].starting_lba << sectsz_bits;
2669 		bytes = range[0].length << sectsz_bits;
2670 
2671 		/*
2672 		 * If the request is for more than a single range, store
2673 		 * the ranges in the br_iov. Optimize for the common case
2674 		 * of a single range.
2675 		 *
2676 		 * Note that NVMe Number of Ranges is a zero based value
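		 * (e.g. nr == 2 describes three ranges). Zero-length ranges are
		 * skipped, so dr counts only the ranges actually placed in br_iov.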
2677 		 */
2678 		req->io_req.br_iovcnt = 0;
2679 		req->io_req.br_offset = offset;
2680 		req->io_req.br_resid = bytes;
2681 
2682 		if (nr == 0) {
2683 			req->io_req.br_callback = pci_nvme_io_done;
2684 		} else {
2685 			struct iovec *iov = req->io_req.br_iov;
2686 
2687 			for (r = 0, dr = 0; r <= nr; r++) {
2688 				offset = range[r].starting_lba << sectsz_bits;
2689 				bytes = range[r].length << sectsz_bits;
2690 				if (bytes == 0)
2691 					continue;
2692 
2693 				if ((nvstore->size - offset) < bytes) {
2694 					pci_nvme_status_genc(status,
2695 					    NVME_SC_LBA_OUT_OF_RANGE);
2696 					goto out;
2697 				}
2698 				iov[dr].iov_base = (void *)offset;
2699 				iov[dr].iov_len = bytes;
2700 				dr++;
2701 			}
2702 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2703 
2704 			/*
2705 			 * Use prev_gpaddr to track the current entry and
2706 			 * prev_size to track the number of entries
2707 			 */
2708 			req->prev_gpaddr = 0;
2709 			req->prev_size = dr;
2710 		}
2711 
2712 		err = blockif_delete(nvstore->ctx, &req->io_req);
2713 		if (err)
2714 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2715 		else
2716 			pending = true;
2717 	}
2718 out:
2719 	free(range);
2720 	return (pending);
2721 }
2722 
2723 static void
2724 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2725 {
2726 	struct nvme_submission_queue *sq;
2727 	uint16_t status;
2728 	uint16_t sqhead;
2729 
2730 	/* handle all submissions up to sq->tail index */
2731 	sq = &sc->submit_queues[idx];
2732 
2733 	pthread_mutex_lock(&sq->mtx);
2734 
2735 	sqhead = sq->head;
2736 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2737 	         idx, sqhead, sq->tail, sq->qbase);
2738 
2739 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2740 		struct nvme_command *cmd;
2741 		struct pci_nvme_ioreq *req;
2742 		uint32_t nsid;
2743 		bool pending;
2744 
2745 		pending = false;
2746 		req = NULL;
2747 		status = 0;
2748 
2749 		cmd = &sq->qbase[sqhead];
2750 		sqhead = (sqhead + 1) % sq->size;
2751 
2752 		nsid = le32toh(cmd->nsid);
2753 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2754 			pci_nvme_status_genc(&status,
2755 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2756 			status |=
2757 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2758 			goto complete;
2759  		}
2760 
2761 		req = pci_nvme_get_ioreq(sc);
2762 		if (req == NULL) {
2763 			pci_nvme_status_genc(&status,
2764 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2765 			WPRINTF("%s: unable to allocate IO req", __func__);
2766 			goto complete;
2767 		}
2768 		req->nvme_sq = sq;
2769 		req->sqid = idx;
2770 		req->opc = cmd->opc;
2771 		req->cid = cmd->cid;
2772 		req->nsid = cmd->nsid;
2773 
2774 		switch (cmd->opc) {
2775 		case NVME_OPC_FLUSH:
2776 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2777 			    req, &status);
2778  			break;
2779 		case NVME_OPC_WRITE:
2780 		case NVME_OPC_READ:
2781 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2782 			    req, &status);
2783 			break;
2784 		case NVME_OPC_WRITE_ZEROES:
2785 			/* TODO: write zeroes
2786 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2787 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2788 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2789 			break;
2790 		case NVME_OPC_DATASET_MANAGEMENT:
2791  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2792 			    req, &status);
2793 			break;
2794  		default:
2795  			WPRINTF("%s unhandled io command 0x%x",
2796 			    __func__, cmd->opc);
2797 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2798 		}
2799 complete:
2800 		if (!pending) {
2801 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2802 			if (req != NULL)
2803 				pci_nvme_release_ioreq(sc, req);
2804 		}
2805 	}
2806 
2807 	sq->head = sqhead;
2808 
2809 	pthread_mutex_unlock(&sq->mtx);
2810 }
2811 
2812 static void
2813 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc,
2814 	uint64_t idx, int is_sq, uint64_t value)
2815 {
2816 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2817 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2818 
2819 	if (is_sq) {
2820 		if (idx > sc->num_squeues) {
2821 			WPRINTF("%s queue index %lu overflow from "
2822 			         "guest (max %u)",
2823 			         __func__, idx, sc->num_squeues);
2824 			return;
2825 		}
2826 
2827 		atomic_store_short(&sc->submit_queues[idx].tail,
2828 		                   (uint16_t)value);
2829 
2830 		if (idx == 0) {
2831 			pci_nvme_handle_admin_cmd(sc, value);
2832 		} else {
2833 			/* submission queue; handle new entries in SQ */
2834 			if (idx > sc->num_squeues) {
2835 				WPRINTF("%s SQ index %lu overflow from "
2836 				         "guest (max %u)",
2837 				         __func__, idx, sc->num_squeues);
2838 				return;
2839 			}
2840 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2841 		}
2842 	} else {
2843 		if (idx > sc->num_cqueues) {
2844 			WPRINTF("%s queue index %lu overflow from "
2845 			         "guest (max %u)",
2846 			         __func__, idx, sc->num_cqueues);
2847 			return;
2848 		}
2849 
2850 		atomic_store_short(&sc->compl_queues[idx].head,
2851 				(uint16_t)value);
2852 	}
2853 }
2854 
2855 static void
2856 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2857 {
2858 	const char *s = iswrite ? "WRITE" : "READ";
2859 
2860 	switch (offset) {
2861 	case NVME_CR_CAP_LOW:
2862 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2863 		break;
2864 	case NVME_CR_CAP_HI:
2865 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2866 		break;
2867 	case NVME_CR_VS:
2868 		DPRINTF("%s %s NVME_CR_VS", func, s);
2869 		break;
2870 	case NVME_CR_INTMS:
2871 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2872 		break;
2873 	case NVME_CR_INTMC:
2874 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2875 		break;
2876 	case NVME_CR_CC:
2877 		DPRINTF("%s %s NVME_CR_CC", func, s);
2878 		break;
2879 	case NVME_CR_CSTS:
2880 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2881 		break;
2882 	case NVME_CR_NSSR:
2883 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2884 		break;
2885 	case NVME_CR_AQA:
2886 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2887 		break;
2888 	case NVME_CR_ASQ_LOW:
2889 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2890 		break;
2891 	case NVME_CR_ASQ_HI:
2892 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2893 		break;
2894 	case NVME_CR_ACQ_LOW:
2895 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2896 		break;
2897 	case NVME_CR_ACQ_HI:
2898 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2899 		break;
2900 	default:
2901 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2902 	}
2903 
2904 }
2905 
2906 static void
2907 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2908 	uint64_t offset, int size, uint64_t value)
2909 {
2910 	uint32_t ccreg;
2911 
2912 	if (offset >= NVME_DOORBELL_OFFSET) {
2913 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2914 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2915 		int is_sq = (belloffset % 8) < 4;
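		/*
		 * Doorbells are laid out in 8 byte pairs per queue: the SQ y
		 * Tail doorbell at byte offset 8*y and the CQ y Head doorbell
		 * at 8*y + 4 (i.e. a doorbell stride, CAP.DSTRD, of 0).
		 */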
2916 
2917 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2918 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2919 			    offset);
2920 			return;
2921 		}
2922 
2923 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2924 			WPRINTF("guest attempted an overflow write offset "
2925 			         "0x%lx, val 0x%lx in %s",
2926 			         offset, value, __func__);
2927 			return;
2928 		}
2929 
2930 		if (is_sq) {
2931 			if (sc->submit_queues[idx].qbase == NULL)
2932 				return;
2933 		} else if (sc->compl_queues[idx].qbase == NULL)
2934 			return;
2935 
2936 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2937 		return;
2938 	}
2939 
2940 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2941 	        offset, size, value);
2942 
2943 	if (size != 4) {
2944 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2945 		         "val 0x%lx) to bar0 in %s",
2946 		         size, offset, value, __func__);
2947 		/* TODO: shutdown device */
2948 		return;
2949 	}
2950 
2951 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2952 
2953 	pthread_mutex_lock(&sc->mtx);
2954 
2955 	switch (offset) {
2956 	case NVME_CR_CAP_LOW:
2957 	case NVME_CR_CAP_HI:
2958 		/* readonly */
2959 		break;
2960 	case NVME_CR_VS:
2961 		/* readonly */
2962 		break;
2963 	case NVME_CR_INTMS:
2964 		/* MSI-X, so ignore */
2965 		break;
2966 	case NVME_CR_INTMC:
2967 		/* MSI-X, so ignore */
2968 		break;
2969 	case NVME_CR_CC:
2970 		ccreg = (uint32_t)value;
2971 
2972 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2973 		         "iocqes %u",
2974 		        __func__,
2975 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2976 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2977 			 NVME_CC_GET_IOCQES(ccreg));
2978 
2979 		if (NVME_CC_GET_SHN(ccreg)) {
2980 			/* perform shutdown - flush out data to backend */
2981 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2982 			    NVME_CSTS_REG_SHST_SHIFT);
2983 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2984 			    NVME_CSTS_REG_SHST_SHIFT;
2985 		}
2986 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2987 			if (NVME_CC_GET_EN(ccreg) == 0)
2988 				/* transition 1-> causes controller reset */
2989 				pci_nvme_reset_locked(sc);
2990 			else
2991 				pci_nvme_init_controller(ctx, sc);
2992 		}
2993 
2994 		/* Insert the iocqes, iosqes and en bits from the write */
2995 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2996 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2997 		if (NVME_CC_GET_EN(ccreg) == 0) {
2998 			/* Insert the ams, mps and css bit fields */
2999 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3000 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3001 			sc->regs.csts &= ~NVME_CSTS_RDY;
3002 		} else if ((sc->pending_ios == 0) &&
3003 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3004 			sc->regs.csts |= NVME_CSTS_RDY;
3005 		}
3006 		break;
3007 	case NVME_CR_CSTS:
3008 		break;
3009 	case NVME_CR_NSSR:
3010 		/* ignore writes; don't support subsystem reset */
3011 		break;
3012 	case NVME_CR_AQA:
3013 		sc->regs.aqa = (uint32_t)value;
3014 		break;
3015 	case NVME_CR_ASQ_LOW:
3016 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3017 		               (0xFFFFF000 & value);
3018 		break;
3019 	case NVME_CR_ASQ_HI:
3020 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3021 		               (value << 32);
3022 		break;
3023 	case NVME_CR_ACQ_LOW:
3024 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3025 		               (0xFFFFF000 & value);
3026 		break;
3027 	case NVME_CR_ACQ_HI:
3028 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3029 		               (value << 32);
3030 		break;
3031 	default:
3032 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3033 		         __func__, offset, value, size);
3034 	}
3035 	pthread_mutex_unlock(&sc->mtx);
3036 }
3037 
3038 static void
3039 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi,
3040     int baridx, uint64_t offset, int size, uint64_t value)
3041 {
3042 	struct pci_nvme_softc* sc = pi->pi_arg;
3043 
3044 	if (baridx == pci_msix_table_bar(pi) ||
3045 	    baridx == pci_msix_pba_bar(pi)) {
3046 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3047 		         " value 0x%lx", baridx, offset, size, value);
3048 
3049 		pci_emul_msix_twrite(pi, offset, size, value);
3050 		return;
3051 	}
3052 
3053 	switch (baridx) {
3054 	case 0:
3055 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3056 		break;
3057 
3058 	default:
3059 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3060 		         __func__, baridx, value);
3061 	}
3062 }
3063 
3064 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3065 	uint64_t offset, int size)
3066 {
3067 	uint64_t value;
3068 
3069 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3070 
3071 	if (offset < NVME_DOORBELL_OFFSET) {
3072 		void *p = &(sc->regs);
3073 		pthread_mutex_lock(&sc->mtx);
3074 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3075 		pthread_mutex_unlock(&sc->mtx);
3076 	} else {
3077 		value = 0;
3078 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3079 	}
3080 
3081 	switch (size) {
3082 	case 1:
3083 		value &= 0xFF;
3084 		break;
3085 	case 2:
3086 		value &= 0xFFFF;
3087 		break;
3088 	case 4:
3089 		value &= 0xFFFFFFFF;
3090 		break;
3091 	}
3092 
3093 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3094 	         offset, size, (uint32_t)value);
3095 
3096 	return (value);
3097 }
3098 
3099 
3100 
3101 static uint64_t
3102 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused,
3103     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3104 {
3105 	struct pci_nvme_softc* sc = pi->pi_arg;
3106 
3107 	if (baridx == pci_msix_table_bar(pi) ||
3108 	    baridx == pci_msix_pba_bar(pi)) {
3109 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3110 		        baridx, offset, size);
3111 
3112 		return pci_emul_msix_tread(pi, offset, size);
3113 	}
3114 
3115 	switch (baridx) {
3116 	case 0:
3117 		return pci_nvme_read_bar_0(sc, offset, size);
3118 
3119 	default:
3120 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3121 	}
3122 
3123 	return (0);
3124 }
3125 
3126 static int
3127 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3128 {
3129 	char bident[sizeof("XX:X:X")];
3130 	const char *value;
3131 	uint32_t sectsz;
3132 
3133 	sc->max_queues = NVME_QUEUES;
3134 	sc->max_qentries = NVME_MAX_QENTRIES;
3135 	sc->ioslots = NVME_IOSLOTS;
3136 	sc->num_squeues = sc->max_queues;
3137 	sc->num_cqueues = sc->max_queues;
3138 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3139 	sectsz = 0;
3140 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3141 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3142 
3143 	value = get_config_value_node(nvl, "maxq");
3144 	if (value != NULL)
3145 		sc->max_queues = atoi(value);
3146 	value = get_config_value_node(nvl, "qsz");
3147 	if (value != NULL) {
3148 		sc->max_qentries = atoi(value);
3149 		if (sc->max_qentries <= 0) {
3150 			EPRINTLN("nvme: Invalid qsz option %d",
3151 			    sc->max_qentries);
3152 			return (-1);
3153 		}
3154 	}
3155 	value = get_config_value_node(nvl, "ioslots");
3156 	if (value != NULL) {
3157 		sc->ioslots = atoi(value);
3158 		if (sc->ioslots <= 0) {
3159 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3160 			return (-1);
3161 		}
3162 	}
3163 	value = get_config_value_node(nvl, "sectsz");
3164 	if (value != NULL)
3165 		sectsz = atoi(value);
3166 	value = get_config_value_node(nvl, "ser");
3167 	if (value != NULL) {
3168 		/*
3169 		 * This field indicates the Product Serial Number in
3170 		 * 7-bit ASCII; unused bytes should be space characters.
3171 		 * Ref: NVMe v1.3c.
3172 		 */
3173 		cpywithpad((char *)sc->ctrldata.sn,
3174 		    sizeof(sc->ctrldata.sn), value, ' ');
3175 	}
3176 	value = get_config_value_node(nvl, "eui64");
3177 	if (value != NULL)
3178 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3179 	value = get_config_value_node(nvl, "dsm");
3180 	if (value != NULL) {
3181 		if (strcmp(value, "auto") == 0)
3182 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3183 		else if (strcmp(value, "enable") == 0)
3184 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3185 		else if (strcmp(value, "disable") == 0)
3186 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3187 	}
3188 
3189 	value = get_config_value_node(nvl, "ram");
3190 	if (value != NULL) {
3191 		uint64_t sz = strtoull(value, NULL, 10);
3192 
3193 		sc->nvstore.type = NVME_STOR_RAM;
3194 		sc->nvstore.size = sz * 1024 * 1024;
3195 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3196 		sc->nvstore.sectsz = 4096;
3197 		sc->nvstore.sectsz_bits = 12;
3198 		if (sc->nvstore.ctx == NULL) {
3199 			EPRINTLN("nvme: Unable to allocate RAM");
3200 			return (-1);
3201 		}
3202 	} else {
3203 		snprintf(bident, sizeof(bident), "%d:%d",
3204 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3205 		sc->nvstore.ctx = blockif_open(nvl, bident);
3206 		if (sc->nvstore.ctx == NULL) {
3207 			EPRINTLN("nvme: Could not open backing file: %s",
3208 			    strerror(errno));
3209 			return (-1);
3210 		}
3211 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3212 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3213 	}
3214 
3215 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3216 		sc->nvstore.sectsz = sectsz;
3217 	else if (sc->nvstore.type != NVME_STOR_RAM)
3218 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
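	/* Compute sectsz_bits = log2(sectsz), e.g. 512 -> 9, 4096 -> 12, 8192 -> 13 */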
3219 	for (sc->nvstore.sectsz_bits = 9;
3220 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3221 	     sc->nvstore.sectsz_bits++);
3222 
3223 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3224 		sc->max_queues = NVME_QUEUES;
3225 
3226 	return (0);
3227 }
3228 
3229 static void
3230 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3231     size_t new_size)
3232 {
3233 	struct pci_nvme_softc *sc;
3234 	struct pci_nvme_blockstore *nvstore;
3235 	struct nvme_namespace_data *nd;
3236 
3237 	sc = arg;
3238 	nvstore = &sc->nvstore;
3239 	nd = &sc->nsdata;
3240 
3241 	nvstore->size = new_size;
3242 	pci_nvme_init_nsdata_size(nvstore, nd);
3243 
3244 	/* Add changed NSID to list */
3245 	sc->ns_log.ns[0] = 1;
3246 	sc->ns_log.ns[1] = 0;
3247 
3248 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3249 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3250 }
3251 
3252 static int
3253 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3254 {
3255 	struct pci_nvme_softc *sc;
3256 	uint32_t pci_membar_sz;
3257 	int	error;
3258 
3259 	error = 0;
3260 
3261 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3262 	pi->pi_arg = sc;
3263 	sc->nsc_pi = pi;
3264 
3265 	error = pci_nvme_parse_config(sc, nvl);
3266 	if (error < 0)
3267 		goto done;
3268 	else
3269 		error = 0;
3270 
3271 	STAILQ_INIT(&sc->ioreqs_free);
3272 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3273 	for (uint32_t i = 0; i < sc->ioslots; i++) {
3274 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3275 	}
3276 
3277 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3278 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3279 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3280 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3281 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3282 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3283 
3284 	/*
3285 	 * Allocate size of NVMe registers + doorbell space for all queues.
3286 	 *
3287 	 * The specification requires a minimum memory I/O window size of 16K.
3288 	 * The Windows driver will refuse to start a device with a smaller
3289 	 * window.
3290 	 */
3291 	pci_membar_sz = sizeof(struct nvme_registers) +
3292 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3293 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3294 
3295 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3296 
3297 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3298 	if (error) {
3299 		WPRINTF("%s pci alloc mem bar failed", __func__);
3300 		goto done;
3301 	}
3302 
3303 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3304 	if (error) {
3305 		WPRINTF("%s pci add msixcap failed", __func__);
3306 		goto done;
3307 	}
3308 
3309 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3310 	if (error) {
3311 		WPRINTF("%s pci add Express capability failed", __func__);
3312 		goto done;
3313 	}
3314 
3315 	pthread_mutex_init(&sc->mtx, NULL);
3316 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3317 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3318 
3319 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3320 	/*
3321 	 * Controller data depends on Namespace data so initialize Namespace
3322 	 * data first.
3323 	 */
3324 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3325 	pci_nvme_init_ctrldata(sc);
3326 	pci_nvme_init_logpages(sc);
3327 	pci_nvme_init_features(sc);
3328 
3329 	pci_nvme_aer_init(sc);
3330 	pci_nvme_aen_init(sc);
3331 
3332 	pci_nvme_reset(sc);
3333 
3334 	pci_lintr_request(pi);
3335 
3336 done:
3337 	return (error);
3338 }
3339 
3340 static int
3341 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3342 {
3343 	char *cp, *ram;
3344 
3345 	if (opts == NULL)
3346 		return (0);
3347 
3348 	if (strncmp(opts, "ram=", 4) == 0) {
3349 		cp = strchr(opts, ',');
3350 		if (cp == NULL) {
3351 			set_config_value_node(nvl, "ram", opts + 4);
3352 			return (0);
3353 		}
3354 		ram = strndup(opts + 4, cp - opts - 4);
3355 		set_config_value_node(nvl, "ram", ram);
3356 		free(ram);
3357 		return (pci_parse_legacy_config(nvl, cp + 1));
3358 	} else
3359 		return (blockif_legacy_config(nvl, opts));
3360 }
3361 
3362 static const struct pci_devemu pci_de_nvme = {
3363 	.pe_emu =	"nvme",
3364 	.pe_init =	pci_nvme_init,
3365 	.pe_legacy_config = pci_nvme_legacy_config,
3366 	.pe_barwrite =	pci_nvme_write,
3367 	.pe_barread =	pci_nvme_read
3368 };
3369 PCI_EMUL_SET(pci_de_nvme);
3370