xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = DataSet Management support. Option is one of auto, enable, or disable
51  *
52  */
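/*
 * Example invocations (the device path, sizes, and serial numbers below are
 * illustrative only):
 *  -s 4,nvme,/dev/zvol/tank/vmdisk,maxq=8,qsz=256,ioslots=16,ser=BHYVE001
 *  -s 4,nvme,ram=1024,ser=RAMDSK01
 */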
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80 
81 #include <dev/nvme/nvme.h>
82 
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88 
89 
90 static int nvme_debug = 0;
91 #define	DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93 
94 /* defaults; can be overridden */
95 #define	NVME_MSIX_BAR		4
96 
97 #define	NVME_IOSLOTS		8
98 
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN	(1 << 14)
101 
102 #define	NVME_QUEUES		16
103 #define	NVME_MAX_QENTRIES	2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define	NVME_MPSMIN		0
106 /* MPSMIN converted to bytes */
107 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
108 
109 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
110 #define	NVME_MDTS		9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
113 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
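/* With MPSMIN of 4 KiB, an MDTS of 9 yields a 2 MiB maximum data transfer size */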
114 
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS		0xffff
117 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
118 
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
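/* (zero-based SQ count in bits 15:0, zero-based CQ count in bits 31:16;
 *  e.g. 4 SQ's and 4 CQ's encode as 0x00030003) */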
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Maximum Data Transfer Size (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 typedef enum {
273 	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
274 	PCI_NVME_AE_INFO_FW_ACTIVATION,
275 	PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
276 	PCI_NVME_AE_INFO_ANA_CHANGE,
277 	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
278 	PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
279 	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
280 	PCI_NVME_AE_INFO_MAX,
281 } pci_nvme_async_info;
282 
283 /* Asynchronous Event Notifications */
284 struct pci_nvme_aen {
285 	pci_nvme_async_type atype;
286 	uint32_t	event_data;
287 	bool		posted;
288 };
289 
290 /*
291  * By default, enable all Asynchronous Event Notifications:
292  *     SMART / Health Critical Warnings
293  *     Namespace Attribute Notices
294  */
295 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
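/* (0x11f = SMART/Health critical warning bits 4:0 plus Namespace Attribute Notice bit 8) */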
296 
297 typedef enum {
298 	NVME_CNTRLTYPE_IO = 1,
299 	NVME_CNTRLTYPE_DISCOVERY = 2,
300 	NVME_CNTRLTYPE_ADMIN = 3,
301 } pci_nvme_cntrl_type;
302 
303 struct pci_nvme_softc {
304 	struct pci_devinst *nsc_pi;
305 
306 	pthread_mutex_t	mtx;
307 
308 	struct nvme_registers regs;
309 
310 	struct nvme_namespace_data  nsdata;
311 	struct nvme_controller_data ctrldata;
312 	struct nvme_error_information_entry err_log;
313 	struct nvme_health_information_page health_log;
314 	struct nvme_firmware_page fw_log;
315 	struct nvme_ns_list ns_log;
316 
317 	struct pci_nvme_blockstore nvstore;
318 
319 	uint16_t	max_qentries;	/* max entries per queue */
320 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
321 	uint32_t	num_cqueues;
322 	uint32_t	num_squeues;
323 	bool		num_q_is_set; /* Has host set Number of Queues */
324 
325 	struct pci_nvme_ioreq *ioreqs;
326 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
327 	uint32_t	pending_ios;
328 	uint32_t	ioslots;
329 	sem_t		iosemlock;
330 
331 	/*
332 	 * Memory mapped Submission and Completion queues
333 	 * Each array includes both Admin and IO queues
334 	 */
335 	struct nvme_completion_queue *compl_queues;
336 	struct nvme_submission_queue *submit_queues;
337 
338 	struct nvme_feature_obj feat[NVME_FID_MAX];
339 
340 	enum nvme_dsm_type dataset_management;
341 
342 	/* Accounting for SMART data */
343 	__uint128_t	read_data_units;
344 	__uint128_t	write_data_units;
345 	__uint128_t	read_commands;
346 	__uint128_t	write_commands;
347 	uint32_t	read_dunits_remainder;
348 	uint32_t	write_dunits_remainder;
349 
350 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
351 	pthread_mutex_t	aer_mtx;
352 	uint32_t	aer_count;
353 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
354 	pthread_t	aen_tid;
355 	pthread_mutex_t	aen_mtx;
356 	pthread_cond_t	aen_cond;
357 };
358 
359 
360 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
361     struct nvme_completion_queue *cq,
362     uint32_t cdw0,
363     uint16_t cid,
364     uint16_t sqid,
365     uint16_t status);
366 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
367 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
368 static void pci_nvme_io_done(struct blockif_req *, int);
369 
370 /* Controller Configuration utils */
371 #define	NVME_CC_GET_EN(cc) \
372 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
373 #define	NVME_CC_GET_CSS(cc) \
374 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
375 #define	NVME_CC_GET_SHN(cc) \
376 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
377 #define	NVME_CC_GET_IOSQES(cc) \
378 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
379 #define	NVME_CC_GET_IOCQES(cc) \
380 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
381 
382 #define	NVME_CC_WRITE_MASK \
383 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
384 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
385 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
386 
387 #define	NVME_CC_NEN_WRITE_MASK \
388 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
389 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
390 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
391 
392 /* Controller Status utils */
393 #define	NVME_CSTS_GET_RDY(sts) \
394 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
395 
396 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
397 
398 /* Completion Queue status word utils */
399 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
400 #define	NVME_STATUS_MASK \
401 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
402 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
403 
404 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
405 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
406 
407 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
408     struct nvme_feature_obj *,
409     struct nvme_command *,
410     struct nvme_completion *);
411 static void nvme_feature_temperature(struct pci_nvme_softc *,
412     struct nvme_feature_obj *,
413     struct nvme_command *,
414     struct nvme_completion *);
415 static void nvme_feature_num_queues(struct pci_nvme_softc *,
416     struct nvme_feature_obj *,
417     struct nvme_command *,
418     struct nvme_completion *);
419 static void nvme_feature_iv_config(struct pci_nvme_softc *,
420     struct nvme_feature_obj *,
421     struct nvme_command *,
422     struct nvme_completion *);
423 static void nvme_feature_async_event(struct pci_nvme_softc *,
424     struct nvme_feature_obj *,
425     struct nvme_command *,
426     struct nvme_completion *);
427 
428 static void *aen_thr(void *arg);
429 
430 static __inline void
431 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
432 {
433 	size_t len;
434 
435 	len = strnlen(src, dst_size);
436 	memset(dst, pad, dst_size);
437 	memcpy(dst, src, len);
438 }
439 
440 static __inline void
441 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
442 {
443 
444 	*status &= ~NVME_STATUS_MASK;
445 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
446 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
447 }
448 
449 static __inline void
450 pci_nvme_status_genc(uint16_t *status, uint16_t code)
451 {
452 
453 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
454 }
455 
456 /*
457  * Initialize the requested number of IO Submission and Completion Queues.
458  * Admin queues are allocated implicitly.
459  */
460 static void
461 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
462 {
463 	uint32_t i;
464 
465 	/*
466 	 * Allocate and initialize the Submission Queues
467 	 */
468 	if (nsq > NVME_QUEUES) {
469 		WPRINTF("%s: clamping number of SQ from %u to %u",
470 					__func__, nsq, NVME_QUEUES);
471 		nsq = NVME_QUEUES;
472 	}
473 
474 	sc->num_squeues = nsq;
475 
476 	sc->submit_queues = calloc(sc->num_squeues + 1,
477 				sizeof(struct nvme_submission_queue));
478 	if (sc->submit_queues == NULL) {
479 		WPRINTF("%s: SQ allocation failed", __func__);
480 		sc->num_squeues = 0;
481 	} else {
482 		struct nvme_submission_queue *sq = sc->submit_queues;
483 
484 		for (i = 0; i < sc->num_squeues; i++)
485 			pthread_mutex_init(&sq[i].mtx, NULL);
486 	}
487 
488 	/*
489 	 * Allocate and initialize the Completion Queues
490 	 */
491 	if (ncq > NVME_QUEUES) {
492 		WPRINTF("%s: clamping number of CQ from %u to %u",
493 					__func__, ncq, NVME_QUEUES);
494 		ncq = NVME_QUEUES;
495 	}
496 
497 	sc->num_cqueues = ncq;
498 
499 	sc->compl_queues = calloc(sc->num_cqueues + 1,
500 				sizeof(struct nvme_completion_queue));
501 	if (sc->compl_queues == NULL) {
502 		WPRINTF("%s: CQ allocation failed", __func__);
503 		sc->num_cqueues = 0;
504 	} else {
505 		struct nvme_completion_queue *cq = sc->compl_queues;
506 
507 		for (i = 0; i < sc->num_cqueues; i++)
508 			pthread_mutex_init(&cq[i].mtx, NULL);
509 	}
510 }
511 
512 static void
513 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
514 {
515 	struct nvme_controller_data *cd = &sc->ctrldata;
516 
517 	cd->vid = 0xFB5D;
518 	cd->ssvid = 0x0000;
519 
520 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
521 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
522 
523 	/* Recommended Arbitration Burst (i.e. 2^rab submission commands at a time) */
524 	cd->rab   = 4;
525 
526 	/* FreeBSD OUI */
527 	cd->ieee[0] = 0x58;
528 	cd->ieee[1] = 0x9c;
529 	cd->ieee[2] = 0xfc;
530 
531 	cd->mic = 0;
532 
533 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
534 
535 	cd->ver = NVME_REV(1,4);
536 
537 	cd->cntrltype = NVME_CNTRLTYPE_IO;
538 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
539 	cd->acl = 2;
540 	cd->aerl = 4;
541 
542 	/* Advertise a single, read-only firmware slot */
543 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
544 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
545 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
546 	cd->elpe = 0;	/* max error log page entries */
547 	cd->npss = 1;	/* number of power states supported */
548 
549 	/* Warning and Critical Composite Temperature Thresholds */
550 	cd->wctemp = 0x0157;
551 	cd->cctemp = 0x0157;
552 
553 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
554 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
555 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
556 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
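	/* (i.e. 64 byte (2^6) submission entries and 16 byte (2^4) completion entries) */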
557 	cd->nn = 1;	/* number of namespaces */
558 
559 	cd->oncs = 0;
560 	switch (sc->dataset_management) {
561 	case NVME_DATASET_MANAGEMENT_AUTO:
562 		if (sc->nvstore.deallocate)
563 			cd->oncs |= NVME_ONCS_DSM;
564 		break;
565 	case NVME_DATASET_MANAGEMENT_ENABLE:
566 		cd->oncs |= NVME_ONCS_DSM;
567 		break;
568 	default:
569 		break;
570 	}
571 
572 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
573 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
574 
575 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
576 
577 	cd->power_state[0].mp = 10;
578 }
579 
580 /*
581  * Calculate the CRC-16 of the given buffer
582  * See copyright attribution at top of file
583  */
584 static uint16_t
585 crc16(uint16_t crc, const void *buffer, unsigned int len)
586 {
587 	const unsigned char *cp = buffer;
588 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
589 	static uint16_t const crc16_table[256] = {
590 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
591 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
592 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
593 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
594 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
595 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
596 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
597 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
598 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
599 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
600 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
601 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
602 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
603 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
604 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
605 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
606 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
607 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
608 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
609 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
610 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
611 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
612 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
613 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
614 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
615 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
616 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
617 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
618 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
619 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
620 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
621 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
622 	};
623 
624 	while (len--)
625 		crc = (((crc >> 8) & 0xffU) ^
626 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
627 	return crc;
628 }
629 
630 static void
631 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
632     struct nvme_namespace_data *nd)
633 {
634 
635 	/* Get capacity and block size information from backing store */
636 	nd->nsze = nvstore->size / nvstore->sectsz;
637 	nd->ncap = nd->nsze;
638 	nd->nuse = nd->nsze;
639 }
640 
641 static void
642 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
643     struct nvme_namespace_data *nd, uint32_t nsid,
644     struct pci_nvme_blockstore *nvstore)
645 {
646 
647 	pci_nvme_init_nsdata_size(nvstore, nd);
648 
649 	if (nvstore->type == NVME_STOR_BLOCKIF)
650 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
651 
652 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
653 	nd->flbas = 0;
654 
655 	/* Create an EUI-64 if user did not provide one */
656 	if (nvstore->eui64 == 0) {
657 		char *data = NULL;
658 		uint64_t eui64 = nvstore->eui64;
659 
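		/*
		 * Derive it from the FreeBSD OUI plus a CRC-16 of the VM name
		 * and this device's PCI bus/slot/function, then fold in the
		 * NSID below.
		 */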
660 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
661 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
662 		    sc->nsc_pi->pi_func);
663 
664 		if (data != NULL) {
665 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
666 			free(data);
667 		}
668 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
669 	}
670 	be64enc(nd->eui64, nvstore->eui64);
671 
672 	/* LBA data-sz = 2^lbads */
673 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
674 }
675 
676 static void
677 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
678 {
679 
680 	memset(&sc->err_log, 0, sizeof(sc->err_log));
681 	memset(&sc->health_log, 0, sizeof(sc->health_log));
682 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
683 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
684 
685 	/* Set read/write remainder to round up according to spec */
686 	sc->read_dunits_remainder = 999;
687 	sc->write_dunits_remainder = 999;
688 
689 	/* Set nominal Health values checked by implementations */
690 	sc->health_log.temperature = NVME_TEMPERATURE;
691 	sc->health_log.available_spare = 100;
692 	sc->health_log.available_spare_threshold = 10;
693 }
694 
695 static void
696 pci_nvme_init_features(struct pci_nvme_softc *sc)
697 {
698 	enum nvme_feature	fid;
699 
700 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
701 		switch (fid) {
702 		case NVME_FEAT_ARBITRATION:
703 		case NVME_FEAT_POWER_MANAGEMENT:
704 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
705 		case NVME_FEAT_WRITE_ATOMICITY:
706 			/* Mandatory but no special handling required */
707 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
708 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
709 		//		  this returns a data buffer
710 			break;
711 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
712 			sc->feat[fid].set = nvme_feature_temperature;
713 			break;
714 		case NVME_FEAT_ERROR_RECOVERY:
715 			sc->feat[fid].namespace_specific = true;
716 			break;
717 		case NVME_FEAT_NUMBER_OF_QUEUES:
718 			sc->feat[fid].set = nvme_feature_num_queues;
719 			break;
720 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
721 			sc->feat[fid].set = nvme_feature_iv_config;
722 			break;
723 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
724 			sc->feat[fid].set = nvme_feature_async_event;
725 			/* Enable all AENs by default */
726 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
727 			break;
728 		default:
729 			sc->feat[fid].set = nvme_feature_invalid_cb;
730 			sc->feat[fid].get = nvme_feature_invalid_cb;
731 		}
732 	}
733 }
734 
735 static void
736 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
737 {
738 
739 	STAILQ_INIT(&sc->aer_list);
740 	sc->aer_count = 0;
741 }
742 
743 static void
744 pci_nvme_aer_init(struct pci_nvme_softc *sc)
745 {
746 
747 	pthread_mutex_init(&sc->aer_mtx, NULL);
748 	pci_nvme_aer_reset(sc);
749 }
750 
751 static void
752 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
753 {
754 	struct pci_nvme_aer *aer = NULL;
755 
756 	pthread_mutex_lock(&sc->aer_mtx);
757 	while (!STAILQ_EMPTY(&sc->aer_list)) {
758 		aer = STAILQ_FIRST(&sc->aer_list);
759 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
760 		free(aer);
761 	}
762 	pthread_mutex_unlock(&sc->aer_mtx);
763 
764 	pci_nvme_aer_reset(sc);
765 }
766 
767 static bool
768 pci_nvme_aer_available(struct pci_nvme_softc *sc)
769 {
770 
771 	return (sc->aer_count != 0);
772 }
773 
774 static bool
775 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
776 {
777 	struct nvme_controller_data *cd = &sc->ctrldata;
778 
779 	/* AERL is a zero-based value while aer_count is one-based */
780 	return (sc->aer_count == (cd->aerl + 1));
781 }
782 
783 /*
784  * Add an Async Event Request
785  *
786  * Stores an AER to be returned later if the Controller needs to notify the
787  * host of an event.
788  * Note that while the NVMe spec doesn't require Controllers to return AER's
789  * in order, this implementation does preserve the order.
790  */
791 static int
792 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
793 {
794 	struct pci_nvme_aer *aer = NULL;
795 
796 	aer = calloc(1, sizeof(struct pci_nvme_aer));
797 	if (aer == NULL)
798 		return (-1);
799 
800 	/* Save the Command ID for use in the completion message */
801 	aer->cid = cid;
802 
803 	pthread_mutex_lock(&sc->aer_mtx);
804 	sc->aer_count++;
805 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
806 	pthread_mutex_unlock(&sc->aer_mtx);
807 
808 	return (0);
809 }
810 
811 /*
812  * Get an Async Event Request structure
813  *
814  * Returns a pointer to an AER previously submitted by the host or NULL if
815  * no AER's exist. Caller is responsible for freeing the returned struct.
816  */
817 static struct pci_nvme_aer *
818 pci_nvme_aer_get(struct pci_nvme_softc *sc)
819 {
820 	struct pci_nvme_aer *aer = NULL;
821 
822 	pthread_mutex_lock(&sc->aer_mtx);
823 	aer = STAILQ_FIRST(&sc->aer_list);
824 	if (aer != NULL) {
825 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
826 		sc->aer_count--;
827 	}
828 	pthread_mutex_unlock(&sc->aer_mtx);
829 
830 	return (aer);
831 }
832 
833 static void
834 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
835 {
836 	uint32_t	atype;
837 
838 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
839 
840 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
841 		sc->aen[atype].atype = atype;
842 	}
843 }
844 
845 static void
846 pci_nvme_aen_init(struct pci_nvme_softc *sc)
847 {
848 	char nstr[80];
849 
850 	pci_nvme_aen_reset(sc);
851 
852 	pthread_mutex_init(&sc->aen_mtx, NULL);
853 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
854 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
855 	    sc->nsc_pi->pi_func);
856 	pthread_set_name_np(sc->aen_tid, nstr);
857 }
858 
859 static void
860 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
861 {
862 
863 	pci_nvme_aen_reset(sc);
864 }
865 
866 /* Notify the AEN thread of pending work */
867 static void
868 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
869 {
870 
871 	pthread_cond_signal(&sc->aen_cond);
872 }
873 
874 /*
875  * Post an Asynchronous Event Notification
876  */
877 static int32_t
878 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
879 		uint32_t event_data)
880 {
881 	struct pci_nvme_aen *aen;
882 
883 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
884 		return(EINVAL);
885 	}
886 
887 	pthread_mutex_lock(&sc->aen_mtx);
888 	aen = &sc->aen[atype];
889 
890 	/* Has the controller already posted an event of this type? */
891 	if (aen->posted) {
892 		pthread_mutex_unlock(&sc->aen_mtx);
893 		return(EALREADY);
894 	}
895 
896 	aen->event_data = event_data;
897 	aen->posted = true;
898 	pthread_mutex_unlock(&sc->aen_mtx);
899 
900 	pci_nvme_aen_notify(sc);
901 
902 	return(0);
903 }
904 
905 static void
906 pci_nvme_aen_process(struct pci_nvme_softc *sc)
907 {
908 	struct pci_nvme_aer *aer;
909 	struct pci_nvme_aen *aen;
910 	pci_nvme_async_type atype;
911 	uint32_t mask;
912 	uint16_t status;
913 	uint8_t lid;
914 
915 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
916 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
917 		aen = &sc->aen[atype];
918 		/* Previous iterations may have depleted the available AER's */
919 		if (!pci_nvme_aer_available(sc)) {
920 			DPRINTF("%s: no AER", __func__);
921 			break;
922 		}
923 
924 		if (!aen->posted) {
925 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
926 			continue;
927 		}
928 
929 		status = NVME_SC_SUCCESS;
930 
931 		/* Is the event masked? */
932 		mask =
933 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
934 
935 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
936 		switch (atype) {
937 		case PCI_NVME_AE_TYPE_ERROR:
938 			lid = NVME_LOG_ERROR;
939 			break;
940 		case PCI_NVME_AE_TYPE_SMART:
941 			mask &= 0xff;
942 			if ((mask & aen->event_data) == 0)
943 				continue;
944 			lid = NVME_LOG_HEALTH_INFORMATION;
945 			break;
946 		case PCI_NVME_AE_TYPE_NOTICE:
947 			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
948 				EPRINTLN("%s unknown AEN notice type %u",
949 				    __func__, aen->event_data);
950 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
951 				break;
952 			}
953 			mask >>= 8;
954 			if (((1 << aen->event_data) & mask) == 0)
955 				continue;
956 			switch (aen->event_data) {
957 			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
958 				lid = NVME_LOG_CHANGED_NAMESPACE;
959 				break;
960 			case PCI_NVME_AE_INFO_FW_ACTIVATION:
961 				lid = NVME_LOG_FIRMWARE_SLOT;
962 				break;
963 			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
964 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
965 				break;
966 			case PCI_NVME_AE_INFO_ANA_CHANGE:
967 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
968 				break;
969 			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
970 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
971 				break;
972 			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
973 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
974 				break;
975 			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
976 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
977 				break;
978 			default:
979 				lid = 0;
980 			}
981 			break;
982 		default:
983 			/* bad type?!? */
984 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
985 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
986 			break;
987 		}
988 
989 		aer = pci_nvme_aer_get(sc);
990 		assert(aer != NULL);
991 
992 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
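		/* AER completion dword 0: Log Page ID in 23:16, event info in 15:8, event type in 2:0 */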
993 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
994 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
995 		    aer->cid,
996 		    0,		/* SQID */
997 		    status);
998 
999 		aen->event_data = 0;
1000 		aen->posted = false;
1001 
1002 		pci_generate_msix(sc->nsc_pi, 0);
1003 	}
1004 }
1005 
1006 static void *
1007 aen_thr(void *arg)
1008 {
1009 	struct pci_nvme_softc *sc;
1010 
1011 	sc = arg;
1012 
1013 	pthread_mutex_lock(&sc->aen_mtx);
1014 	for (;;) {
1015 		pci_nvme_aen_process(sc);
1016 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1017 	}
1018 	pthread_mutex_unlock(&sc->aen_mtx);
1019 
1020 	pthread_exit(NULL);
1021 	return (NULL);
1022 }
1023 
1024 static void
1025 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1026 {
1027 	uint32_t i;
1028 
1029 	DPRINTF("%s", __func__);
1030 
1031 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1032 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1033 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1034 
1035 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1036 
1037 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1038 
1039 	sc->regs.cc = 0;
1040 
1041 	assert(sc->submit_queues != NULL);
1042 
1043 	for (i = 0; i < sc->num_squeues + 1; i++) {
1044 		sc->submit_queues[i].qbase = NULL;
1045 		sc->submit_queues[i].size = 0;
1046 		sc->submit_queues[i].cqid = 0;
1047 		sc->submit_queues[i].tail = 0;
1048 		sc->submit_queues[i].head = 0;
1049 	}
1050 
1051 	assert(sc->compl_queues != NULL);
1052 
1053 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1054 		sc->compl_queues[i].qbase = NULL;
1055 		sc->compl_queues[i].size = 0;
1056 		sc->compl_queues[i].tail = 0;
1057 		sc->compl_queues[i].head = 0;
1058 	}
1059 
1060 	sc->num_q_is_set = false;
1061 
1062 	pci_nvme_aer_destroy(sc);
1063 	pci_nvme_aen_destroy(sc);
1064 
1065 	/*
1066 	 * Clear CSTS.RDY last to prevent the host from enabling the controller
1067 	 * before cleanup completes
1068 	 */
1069 	sc->regs.csts = 0;
1070 }
1071 
1072 static void
1073 pci_nvme_reset(struct pci_nvme_softc *sc)
1074 {
1075 	pthread_mutex_lock(&sc->mtx);
1076 	pci_nvme_reset_locked(sc);
1077 	pthread_mutex_unlock(&sc->mtx);
1078 }
1079 
1080 static void
1081 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1082 {
1083 	uint16_t acqs, asqs;
1084 
1085 	DPRINTF("%s", __func__);
1086 
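	/* AQA queue sizes (ASQS/ACQS) are zero-based, hence the + 1 */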
1087 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1088 	sc->submit_queues[0].size = asqs;
1089 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1090 	            sizeof(struct nvme_command) * asqs);
1091 
1092 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1093 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1094 
1095 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1096 	    NVME_AQA_REG_ACQS_MASK) + 1;
1097 	sc->compl_queues[0].size = acqs;
1098 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1099 	         sizeof(struct nvme_completion) * acqs);
1100 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1101 
1102 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1103 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1104 }
1105 
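/*
 * Copy data between a host buffer and a guest buffer described by a PRP pair.
 * Only transfers spanning at most two pages (8 KiB) are supported, which is
 * sufficient for the admin data structures copied through this helper.
 */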
1106 static int
1107 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1108 	size_t len, enum nvme_copy_dir dir)
1109 {
1110 	uint8_t *p;
1111 	size_t bytes;
1112 
1113 	if (len > (8 * 1024)) {
1114 		return (-1);
1115 	}
1116 
1117 	/* Copy from the start of prp1 to the end of the physical page */
1118 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1119 	bytes = MIN(bytes, len);
1120 
1121 	p = vm_map_gpa(ctx, prp1, bytes);
1122 	if (p == NULL) {
1123 		return (-1);
1124 	}
1125 
1126 	if (dir == NVME_COPY_TO_PRP)
1127 		memcpy(p, b, bytes);
1128 	else
1129 		memcpy(b, p, bytes);
1130 
1131 	b += bytes;
1132 
1133 	len -= bytes;
1134 	if (len == 0) {
1135 		return (0);
1136 	}
1137 
1138 	len = MIN(len, PAGE_SIZE);
1139 
1140 	p = vm_map_gpa(ctx, prp2, len);
1141 	if (p == NULL) {
1142 		return (-1);
1143 	}
1144 
1145 	if (dir == NVME_COPY_TO_PRP)
1146 		memcpy(p, b, len);
1147 	else
1148 		memcpy(b, p, len);
1149 
1150 	return (0);
1151 }
1152 
1153 /*
1154  * Write a Completion Queue Entry update
1155  *
1156  * Write the completion entry and advance the queue's tail pointer
1157  */
1158 static void
1159 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1160 		struct nvme_completion_queue *cq,
1161 		uint32_t cdw0,
1162 		uint16_t cid,
1163 		uint16_t sqid,
1164 		uint16_t status)
1165 {
1166 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1167 	struct nvme_completion *cqe;
1168 
1169 	assert(cq->qbase != NULL);
1170 
1171 	pthread_mutex_lock(&cq->mtx);
1172 
1173 	cqe = &cq->qbase[cq->tail];
1174 
1175 	/* Flip the phase bit */
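	/* (the host compares the Phase Tag to detect newly posted entries as the CQ wraps) */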
1176 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1177 
1178 	cqe->cdw0 = cdw0;
1179 	cqe->sqhd = sq->head;
1180 	cqe->sqid = sqid;
1181 	cqe->cid = cid;
1182 	cqe->status = status;
1183 
1184 	cq->tail++;
1185 	if (cq->tail >= cq->size) {
1186 		cq->tail = 0;
1187 	}
1188 
1189 	pthread_mutex_unlock(&cq->mtx);
1190 }
1191 
1192 static int
1193 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1194 	struct nvme_completion* compl)
1195 {
1196 	uint16_t qid = command->cdw10 & 0xffff;
1197 
1198 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1199 	if (qid == 0 || qid > sc->num_squeues ||
1200 	    (sc->submit_queues[qid].qbase == NULL)) {
1201 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1202 		        __func__, qid, sc->num_squeues);
1203 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1204 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1205 		return (1);
1206 	}
1207 
1208 	sc->submit_queues[qid].qbase = NULL;
1209 	sc->submit_queues[qid].cqid = 0;
1210 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1211 	return (1);
1212 }
1213 
1214 static int
1215 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1216 	struct nvme_completion* compl)
1217 {
1218 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1219 		uint16_t qid = command->cdw10 & 0xffff;
1220 		struct nvme_submission_queue *nsq;
1221 
1222 		if ((qid == 0) || (qid > sc->num_squeues) ||
1223 		    (sc->submit_queues[qid].qbase != NULL)) {
1224 			WPRINTF("%s queue index %u > num_squeues %u",
1225 			        __func__, qid, sc->num_squeues);
1226 			pci_nvme_status_tc(&compl->status,
1227 			    NVME_SCT_COMMAND_SPECIFIC,
1228 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1229 			return (1);
1230 		}
1231 
1232 		nsq = &sc->submit_queues[qid];
1233 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1234 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1235 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1236 			/*
1237 			 * Queues must specify at least two entries
1238 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1239 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1240 			 */
1241 			pci_nvme_status_tc(&compl->status,
1242 			    NVME_SCT_COMMAND_SPECIFIC,
1243 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1244 			return (1);
1245 		}
1246 		nsq->head = nsq->tail = 0;
1247 
1248 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1249 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1250 			pci_nvme_status_tc(&compl->status,
1251 			    NVME_SCT_COMMAND_SPECIFIC,
1252 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1253 			return (1);
1254 		}
1255 
1256 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1257 			pci_nvme_status_tc(&compl->status,
1258 			    NVME_SCT_COMMAND_SPECIFIC,
1259 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1260 			return (1);
1261 		}
1262 
1263 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1264 
1265 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1266 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1267 
1268 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1269 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1270 
1271 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1272 
1273 		DPRINTF("%s completed creating IOSQ qid %u",
1274 		         __func__, qid);
1275 	} else {
1276 		/*
1277 		 * Guest sent a non-contiguous submission queue request.
1278 		 * This setting is unsupported by this emulation.
1279 		 */
1280 		WPRINTF("%s unsupported non-contig (list-based) "
1281 		         "create i/o submission queue", __func__);
1282 
1283 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1284 	}
1285 	return (1);
1286 }
1287 
1288 static int
1289 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1290 	struct nvme_completion* compl)
1291 {
1292 	uint16_t qid = command->cdw10 & 0xffff;
1293 	uint16_t sqid;
1294 
1295 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1296 	if (qid == 0 || qid > sc->num_cqueues ||
1297 	    (sc->compl_queues[qid].qbase == NULL)) {
1298 		WPRINTF("%s queue index %u / num_cqueues %u",
1299 		        __func__, qid, sc->num_cqueues);
1300 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1301 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1302 		return (1);
1303 	}
1304 
1305 	/* Deleting an Active CQ is an error */
1306 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1307 		if (sc->submit_queues[sqid].cqid == qid) {
1308 			pci_nvme_status_tc(&compl->status,
1309 			    NVME_SCT_COMMAND_SPECIFIC,
1310 			    NVME_SC_INVALID_QUEUE_DELETION);
1311 			return (1);
1312 		}
1313 
1314 	sc->compl_queues[qid].qbase = NULL;
1315 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1316 	return (1);
1317 }
1318 
1319 static int
1320 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1321 	struct nvme_completion* compl)
1322 {
1323 	struct nvme_completion_queue *ncq;
1324 	uint16_t qid = command->cdw10 & 0xffff;
1325 
1326 	/* Only support Physically Contiguous queues */
1327 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1328 		WPRINTF("%s unsupported non-contig (list-based) "
1329 		         "create i/o completion queue",
1330 		         __func__);
1331 
1332 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1333 		return (1);
1334 	}
1335 
1336 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1337 	    (sc->compl_queues[qid].qbase != NULL)) {
1338 		WPRINTF("%s queue index %u > num_cqueues %u",
1339 			__func__, qid, sc->num_cqueues);
1340 		pci_nvme_status_tc(&compl->status,
1341 		    NVME_SCT_COMMAND_SPECIFIC,
1342 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1343 		return (1);
1344  	}
1345 
1346 	ncq = &sc->compl_queues[qid];
1347 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1348 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1349 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1350 		pci_nvme_status_tc(&compl->status,
1351 		    NVME_SCT_COMMAND_SPECIFIC,
1352 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1353 		return (1);
1354 	}
1355 
1356 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1357 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1358 		/*
1359 		 * Queues must specify at least two entries
1360 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1361 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1362 		 */
1363 		pci_nvme_status_tc(&compl->status,
1364 		    NVME_SCT_COMMAND_SPECIFIC,
1365 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1366 		return (1);
1367 	}
1368 	ncq->head = ncq->tail = 0;
1369 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1370 		     command->prp1,
1371 		     sizeof(struct nvme_completion) * (size_t)ncq->size);
1372 
1373 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1374 
1375 
1376 	return (1);
1377 }
1378 
1379 static int
1380 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1381 	struct nvme_completion* compl)
1382 {
1383 	uint64_t logoff;
1384 	uint32_t logsize;
1385 	uint8_t logpage = command->cdw10 & 0xFF;
1386 
1387 	DPRINTF("%s log page %u", __func__, logpage);
1388 
1389 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1390 
1391 	/*
1392 	 * Command specifies the number of dwords to return in fields NUMDU
1393 	 * and NUMDL. This is a zero-based value.
1394 	 */
1395 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1396 	logsize *= sizeof(uint32_t);
1397 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1398 
1399 	switch (logpage) {
1400 	case NVME_LOG_ERROR:
1401 		if (logoff >= sizeof(sc->err_log)) {
1402 			pci_nvme_status_genc(&compl->status,
1403 			    NVME_SC_INVALID_FIELD);
1404 			break;
1405 		}
1406 
1407 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1408 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1409 		    MIN(logsize - logoff, sizeof(sc->err_log) - logoff),
1410 		    NVME_COPY_TO_PRP);
1411 		break;
1412 	case NVME_LOG_HEALTH_INFORMATION:
1413 		if (logoff >= sizeof(sc->health_log)) {
1414 			pci_nvme_status_genc(&compl->status,
1415 			    NVME_SC_INVALID_FIELD);
1416 			break;
1417 		}
1418 
1419 		pthread_mutex_lock(&sc->mtx);
1420 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1421 		    sizeof(sc->health_log.data_units_read));
1422 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1423 		    sizeof(sc->health_log.data_units_written));
1424 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1425 		    sizeof(sc->health_log.host_read_commands));
1426 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1427 		    sizeof(sc->health_log.host_write_commands));
1428 		pthread_mutex_unlock(&sc->mtx);
1429 
1430 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1431 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1432 		    MIN(logsize - logoff, sizeof(sc->health_log) - logoff),
1433 		    NVME_COPY_TO_PRP);
1434 		break;
1435 	case NVME_LOG_FIRMWARE_SLOT:
1436 		if (logoff >= sizeof(sc->fw_log)) {
1437 			pci_nvme_status_genc(&compl->status,
1438 			    NVME_SC_INVALID_FIELD);
1439 			break;
1440 		}
1441 
1442 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1443 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1444 		    MIN(logsize - logoff, sizeof(sc->fw_log) - logoff),
1445 		    NVME_COPY_TO_PRP);
1446 		break;
1447 	case NVME_LOG_CHANGED_NAMESPACE:
1448 		if (logoff >= sizeof(sc->ns_log)) {
1449 			pci_nvme_status_genc(&compl->status,
1450 			    NVME_SC_INVALID_FIELD);
1451 			break;
1452 		}
1453 
1454 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1455 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1456 		    MIN(logsize - logoff, sizeof(sc->ns_log) - logoff),
1457 		    NVME_COPY_TO_PRP);
1458 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1459 		break;
1460 	default:
1461 		DPRINTF("%s get log page %x command not supported",
1462 		        __func__, logpage);
1463 
1464 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1465 		    NVME_SC_INVALID_LOG_PAGE);
1466 	}
1467 
1468 	return (1);
1469 }
1470 
1471 static int
1472 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1473 	struct nvme_completion* compl)
1474 {
1475 	void *dest;
1476 	uint16_t status;
1477 
1478 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1479 	        command->cdw10 & 0xFF, command->nsid);
1480 
1481 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1482 
1483 	switch (command->cdw10 & 0xFF) {
1484 	case 0x00: /* return Identify Namespace data structure */
1485 		/* Global NS only valid with NS Management */
1486 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1487 			pci_nvme_status_genc(&status,
1488 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1489 			break;
1490 		}
1491 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1492 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1493 		    NVME_COPY_TO_PRP);
1494 		break;
1495 	case 0x01: /* return Identify Controller data structure */
1496 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1497 		    command->prp2, (uint8_t *)&sc->ctrldata,
1498 		    sizeof(sc->ctrldata),
1499 		    NVME_COPY_TO_PRP);
1500 		break;
1501 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1502 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1503 		                  sizeof(uint32_t) * 1024);
1504 		/* All unused entries shall be zero */
1505 		bzero(dest, sizeof(uint32_t) * 1024);
1506 		((uint32_t *)dest)[0] = 1;
1507 		break;
1508 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1509 		if (command->nsid != 1) {
1510 			pci_nvme_status_genc(&status,
1511 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1512 			break;
1513 		}
1514 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1515 		                  sizeof(uint32_t) * 1024);
1516 		/* All bytes after the descriptor shall be zero */
1517 		bzero(dest, sizeof(uint32_t) * 1024);
1518 
1519 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1520 		((uint8_t *)dest)[0] = 1;
1521 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1522 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1523 		break;
1524 	default:
1525 		DPRINTF("%s unsupported identify command requested 0x%x",
1526 		         __func__, command->cdw10 & 0xFF);
1527 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1528 		break;
1529 	}
1530 
1531 	compl->status = status;
1532 	return (1);
1533 }
1534 
1535 static const char *
1536 nvme_fid_to_name(uint8_t fid)
1537 {
1538 	const char *name;
1539 
1540 	switch (fid) {
1541 	case NVME_FEAT_ARBITRATION:
1542 		name = "Arbitration";
1543 		break;
1544 	case NVME_FEAT_POWER_MANAGEMENT:
1545 		name = "Power Management";
1546 		break;
1547 	case NVME_FEAT_LBA_RANGE_TYPE:
1548 		name = "LBA Range Type";
1549 		break;
1550 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1551 		name = "Temperature Threshold";
1552 		break;
1553 	case NVME_FEAT_ERROR_RECOVERY:
1554 		name = "Error Recovery";
1555 		break;
1556 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1557 		name = "Volatile Write Cache";
1558 		break;
1559 	case NVME_FEAT_NUMBER_OF_QUEUES:
1560 		name = "Number of Queues";
1561 		break;
1562 	case NVME_FEAT_INTERRUPT_COALESCING:
1563 		name = "Interrupt Coalescing";
1564 		break;
1565 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1566 		name = "Interrupt Vector Configuration";
1567 		break;
1568 	case NVME_FEAT_WRITE_ATOMICITY:
1569 		name = "Write Atomicity Normal";
1570 		break;
1571 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1572 		name = "Asynchronous Event Configuration";
1573 		break;
1574 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1575 		name = "Autonomous Power State Transition";
1576 		break;
1577 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1578 		name = "Host Memory Buffer";
1579 		break;
1580 	case NVME_FEAT_TIMESTAMP:
1581 		name = "Timestamp";
1582 		break;
1583 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1584 		name = "Keep Alive Timer";
1585 		break;
1586 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1587 		name = "Host Controlled Thermal Management";
1588 		break;
1589 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1590 		name = "Non-Operational Power State Config";
1591 		break;
1592 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1593 		name = "Read Recovery Level Config";
1594 		break;
1595 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1596 		name = "Predictable Latency Mode Config";
1597 		break;
1598 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1599 		name = "Predictable Latency Mode Window";
1600 		break;
1601 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1602 		name = "LBA Status Information Report Interval";
1603 		break;
1604 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1605 		name = "Host Behavior Support";
1606 		break;
1607 	case NVME_FEAT_SANITIZE_CONFIG:
1608 		name = "Sanitize Config";
1609 		break;
1610 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1611 		name = "Endurance Group Event Configuration";
1612 		break;
1613 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1614 		name = "Software Progress Marker";
1615 		break;
1616 	case NVME_FEAT_HOST_IDENTIFIER:
1617 		name = "Host Identifier";
1618 		break;
1619 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1620 		name = "Reservation Notification Mask";
1621 		break;
1622 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1623 		name = "Reservation Persistence";
1624 		break;
1625 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1626 		name = "Namespace Write Protection Config";
1627 		break;
1628 	default:
1629 		name = "Unknown";
1630 		break;
1631 	}
1632 
1633 	return (name);
1634 }
1635 
1636 static void
1637 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1638     struct nvme_feature_obj *feat,
1639     struct nvme_command *command,
1640     struct nvme_completion *compl)
1641 {
1642 
1643 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1644 }
1645 
1646 static void
1647 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1648     struct nvme_feature_obj *feat,
1649     struct nvme_command *command,
1650     struct nvme_completion *compl)
1651 {
1652 	uint32_t i;
1653 	uint32_t cdw11 = command->cdw11;
1654 	uint16_t iv;
1655 	bool cd;
1656 
1657 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1658 
1659 	iv = cdw11 & 0xffff;
1660 	cd = cdw11 & (1 << 16);
1661 
1662 	if (iv > (sc->max_queues + 1)) {
1663 		return;
1664 	}
1665 
1666 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1667 	if ((iv == 0) && !cd)
1668 		return;
1669 
1670 	/* Requested Interrupt Vector must be used by a CQ */
1671 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1672 		if (sc->compl_queues[i].intr_vec == iv) {
1673 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1674 		}
1675 	}
1676 }
1677 
1678 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1679 static void
1680 nvme_feature_async_event(struct pci_nvme_softc *sc,
1681     struct nvme_feature_obj *feat,
1682     struct nvme_command *command,
1683     struct nvme_completion *compl)
1684 {
1685 
1686 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1687 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1688 }
1689 
1690 #define NVME_TEMP_THRESH_OVER	0
1691 #define NVME_TEMP_THRESH_UNDER	1
1692 static void
1693 nvme_feature_temperature(struct pci_nvme_softc *sc,
1694     struct nvme_feature_obj *feat,
1695     struct nvme_command *command,
1696     struct nvme_completion *compl)
1697 {
1698 	uint16_t	tmpth;	/* Temperature Threshold */
1699 	uint8_t		tmpsel; /* Threshold Temperature Select */
1700 	uint8_t		thsel;  /* Threshold Type Select */
1701 	bool		set_crit = false;
1702 
1703 	tmpth  = command->cdw11 & 0xffff;
1704 	tmpsel = (command->cdw11 >> 16) & 0xf;
1705 	thsel  = (command->cdw11 >> 20) & 0x3;
1706 
1707 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1708 
1709 	/* Check for unsupported values */
1710 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1711 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1712 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1713 		return;
1714 	}
1715 
1716 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1717 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1718 		set_crit = true;
1719 
1720 	pthread_mutex_lock(&sc->mtx);
1721 	if (set_crit)
1722 		sc->health_log.critical_warning |=
1723 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1724 	else
1725 		sc->health_log.critical_warning &=
1726 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1727 	pthread_mutex_unlock(&sc->mtx);
1728 
1729 	if (set_crit)
1730 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1731 		    sc->health_log.critical_warning);
1732 
1733 
1734 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1735 }
1736 
1737 static void
1738 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1739     struct nvme_feature_obj *feat,
1740     struct nvme_command *command,
1741     struct nvme_completion *compl)
1742 {
1743 	uint16_t nqr;	/* Number of Queues Requested */
1744 
1745 	if (sc->num_q_is_set) {
1746 		WPRINTF("%s: Number of Queues already set", __func__);
1747 		pci_nvme_status_genc(&compl->status,
1748 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1749 		return;
1750 	}
1751 
1752 	nqr = command->cdw11 & 0xFFFF;
1753 	if (nqr == 0xffff) {
1754 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1755 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1756 		return;
1757 	}
1758 
1759 	sc->num_squeues = ONE_BASED(nqr);
1760 	if (sc->num_squeues > sc->max_queues) {
1761 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1762 					sc->max_queues);
1763 		sc->num_squeues = sc->max_queues;
1764 	}
1765 
1766 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1767 	if (nqr == 0xffff) {
1768 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1769 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1770 		return;
1771 	}
1772 
1773 	sc->num_cqueues = ONE_BASED(nqr);
1774 	if (sc->num_cqueues > sc->max_queues) {
1775 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1776 					sc->max_queues);
1777 		sc->num_cqueues = sc->max_queues;
1778 	}
1779 
1780 	/* Patch the command value which will be saved on callback's return */
1781 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1782 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1783 
1784 	sc->num_q_is_set = true;
1785 }
1786 
1787 static int
1788 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1789 	struct nvme_completion *compl)
1790 {
1791 	struct nvme_feature_obj *feat;
1792 	uint32_t nsid = command->nsid;
1793 	uint8_t fid = command->cdw10 & 0xFF;
1794 
1795 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1796 
1797 	if (fid >= NVME_FID_MAX) {
1798 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1799 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1800 		return (1);
1801 	}
1802 	feat = &sc->feat[fid];
1803 
1804 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1805 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1806 		return (1);
1807 	}
1808 
1809 	if (!feat->namespace_specific &&
1810 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1811 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1812 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1813 		return (1);
1814 	}
1815 
1816 	compl->cdw0 = 0;
1817 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1818 
1819 	if (feat->set)
1820 		feat->set(sc, feat, command, compl);
1821 
1822 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1823 	if (compl->status == NVME_SC_SUCCESS) {
1824 		feat->cdw11 = command->cdw11;
1825 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1826 		    (command->cdw11 != 0))
1827 			pci_nvme_aen_notify(sc);
1828 	}
1829 
1830 	return (0);
1831 }
1832 
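/*
 * Get Features CDW10 bits 10:8 (SEL) select which value to return. The
 * emulation distinguishes only the current value (0h) from the supported
 * capabilities query (3h); for the latter, CDW0 bit 1 reports whether the
 * feature is namespace specific.
 */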
1833 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1834 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1835 
1836 static int
1837 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1838 	struct nvme_completion* compl)
1839 {
1840 	struct nvme_feature_obj *feat;
1841 	uint8_t fid = command->cdw10 & 0xFF;
1842 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1843 
1844 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1845 
1846 	if (fid >= NVME_FID_MAX) {
1847 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1848 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1849 		return (1);
1850 	}
1851 
1852 	compl->cdw0 = 0;
1853 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1854 
1855 	feat = &sc->feat[fid];
1856 	if (feat->get) {
1857 		feat->get(sc, feat, command, compl);
1858 	}
1859 
1860 	if (compl->status == NVME_SC_SUCCESS) {
1861 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1862 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1863 		else
1864 			compl->cdw0 = feat->cdw11;
1865 	}
1866 
1867 	return (0);
1868 }
1869 
1870 static int
1871 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1872 	struct nvme_completion* compl)
1873 {
1874 	uint8_t	ses, lbaf, pi;
1875 
1876 	/* Only supports Secure Erase Setting - User Data Erase */
1877 	ses = (command->cdw10 >> 9) & 0x7;
1878 	if (ses > 0x1) {
1879 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1880 		return (1);
1881 	}
1882 
1883 	/* Only supports a single LBA Format */
1884 	lbaf = command->cdw10 & 0xf;
1885 	if (lbaf != 0) {
1886 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1887 		    NVME_SC_INVALID_FORMAT);
1888 		return (1);
1889 	}
1890 
1891 	/* Doesn't support Protection Information */
1892 	pi = (command->cdw10 >> 5) & 0x7;
1893 	if (pi != 0) {
1894 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1895 		return (1);
1896 	}
1897 
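	/*
	 * A RAM-backed namespace is "formatted" by replacing the backing
	 * buffer with a freshly zeroed allocation. A blockif-backed namespace
	 * is formatted by deleting (trimming) the entire backing store, with
	 * completion deferred to pci_nvme_io_done().
	 */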
1898 	if (sc->nvstore.type == NVME_STOR_RAM) {
1899 		if (sc->nvstore.ctx)
1900 			free(sc->nvstore.ctx);
1901 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1902 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1903 	} else {
1904 		struct pci_nvme_ioreq *req;
1905 		int err;
1906 
1907 		req = pci_nvme_get_ioreq(sc);
1908 		if (req == NULL) {
1909 			pci_nvme_status_genc(&compl->status,
1910 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1911 			WPRINTF("%s: unable to allocate IO req", __func__);
1912 			return (1);
1913 		}
1914 		req->nvme_sq = &sc->submit_queues[0];
1915 		req->sqid = 0;
1916 		req->opc = command->opc;
1917 		req->cid = command->cid;
1918 		req->nsid = command->nsid;
1919 
1920 		req->io_req.br_offset = 0;
1921 		req->io_req.br_resid = sc->nvstore.size;
1922 		req->io_req.br_callback = pci_nvme_io_done;
1923 
1924 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1925 		if (err) {
1926 			pci_nvme_status_genc(&compl->status,
1927 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1928 			pci_nvme_release_ioreq(sc, req);
1929 		} else
1930 			compl->status = NVME_NO_STATUS;
1931 	}
1932 
1933 	return (1);
1934 }
1935 
1936 static int
1937 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1938 	struct nvme_completion* compl)
1939 {
1940 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1941 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1942 
1943 	/* TODO: search for the command ID and abort it */
1944 
1945 	compl->cdw0 = 1;
1946 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1947 	return (1);
1948 }
1949 
1950 static int
1951 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1952 	struct nvme_command* command, struct nvme_completion* compl)
1953 {
1954 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1955 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1956 
1957 	/* Don't exceed the Async Event Request Limit (AERL). */
1958 	if (pci_nvme_aer_limit_reached(sc)) {
1959 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1960 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1961 		return (1);
1962 	}
1963 
1964 	if (pci_nvme_aer_add(sc, command->cid)) {
1965 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1966 				NVME_SC_INTERNAL_DEVICE_ERROR);
1967 		return (1);
1968 	}
1969 
1970 	/*
1971 	 * Raise events as they occur, based on the Async Event Configuration
1972 	 * set via Set Features. Because events are asynchronous, do not post
1973 	 * a completion here; one is posted when a matching event fires.
1974 	 */
1975 	compl->status = NVME_NO_STATUS;
1976 	pci_nvme_aen_notify(sc);
1977 
1978 	return (0);
1979 }
1980 
1981 static void
1982 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1983 {
1984 	struct nvme_completion compl;
1985 	struct nvme_command *cmd;
1986 	struct nvme_submission_queue *sq;
1987 	struct nvme_completion_queue *cq;
1988 	uint16_t sqhead;
1989 
1990 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1991 
1992 	sq = &sc->submit_queues[0];
1993 	cq = &sc->compl_queues[0];
1994 
1995 	pthread_mutex_lock(&sq->mtx);
1996 
1997 	sqhead = sq->head;
1998 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1999 
2000 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2001 		cmd = &(sq->qbase)[sqhead];
2002 		compl.cdw0 = 0;
2003 		compl.status = 0;
2004 
2005 		switch (cmd->opc) {
2006 		case NVME_OPC_DELETE_IO_SQ:
2007 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2008 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2009 			break;
2010 		case NVME_OPC_CREATE_IO_SQ:
2011 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2012 			nvme_opc_create_io_sq(sc, cmd, &compl);
2013 			break;
2014 		case NVME_OPC_DELETE_IO_CQ:
2015 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2016 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2017 			break;
2018 		case NVME_OPC_CREATE_IO_CQ:
2019 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2020 			nvme_opc_create_io_cq(sc, cmd, &compl);
2021 			break;
2022 		case NVME_OPC_GET_LOG_PAGE:
2023 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2024 			nvme_opc_get_log_page(sc, cmd, &compl);
2025 			break;
2026 		case NVME_OPC_IDENTIFY:
2027 			DPRINTF("%s command IDENTIFY", __func__);
2028 			nvme_opc_identify(sc, cmd, &compl);
2029 			break;
2030 		case NVME_OPC_ABORT:
2031 			DPRINTF("%s command ABORT", __func__);
2032 			nvme_opc_abort(sc, cmd, &compl);
2033 			break;
2034 		case NVME_OPC_SET_FEATURES:
2035 			DPRINTF("%s command SET_FEATURES", __func__);
2036 			nvme_opc_set_features(sc, cmd, &compl);
2037 			break;
2038 		case NVME_OPC_GET_FEATURES:
2039 			DPRINTF("%s command GET_FEATURES", __func__);
2040 			nvme_opc_get_features(sc, cmd, &compl);
2041 			break;
2042 		case NVME_OPC_FIRMWARE_ACTIVATE:
2043 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2044 			pci_nvme_status_tc(&compl.status,
2045 			    NVME_SCT_COMMAND_SPECIFIC,
2046 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2047 			break;
2048 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2049 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2050 			nvme_opc_async_event_req(sc, cmd, &compl);
2051 			break;
2052 		case NVME_OPC_FORMAT_NVM:
2053 			DPRINTF("%s command FORMAT_NVM", __func__);
2054 			if ((sc->ctrldata.oacs &
2055 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2056 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2057 				break;
2058 			}
2059 			nvme_opc_format_nvm(sc, cmd, &compl);
2060 			break;
2061 		case NVME_OPC_SECURITY_SEND:
2062 		case NVME_OPC_SECURITY_RECEIVE:
2063 		case NVME_OPC_SANITIZE:
2064 		case NVME_OPC_GET_LBA_STATUS:
2065 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2066 			    cmd->opc);
2067 			/* Valid but unsupported opcodes */
2068 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2069 			break;
2070 		default:
2071 			DPRINTF("%s command OPC=%#X (not implemented)",
2072 			    __func__,
2073 			    cmd->opc);
2074 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2075 		}
2076 		sqhead = (sqhead + 1) % sq->size;
2077 
2078 		if (NVME_COMPLETION_VALID(compl)) {
2079 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2080 			    compl.cdw0,
2081 			    cmd->cid,
2082 			    0,		/* SQID */
2083 			    compl.status);
2084 		}
2085 	}
2086 
2087 	DPRINTF("setting sqhead %u", sqhead);
2088 	sq->head = sqhead;
2089 
2090 	if (cq->head != cq->tail)
2091 		pci_generate_msix(sc->nsc_pi, 0);
2092 
2093 	pthread_mutex_unlock(&sq->mtx);
2094 }
2095 
2096 /*
2097  * Update the Write and Read statistics reported in SMART data
2098  *
2099  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2100  * E.g. 1 data unit covers 1 - 1,000 512 byte blocks and 3 data units cover
2101  * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
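 *
 * For example, a successful 4,096 byte write adds 8 to write_dunits_remainder;
 * each time the remainder reaches 1,000, one data unit is added to
 * write_data_units and 1,000 is subtracted from the remainder.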
2102  */
2103 static void
2104 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2105     size_t bytes, uint16_t status)
2106 {
2107 
2108 	pthread_mutex_lock(&sc->mtx);
2109 	switch (opc) {
2110 	case NVME_OPC_WRITE:
2111 		sc->write_commands++;
2112 		if (status != NVME_SC_SUCCESS)
2113 			break;
2114 		sc->write_dunits_remainder += (bytes / 512);
2115 		while (sc->write_dunits_remainder >= 1000) {
2116 			sc->write_data_units++;
2117 			sc->write_dunits_remainder -= 1000;
2118 		}
2119 		break;
2120 	case NVME_OPC_READ:
2121 		sc->read_commands++;
2122 		if (status != NVME_SC_SUCCESS)
2123 			break;
2124 		sc->read_dunits_remainder += (bytes / 512);
2125 		while (sc->read_dunits_remainder >= 1000) {
2126 			sc->read_data_units++;
2127 			sc->read_dunits_remainder -= 1000;
2128 		}
2129 		break;
2130 	default:
2131 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2132 		break;
2133 	}
2134 	pthread_mutex_unlock(&sc->mtx);
2135 }
2136 
2137 /*
2138  * Check if the combination of Starting LBA (slba) and number of blocks
2139  * exceeds the range of the underlying storage.
2140  *
2141  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2142  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2143  * overflow.
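 *
 * For example, with 4096 byte sectors (sectsz_bits = 12), an slba with any bit
 * set at position 52 or above would overflow the byte offset; the first check
 * below rejects such a request before the shift is performed.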
2144  */
2145 static bool
2146 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2147     uint32_t nblocks)
2148 {
2149 	size_t	offset, bytes;
2150 
2151 	/* Overflow check of multiplying Starting LBA by the sector size */
2152 	if (slba >> (64 - nvstore->sectsz_bits))
2153 		return (true);
2154 
2155 	offset = slba << nvstore->sectsz_bits;
2156 	bytes = (size_t)nblocks << nvstore->sectsz_bits;
2157 
2158 	/* Overflow check of Number of Logical Blocks */
2159 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2160 		return (true);
2161 
2162 	return (false);
2163 }
2164 
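/*
 * Append one guest-physical data page to the blockif request being built for
 * this I/O. Pages that are contiguous in guest-physical memory are merged into
 * the previous iovec entry to keep the entry count (at most NVME_MAX_IOVEC)
 * small.
 */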
2165 static int
2166 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2167 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2168 {
2169 	int iovidx;
2170 
2171 	if (req == NULL)
2172 		return (-1);
2173 
2174 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2175 		return (-1);
2176 	}
2177 
2178 	/* concatenate contig block-iovs to minimize number of iovs */
2179 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2180 		iovidx = req->io_req.br_iovcnt - 1;
2181 
2182 		req->io_req.br_iov[iovidx].iov_base =
2183 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2184 				     req->prev_gpaddr, size);
2185 
2186 		req->prev_size += size;
2187 		req->io_req.br_resid += size;
2188 
2189 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2190 	} else {
2191 		iovidx = req->io_req.br_iovcnt;
2192 		if (iovidx == 0) {
2193 			req->io_req.br_offset = lba;
2194 			req->io_req.br_resid = 0;
2195 			req->io_req.br_param = req;
2196 		}
2197 
2198 		req->io_req.br_iov[iovidx].iov_base =
2199 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2200 				     gpaddr, size);
2201 
2202 		req->io_req.br_iov[iovidx].iov_len = size;
2203 
2204 		req->prev_gpaddr = gpaddr;
2205 		req->prev_size = size;
2206 		req->io_req.br_resid += size;
2207 
2208 		req->io_req.br_iovcnt++;
2209 	}
2210 
2211 	return (0);
2212 }
2213 
2214 static void
2215 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2216 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2217 	uint32_t cdw0, uint16_t status)
2218 {
2219 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2220 
2221 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2222 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2223 		 NVME_STATUS_GET_SC(status));
2224 
2225 	pci_nvme_cq_update(sc, cq,
2226 	    cdw0,	/* CDW0 */
2227 	    cid,
2228 	    sqid,
2229 	    status);
2230 
2231 	if (cq->head != cq->tail) {
2232 		if (cq->intr_en & NVME_CQ_INTEN) {
2233 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2234 		} else {
2235 			DPRINTF("%s: CQ%u interrupt disabled",
2236 						__func__, sq->cqid);
2237 		}
2238 	}
2239 }
2240 
2241 static void
2242 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2243 {
2244 	req->sc = NULL;
2245 	req->nvme_sq = NULL;
2246 	req->sqid = 0;
2247 
2248 	pthread_mutex_lock(&sc->mtx);
2249 
2250 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2251 	sc->pending_ios--;
2252 
2253 	/* once no more IO is pending, set Ready if the device has been reset or enabled */
2254 	if (sc->pending_ios == 0 &&
2255 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2256 		sc->regs.csts |= NVME_CSTS_RDY;
2257 
2258 	pthread_mutex_unlock(&sc->mtx);
2259 
2260 	sem_post(&sc->iosemlock);
2261 }
2262 
2263 static struct pci_nvme_ioreq *
2264 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2265 {
2266 	struct pci_nvme_ioreq *req = NULL;
2267 
2268 	sem_wait(&sc->iosemlock);
2269 	pthread_mutex_lock(&sc->mtx);
2270 
2271 	req = STAILQ_FIRST(&sc->ioreqs_free);
2272 	assert(req != NULL);
2273 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2274 
2275 	req->sc = sc;
2276 
2277 	sc->pending_ios++;
2278 
2279 	pthread_mutex_unlock(&sc->mtx);
2280 
2281 	req->io_req.br_iovcnt = 0;
2282 	req->io_req.br_offset = 0;
2283 	req->io_req.br_resid = 0;
2284 	req->io_req.br_param = req;
2285 	req->prev_gpaddr = 0;
2286 	req->prev_size = 0;
2287 
2288 	return req;
2289 }
2290 
2291 static void
2292 pci_nvme_io_done(struct blockif_req *br, int err)
2293 {
2294 	struct pci_nvme_ioreq *req = br->br_param;
2295 	struct nvme_submission_queue *sq = req->nvme_sq;
2296 	uint16_t code, status;
2297 
2298 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2299 
2300 	/* TODO return correct error */
2301 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2302 	pci_nvme_status_genc(&status, code);
2303 
2304 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2305 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2306 	    req->bytes, status);
2307 	pci_nvme_release_ioreq(req->sc, req);
2308 }
2309 
2310 /*
2311  * Implements the Flush command. The specification states:
2312  *    If a volatile write cache is not present, Flush commands complete
2313  *    successfully and have no effect
2314  * in the description of the Volatile Write Cache (VWC) field of the Identify
2315  * Controller data. Therefore, set status to Success if the command is
2316  * not supported (i.e. RAM or as indicated by the blockif).
2317  */
2318 static bool
2319 nvme_opc_flush(struct pci_nvme_softc *sc,
2320     struct nvme_command *cmd,
2321     struct pci_nvme_blockstore *nvstore,
2322     struct pci_nvme_ioreq *req,
2323     uint16_t *status)
2324 {
2325 	bool pending = false;
2326 
2327 	if (nvstore->type == NVME_STOR_RAM) {
2328 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2329 	} else {
2330 		int err;
2331 
2332 		req->io_req.br_callback = pci_nvme_io_done;
2333 
2334 		err = blockif_flush(nvstore->ctx, &req->io_req);
2335 		switch (err) {
2336 		case 0:
2337 			pending = true;
2338 			break;
2339 		case EOPNOTSUPP:
2340 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2341 			break;
2342 		default:
2343 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2344 		}
2345 	}
2346 
2347 	return (pending);
2348 }
2349 
2350 static uint16_t
2351 nvme_write_read_ram(struct pci_nvme_softc *sc,
2352     struct pci_nvme_blockstore *nvstore,
2353     uint64_t prp1, uint64_t prp2,
2354     size_t offset, uint64_t bytes,
2355     bool is_write)
2356 {
2357 	uint8_t *buf = nvstore->ctx;
2358 	enum nvme_copy_dir dir;
2359 	uint16_t status;
2360 
2361 	if (is_write)
2362 		dir = NVME_COPY_TO_PRP;
2363 	else
2364 		dir = NVME_COPY_FROM_PRP;
2365 
2366 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2367 	    buf + offset, bytes, dir))
2368 		pci_nvme_status_genc(&status,
2369 		    NVME_SC_DATA_TRANSFER_ERROR);
2370 	else
2371 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2372 
2373 	return (status);
2374 }
2375 
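/*
 * Perform a Read or Write against a blockif backing store. PRP1 points at the
 * first data page and may begin at an offset within that page. If the transfer
 * needs more data, PRP2 is either the second (and last) data page or the
 * guest-physical address of a PRP list holding the remaining page pointers.
 */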
2376 static uint16_t
2377 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2378     struct pci_nvme_blockstore *nvstore,
2379     struct pci_nvme_ioreq *req,
2380     uint64_t prp1, uint64_t prp2,
2381     size_t offset, uint64_t bytes,
2382     bool is_write)
2383 {
2384 	uint64_t size;
2385 	int err;
2386 	uint16_t status = NVME_NO_STATUS;
2387 
2388 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2389 	if (pci_nvme_append_iov_req(sc, req, prp1,
2390 	    size, is_write, offset)) {
2391 		pci_nvme_status_genc(&status,
2392 		    NVME_SC_DATA_TRANSFER_ERROR);
2393 		goto out;
2394 	}
2395 
2396 	offset += size;
2397 	bytes  -= size;
2398 
2399 	if (bytes == 0) {
2400 		;
2401 	} else if (bytes <= PAGE_SIZE) {
2402 		size = bytes;
2403 		if (pci_nvme_append_iov_req(sc, req, prp2,
2404 		    size, is_write, offset)) {
2405 			pci_nvme_status_genc(&status,
2406 			    NVME_SC_DATA_TRANSFER_ERROR);
2407 			goto out;
2408 		}
2409 	} else {
2410 		void *vmctx = sc->nsc_pi->pi_vmctx;
2411 		uint64_t *prp_list = &prp2;
2412 		uint64_t *last = prp_list;
2413 
2414 		/* PRP2 is pointer to a physical region page list */
2415 		while (bytes) {
2416 			/* Last entry in list points to the next list */
2417 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2418 				uint64_t prp = *prp_list;
2419 
2420 				prp_list = paddr_guest2host(vmctx, prp,
2421 				    PAGE_SIZE - (prp % PAGE_SIZE));
2422 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2423 			}
2424 
2425 			size = MIN(bytes, PAGE_SIZE);
2426 
2427 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2428 			    size, is_write, offset)) {
2429 				pci_nvme_status_genc(&status,
2430 				    NVME_SC_DATA_TRANSFER_ERROR);
2431 				goto out;
2432 			}
2433 
2434 			offset += size;
2435 			bytes  -= size;
2436 
2437 			prp_list++;
2438 		}
2439 	}
2440 	req->io_req.br_callback = pci_nvme_io_done;
2441 	if (is_write)
2442 		err = blockif_write(nvstore->ctx, &req->io_req);
2443 	else
2444 		err = blockif_read(nvstore->ctx, &req->io_req);
2445 
2446 	if (err)
2447 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2448 out:
2449 	return (status);
2450 }
2451 
2452 static bool
2453 nvme_opc_write_read(struct pci_nvme_softc *sc,
2454     struct nvme_command *cmd,
2455     struct pci_nvme_blockstore *nvstore,
2456     struct pci_nvme_ioreq *req,
2457     uint16_t *status)
2458 {
2459 	uint64_t lba, nblocks, bytes;
2460 	size_t offset;
2461 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2462 	bool pending = false;
2463 
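	/*
	 * CDW10/CDW11 form the 64-bit Starting LBA; CDW12 bits 15:0 hold the
	 * zero-based Number of Logical Blocks.
	 */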
2464 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2465 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2466 
2467 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2468 		WPRINTF("%s command would exceed LBA range (slba=%#lx nblocks=%#lx)",
2469 		    __func__, lba, nblocks);
2470 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2471 		goto out;
2472 	}
2473 
2474 	bytes  = nblocks << nvstore->sectsz_bits;
2475 	if (bytes > NVME_MAX_DATA_SIZE) {
2476 		WPRINTF("%s command would exceed MDTS", __func__);
2477 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2478 		goto out;
2479 	}
2480 
2481 	offset = lba << nvstore->sectsz_bits;
2482 
2483 	req->bytes = bytes;
2484 	req->io_req.br_offset = lba;
2485 
2486 	/* PRP bits 1:0 must be zero */
2487 	cmd->prp1 &= ~0x3UL;
2488 	cmd->prp2 &= ~0x3UL;
2489 
2490 	if (nvstore->type == NVME_STOR_RAM) {
2491 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2492 		    cmd->prp2, offset, bytes, is_write);
2493 	} else {
2494 		*status = nvme_write_read_blockif(sc, nvstore, req,
2495 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2496 
2497 		if (*status == NVME_NO_STATUS)
2498 			pending = true;
2499 	}
2500 out:
2501 	if (!pending)
2502 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2503 
2504 	return (pending);
2505 }
2506 
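/*
 * Completion callback for Dataset Management deallocate requests covering more
 * than one range. The ranges were staged in br_iov by nvme_opc_dataset_mgmt();
 * prev_gpaddr tracks the index of the range just completed and prev_size the
 * total number of ranges. Each callback issues blockif_delete() for the next
 * range until all ranges are done or an error occurs.
 */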
2507 static void
2508 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2509 {
2510 	struct pci_nvme_ioreq *req = br->br_param;
2511 	struct pci_nvme_softc *sc = req->sc;
2512 	bool done = true;
2513 	uint16_t status;
2514 
2515 	if (err) {
2516 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2517 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2518 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2519 	} else {
2520 		struct iovec *iov = req->io_req.br_iov;
2521 
2522 		req->prev_gpaddr++;
2523 		iov += req->prev_gpaddr;
2524 
2525 		/* The iov_* values already include the sector size */
2526 		req->io_req.br_offset = (off_t)iov->iov_base;
2527 		req->io_req.br_resid = iov->iov_len;
2528 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2529 			pci_nvme_status_genc(&status,
2530 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2531 		} else
2532 			done = false;
2533 	}
2534 
2535 	if (done) {
2536 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2537 		    req->cid, 0, status);
2538 		pci_nvme_release_ioreq(sc, req);
2539 	}
2540 }
2541 
2542 static bool
2543 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2544     struct nvme_command *cmd,
2545     struct pci_nvme_blockstore *nvstore,
2546     struct pci_nvme_ioreq *req,
2547     uint16_t *status)
2548 {
2549 	struct nvme_dsm_range *range;
2550 	uint32_t nr, r, non_zero, dr;
2551 	int err;
2552 	bool pending = false;
2553 
2554 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2555 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2556 		goto out;
2557 	}
2558 
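	/*
	 * Number of Ranges (CDW10 bits 7:0) is zero-based: nr + 1 range
	 * descriptors are transferred via PRP.
	 */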
2559 	nr = cmd->cdw10 & 0xff;
2560 
2561 	/* copy locally because a range entry could straddle PRPs */
2562 	range = calloc(1, NVME_MAX_DSM_TRIM);
2563 	if (range == NULL) {
2564 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2565 		goto out;
2566 	}
2567 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2568 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2569 
2570 	/* Check for invalid ranges and the number of non-zero lengths */
2571 	non_zero = 0;
2572 	for (r = 0; r <= nr; r++) {
2573 		if (pci_nvme_out_of_range(nvstore,
2574 		    range[r].starting_lba, range[r].length)) {
2575 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2576 			goto out;
2577 		}
2578 		if (range[r].length != 0)
2579 			non_zero++;
2580 	}
2581 
2582 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2583 		size_t offset, bytes;
2584 		int sectsz_bits = sc->nvstore.sectsz_bits;
2585 
2586 		/*
2587 		 * DSM calls are advisory only, and compliant controllers
2588 		 * may choose to take no actions (i.e. return Success).
2589 		 */
2590 		if (!nvstore->deallocate) {
2591 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2592 			goto out;
2593 		}
2594 
2595 		/* If all ranges have a zero length, return Success */
2596 		if (non_zero == 0) {
2597 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2598 			goto out;
2599 		}
2600 
2601 		if (req == NULL) {
2602 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2603 			goto out;
2604 		}
2605 
2606 		offset = range[0].starting_lba << sectsz_bits;
2607 		bytes = range[0].length << sectsz_bits;
2608 
2609 		/*
2610 		 * If the request is for more than a single range, store
2611 		 * the ranges in the br_iov. Optimize for the common case
2612 		 * of a single range.
2613 		 *
2614 		 * Note that NVMe Number of Ranges is a zero based value
2615 		 */
2616 		req->io_req.br_iovcnt = 0;
2617 		req->io_req.br_offset = offset;
2618 		req->io_req.br_resid = bytes;
2619 
2620 		if (nr == 0) {
2621 			req->io_req.br_callback = pci_nvme_io_done;
2622 		} else {
2623 			struct iovec *iov = req->io_req.br_iov;
2624 
2625 			for (r = 0, dr = 0; r <= nr; r++) {
2626 				offset = range[r].starting_lba << sectsz_bits;
2627 				bytes = range[r].length << sectsz_bits;
2628 				if (bytes == 0)
2629 					continue;
2630 
2631 				if ((nvstore->size - offset) < bytes) {
2632 					pci_nvme_status_genc(status,
2633 					    NVME_SC_LBA_OUT_OF_RANGE);
2634 					goto out;
2635 				}
2636 				iov[dr].iov_base = (void *)offset;
2637 				iov[dr].iov_len = bytes;
2638 				dr++;
2639 			}
2640 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2641 
2642 			/*
2643 			 * Use prev_gpaddr to track the current entry and
2644 			 * prev_size to track the number of entries
2645 			 */
2646 			req->prev_gpaddr = 0;
2647 			req->prev_size = dr;
2648 		}
2649 
2650 		err = blockif_delete(nvstore->ctx, &req->io_req);
2651 		if (err)
2652 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2653 		else
2654 			pending = true;
2655 	}
2656 out:
2657 	free(range);
2658 	return (pending);
2659 }
2660 
2661 static void
2662 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2663 {
2664 	struct nvme_submission_queue *sq;
2665 	uint16_t status;
2666 	uint16_t sqhead;
2667 
2668 	/* handle all submissions up to sq->tail index */
2669 	sq = &sc->submit_queues[idx];
2670 
2671 	pthread_mutex_lock(&sq->mtx);
2672 
2673 	sqhead = sq->head;
2674 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2675 	         idx, sqhead, sq->tail, sq->qbase);
2676 
2677 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2678 		struct nvme_command *cmd;
2679 		struct pci_nvme_ioreq *req;
2680 		uint32_t nsid;
2681 		bool pending;
2682 
2683 		pending = false;
2684 		req = NULL;
2685 		status = 0;
2686 
2687 		cmd = &sq->qbase[sqhead];
2688 		sqhead = (sqhead + 1) % sq->size;
2689 
2690 		nsid = le32toh(cmd->nsid);
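		/*
		 * NSID 0 and values above the Number of Namespaces (nn) are
		 * invalid for I/O commands; fail them with Do Not Retry set.
		 */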
2691 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2692 			pci_nvme_status_genc(&status,
2693 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2694 			status |=
2695 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2696 			goto complete;
2697  		}
2698 
2699 		req = pci_nvme_get_ioreq(sc);
2700 		if (req == NULL) {
2701 			pci_nvme_status_genc(&status,
2702 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2703 			WPRINTF("%s: unable to allocate IO req", __func__);
2704 			goto complete;
2705 		}
2706 		req->nvme_sq = sq;
2707 		req->sqid = idx;
2708 		req->opc = cmd->opc;
2709 		req->cid = cmd->cid;
2710 		req->nsid = cmd->nsid;
2711 
2712 		switch (cmd->opc) {
2713 		case NVME_OPC_FLUSH:
2714 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2715 			    req, &status);
2716  			break;
2717 		case NVME_OPC_WRITE:
2718 		case NVME_OPC_READ:
2719 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2720 			    req, &status);
2721 			break;
2722 		case NVME_OPC_WRITE_ZEROES:
2723 			/* TODO: write zeroes
2724 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2725 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2726 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2727 			break;
2728 		case NVME_OPC_DATASET_MANAGEMENT:
2729  			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2730 			    req, &status);
2731 			break;
2732  		default:
2733  			WPRINTF("%s unhandled io command 0x%x",
2734 			    __func__, cmd->opc);
2735 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2736 		}
2737 complete:
2738 		if (!pending) {
2739 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2740 			    status);
2741 			if (req != NULL)
2742 				pci_nvme_release_ioreq(sc, req);
2743 		}
2744 	}
2745 
2746 	sq->head = sqhead;
2747 
2748 	pthread_mutex_unlock(&sq->mtx);
2749 }
2750 
2751 static void
2752 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2753 	uint64_t idx, int is_sq, uint64_t value)
2754 {
2755 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2756 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2757 
2758 	if (is_sq) {
2759 		if (idx > sc->num_squeues) {
2760 			WPRINTF("%s queue index %lu overflow from "
2761 			         "guest (max %u)",
2762 			         __func__, idx, sc->num_squeues);
2763 			return;
2764 		}
2765 
2766 		atomic_store_short(&sc->submit_queues[idx].tail,
2767 		                   (uint16_t)value);
2768 
2769 		if (idx == 0) {
2770 			pci_nvme_handle_admin_cmd(sc, value);
2771 		} else {
2772 			/* submission queue; handle new entries in SQ */
2773 			if (idx > sc->num_squeues) {
2774 				WPRINTF("%s SQ index %lu overflow from "
2775 				         "guest (max %u)",
2776 				         __func__, idx, sc->num_squeues);
2777 				return;
2778 			}
2779 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2780 		}
2781 	} else {
2782 		if (idx > sc->num_cqueues) {
2783 			WPRINTF("%s queue index %lu overflow from "
2784 			         "guest (max %u)",
2785 			         __func__, idx, sc->num_cqueues);
2786 			return;
2787 		}
2788 
2789 		atomic_store_short(&sc->compl_queues[idx].head,
2790 				(uint16_t)value);
2791 	}
2792 }
2793 
2794 static void
2795 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2796 {
2797 	const char *s = iswrite ? "WRITE" : "READ";
2798 
2799 	switch (offset) {
2800 	case NVME_CR_CAP_LOW:
2801 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2802 		break;
2803 	case NVME_CR_CAP_HI:
2804 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2805 		break;
2806 	case NVME_CR_VS:
2807 		DPRINTF("%s %s NVME_CR_VS", func, s);
2808 		break;
2809 	case NVME_CR_INTMS:
2810 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2811 		break;
2812 	case NVME_CR_INTMC:
2813 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2814 		break;
2815 	case NVME_CR_CC:
2816 		DPRINTF("%s %s NVME_CR_CC", func, s);
2817 		break;
2818 	case NVME_CR_CSTS:
2819 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2820 		break;
2821 	case NVME_CR_NSSR:
2822 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2823 		break;
2824 	case NVME_CR_AQA:
2825 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2826 		break;
2827 	case NVME_CR_ASQ_LOW:
2828 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2829 		break;
2830 	case NVME_CR_ASQ_HI:
2831 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2832 		break;
2833 	case NVME_CR_ACQ_LOW:
2834 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2835 		break;
2836 	case NVME_CR_ACQ_HI:
2837 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2838 		break;
2839 	default:
2840 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2841 	}
2842 
2843 }
2844 
2845 static void
2846 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2847 	uint64_t offset, int size, uint64_t value)
2848 {
2849 	uint32_t ccreg;
2850 
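	/*
	 * Doorbell registers start at NVME_DOORBELL_OFFSET. With the default
	 * stride (CAP.DSTRD = 0), queue pair y uses 8 bytes: the SQ y Tail
	 * doorbell at 8 * y and the CQ y Head doorbell at 8 * y + 4, which is
	 * what the idx and is_sq calculations below decode.
	 */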
2851 	if (offset >= NVME_DOORBELL_OFFSET) {
2852 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2853 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2854 		int is_sq = (belloffset % 8) < 4;
2855 
2856 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2857 			WPRINTF("guest attempted an overflow write offset "
2858 			         "0x%lx, val 0x%lx in %s",
2859 			         offset, value, __func__);
2860 			return;
2861 		}
2862 
2863 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2864 		return;
2865 	}
2866 
2867 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2868 	        offset, size, value);
2869 
2870 	if (size != 4) {
2871 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2872 		         "val 0x%lx) to bar0 in %s",
2873 		         size, offset, value, __func__);
2874 		/* TODO: shutdown device */
2875 		return;
2876 	}
2877 
2878 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2879 
2880 	pthread_mutex_lock(&sc->mtx);
2881 
2882 	switch (offset) {
2883 	case NVME_CR_CAP_LOW:
2884 	case NVME_CR_CAP_HI:
2885 		/* readonly */
2886 		break;
2887 	case NVME_CR_VS:
2888 		/* readonly */
2889 		break;
2890 	case NVME_CR_INTMS:
2891 		/* MSI-X, so ignore */
2892 		break;
2893 	case NVME_CR_INTMC:
2894 		/* MSI-X, so ignore */
2895 		break;
2896 	case NVME_CR_CC:
2897 		ccreg = (uint32_t)value;
2898 
2899 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2900 		         "iocqes %u",
2901 		        __func__,
2902 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2903 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2904 			 NVME_CC_GET_IOCQES(ccreg));
2905 
2906 		if (NVME_CC_GET_SHN(ccreg)) {
2907 			/* perform shutdown - flush out data to backend */
2908 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2909 			    NVME_CSTS_REG_SHST_SHIFT);
2910 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2911 			    NVME_CSTS_REG_SHST_SHIFT;
2912 		}
2913 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2914 			if (NVME_CC_GET_EN(ccreg) == 0)
2915 				/* transition 1->0 causes controller reset */
2916 				pci_nvme_reset_locked(sc);
2917 			else
2918 				pci_nvme_init_controller(ctx, sc);
2919 		}
2920 
2921 		/* Insert the iocqes, iosqes and en bits from the write */
2922 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2923 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2924 		if (NVME_CC_GET_EN(ccreg) == 0) {
2925 			/* Insert the ams, mps and css bit fields */
2926 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2927 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2928 			sc->regs.csts &= ~NVME_CSTS_RDY;
2929 		} else if (sc->pending_ios == 0) {
2930 			sc->regs.csts |= NVME_CSTS_RDY;
2931 		}
2932 		break;
2933 	case NVME_CR_CSTS:
2934 		break;
2935 	case NVME_CR_NSSR:
2936 		/* ignore writes; don't support subsystem reset */
2937 		break;
2938 	case NVME_CR_AQA:
2939 		sc->regs.aqa = (uint32_t)value;
2940 		break;
2941 	case NVME_CR_ASQ_LOW:
2942 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2943 		               (0xFFFFF000 & value);
2944 		break;
2945 	case NVME_CR_ASQ_HI:
2946 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2947 		               (value << 32);
2948 		break;
2949 	case NVME_CR_ACQ_LOW:
2950 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2951 		               (0xFFFFF000 & value);
2952 		break;
2953 	case NVME_CR_ACQ_HI:
2954 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2955 		               (value << 32);
2956 		break;
2957 	default:
2958 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2959 		         __func__, offset, value, size);
2960 	}
2961 	pthread_mutex_unlock(&sc->mtx);
2962 }
2963 
2964 static void
2965 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2966                 int baridx, uint64_t offset, int size, uint64_t value)
2967 {
2968 	struct pci_nvme_softc* sc = pi->pi_arg;
2969 
2970 	if (baridx == pci_msix_table_bar(pi) ||
2971 	    baridx == pci_msix_pba_bar(pi)) {
2972 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2973 		         " value 0x%lx", baridx, offset, size, value);
2974 
2975 		pci_emul_msix_twrite(pi, offset, size, value);
2976 		return;
2977 	}
2978 
2979 	switch (baridx) {
2980 	case 0:
2981 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2982 		break;
2983 
2984 	default:
2985 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2986 		         __func__, baridx, value);
2987 	}
2988 }
2989 
2990 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2991 	uint64_t offset, int size)
2992 {
2993 	uint64_t value;
2994 
2995 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2996 
2997 	if (offset < NVME_DOORBELL_OFFSET) {
2998 		void *p = &(sc->regs);
2999 		pthread_mutex_lock(&sc->mtx);
3000 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3001 		pthread_mutex_unlock(&sc->mtx);
3002 	} else {
3003 		value = 0;
3004                 WPRINTF("pci_nvme: read invalid offset %ld", offset);
3005 	}
3006 
3007 	switch (size) {
3008 	case 1:
3009 		value &= 0xFF;
3010 		break;
3011 	case 2:
3012 		value &= 0xFFFF;
3013 		break;
3014 	case 4:
3015 		value &= 0xFFFFFFFF;
3016 		break;
3017 	}
3018 
3019 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3020 	         offset, size, (uint32_t)value);
3021 
3022 	return (value);
3023 }
3024 
3025 
3026 
3027 static uint64_t
3028 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
3029     uint64_t offset, int size)
3030 {
3031 	struct pci_nvme_softc* sc = pi->pi_arg;
3032 
3033 	if (baridx == pci_msix_table_bar(pi) ||
3034 	    baridx == pci_msix_pba_bar(pi)) {
3035 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3036 		        baridx, offset, size);
3037 
3038 		return pci_emul_msix_tread(pi, offset, size);
3039 	}
3040 
3041 	switch (baridx) {
3042 	case 0:
3043        		return pci_nvme_read_bar_0(sc, offset, size);
3044 
3045 	default:
3046 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3047 	}
3048 
3049 	return (0);
3050 }
3051 
3052 static int
3053 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3054 {
3055 	char bident[sizeof("XX:X:X")];
3056 	const char *value;
3057 	uint32_t sectsz;
3058 
3059 	sc->max_queues = NVME_QUEUES;
3060 	sc->max_qentries = NVME_MAX_QENTRIES;
3061 	sc->ioslots = NVME_IOSLOTS;
3062 	sc->num_squeues = sc->max_queues;
3063 	sc->num_cqueues = sc->max_queues;
3064 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3065 	sectsz = 0;
3066 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3067 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3068 
3069 	value = get_config_value_node(nvl, "maxq");
3070 	if (value != NULL)
3071 		sc->max_queues = atoi(value);
3072 	value = get_config_value_node(nvl, "qsz");
3073 	if (value != NULL) {
3074 		sc->max_qentries = atoi(value);
3075 		if (sc->max_qentries <= 0) {
3076 			EPRINTLN("nvme: Invalid qsz option %d",
3077 			    sc->max_qentries);
3078 			return (-1);
3079 		}
3080 	}
3081 	value = get_config_value_node(nvl, "ioslots");
3082 	if (value != NULL) {
3083 		sc->ioslots = atoi(value);
3084 		if (sc->ioslots <= 0) {
3085 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3086 			return (-1);
3087 		}
3088 	}
3089 	value = get_config_value_node(nvl, "sectsz");
3090 	if (value != NULL)
3091 		sectsz = atoi(value);
3092 	value = get_config_value_node(nvl, "ser");
3093 	if (value != NULL) {
3094 		/*
3095 		 * This field indicates the Product Serial Number in
3096 		 * 7-bit ASCII, unused bytes should be space characters.
3097 		 * Ref: NVMe v1.3c.
3098 		 */
3099 		cpywithpad((char *)sc->ctrldata.sn,
3100 		    sizeof(sc->ctrldata.sn), value, ' ');
3101 	}
3102 	value = get_config_value_node(nvl, "eui64");
3103 	if (value != NULL)
3104 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3105 	value = get_config_value_node(nvl, "dsm");
3106 	if (value != NULL) {
3107 		if (strcmp(value, "auto") == 0)
3108 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3109 		else if (strcmp(value, "enable") == 0)
3110 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3111 		else if (strcmp(value, "disable") == 0)
3112 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3113 	}
3114 
3115 	value = get_config_value_node(nvl, "ram");
3116 	if (value != NULL) {
3117 		uint64_t sz = strtoull(value, NULL, 10);
3118 
3119 		sc->nvstore.type = NVME_STOR_RAM;
3120 		sc->nvstore.size = sz * 1024 * 1024;
3121 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3122 		sc->nvstore.sectsz = 4096;
3123 		sc->nvstore.sectsz_bits = 12;
3124 		if (sc->nvstore.ctx == NULL) {
3125 			EPRINTLN("nvme: Unable to allocate RAM");
3126 			return (-1);
3127 		}
3128 	} else {
3129 		snprintf(bident, sizeof(bident), "%d:%d",
3130 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3131 		sc->nvstore.ctx = blockif_open(nvl, bident);
3132 		if (sc->nvstore.ctx == NULL) {
3133 			EPRINTLN("nvme: Could not open backing file: %s",
3134 			    strerror(errno));
3135 			return (-1);
3136 		}
3137 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3138 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3139 	}
3140 
3141 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3142 		sc->nvstore.sectsz = sectsz;
3143 	else if (sc->nvstore.type != NVME_STOR_RAM)
3144 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3145 	for (sc->nvstore.sectsz_bits = 9;
3146 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3147 	     sc->nvstore.sectsz_bits++);
3148 
3149 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3150 		sc->max_queues = NVME_QUEUES;
3151 
3152 	return (0);
3153 }
3154 
3155 static void
3156 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3157 {
3158 	struct pci_nvme_softc *sc;
3159 	struct pci_nvme_blockstore *nvstore;
3160 	struct nvme_namespace_data *nd;
3161 
3162 	sc = arg;
3163 	nvstore = &sc->nvstore;
3164 	nd = &sc->nsdata;
3165 
3166 	nvstore->size = new_size;
3167 	pci_nvme_init_nsdata_size(nvstore, nd);
3168 
3169 	/* Add changed NSID to list */
3170 	sc->ns_log.ns[0] = 1;
3171 	sc->ns_log.ns[1] = 0;
3172 
3173 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3174 	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
3175 }
3176 
3177 static int
3178 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3179 {
3180 	struct pci_nvme_softc *sc;
3181 	uint32_t pci_membar_sz;
3182 	int	error;
3183 
3184 	error = 0;
3185 
3186 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3187 	pi->pi_arg = sc;
3188 	sc->nsc_pi = pi;
3189 
3190 	error = pci_nvme_parse_config(sc, nvl);
3191 	if (error < 0)
3192 		goto done;
3193 	else
3194 		error = 0;
3195 
3196 	STAILQ_INIT(&sc->ioreqs_free);
3197 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3198 	for (int i = 0; i < sc->ioslots; i++) {
3199 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3200 	}
3201 
3202 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3203 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3204 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3205 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3206 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3207 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3208 
3209 	/*
3210 	 * Allocate size of NVMe registers + doorbell space for all queues.
3211 	 *
3212 	 * The specification requires a minimum memory I/O window size of 16K.
3213 	 * The Windows driver will refuse to start a device with a smaller
3214 	 * window.
3215 	 */
3216 	pci_membar_sz = sizeof(struct nvme_registers) +
3217 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3218 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3219 
3220 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3221 
3222 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3223 	if (error) {
3224 		WPRINTF("%s pci alloc mem bar failed", __func__);
3225 		goto done;
3226 	}
3227 
3228 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3229 	if (error) {
3230 		WPRINTF("%s pci add msixcap failed", __func__);
3231 		goto done;
3232 	}
3233 
3234 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3235 	if (error) {
3236 		WPRINTF("%s pci add Express capability failed", __func__);
3237 		goto done;
3238 	}
3239 
3240 	pthread_mutex_init(&sc->mtx, NULL);
3241 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3242 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3243 
3244 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3245 	/*
3246 	 * Controller data depends on Namespace data so initialize Namespace
3247 	 * data first.
3248 	 */
3249 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3250 	pci_nvme_init_ctrldata(sc);
3251 	pci_nvme_init_logpages(sc);
3252 	pci_nvme_init_features(sc);
3253 
3254 	pci_nvme_aer_init(sc);
3255 	pci_nvme_aen_init(sc);
3256 
3257 	pci_nvme_reset(sc);
3258 
3259 	pci_lintr_request(pi);
3260 
3261 done:
3262 	return (error);
3263 }
3264 
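/*
 * Convert a legacy "-s <n>,nvme,<opts>" option string into config nodes. A
 * leading "ram=" option selects RAM backing (size in MiB); anything else is
 * handed to blockif_legacy_config() to parse as a backing path and options.
 */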
3265 static int
3266 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3267 {
3268 	char *cp, *ram;
3269 
3270 	if (opts == NULL)
3271 		return (0);
3272 
3273 	if (strncmp(opts, "ram=", 4) == 0) {
3274 		cp = strchr(opts, ',');
3275 		if (cp == NULL) {
3276 			set_config_value_node(nvl, "ram", opts + 4);
3277 			return (0);
3278 		}
3279 		ram = strndup(opts + 4, cp - opts - 4);
3280 		set_config_value_node(nvl, "ram", ram);
3281 		free(ram);
3282 		return (pci_parse_legacy_config(nvl, cp + 1));
3283 	} else
3284 		return (blockif_legacy_config(nvl, opts));
3285 }
3286 
3287 struct pci_devemu pci_de_nvme = {
3288 	.pe_emu =	"nvme",
3289 	.pe_init =	pci_nvme_init,
3290 	.pe_legacy_config = pci_nvme_legacy_config,
3291 	.pe_barwrite =	pci_nvme_write,
3292 	.pe_barread =	pci_nvme_read
3293 };
3294 PCI_EMUL_SET(pci_de_nvme);
3295