xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision 251becc882939aaf03088561add2c257a7a92424)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
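/*
 * Example invocation (illustrative only; the device path, slot number and
 * option values below are hypothetical):
 *
 *   -s 4,nvme,/dev/zvol/rdsk/tank/vm0,maxq=8,qsz=1024,ioslots=16,sectsz=4096,ser=NVME0001,dsm=auto
 *
 * A RAM-backed namespace uses the ram= form of devpath instead:
 *
 *   -s 4,nvme,ram=1024
 */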
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
83 
84 #include <dev/nvme/nvme.h>
85 
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
91 
92 
93 static int nvme_debug = 0;
94 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
96 
97 /* defaults; can be overridden */
98 #define	NVME_MSIX_BAR		4
99 
100 #define	NVME_IOSLOTS		8
101 
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN	(1 << 14)
104 
105 #define	NVME_QUEUES		16
106 #define	NVME_MAX_QENTRIES	2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define	NVME_MPSMIN		0
109 /* MPSMIN converted to bytes */
110 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
111 
112 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
113 #define	NVME_MDTS		9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
116 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
117 
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS		0xffff
120 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
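/*
 * Worked example (illustrative): if the host requested and was granted
 * 4 SQs and 2 CQs, num_squeues = 4 and num_cqueues = 2, so the macro
 * yields the zero-based encoding 0x00010003 (NCQR = 1 in bits 31:16,
 * NSQR = 3 in bits 15:0).
 */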
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
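/*
 * Worked example (illustrative): with NVME_MDTS = 9, NVME_MAX_IOVEC is
 * (1 << 9) + 1 = 513.  If BLOCKIF_IOV_MAX were 128 (its real value is
 * defined in block_if.h), MDTS_PAD_SIZE would evaluate to 513 - 128 = 385
 * padding iovec entries per ioreq.
 */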
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 typedef enum {
273 	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
274 	PCI_NVME_AE_INFO_FW_ACTIVATION,
275 	PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
276 	PCI_NVME_AE_INFO_ANA_CHANGE,
277 	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
278 	PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
279 	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
280 	PCI_NVME_AE_INFO_MAX,
281 } pci_nvme_async_info;
282 
283 /* Asynchronous Event Notifications */
284 struct pci_nvme_aen {
285 	pci_nvme_async_type atype;
286 	uint32_t	event_data;
287 	bool		posted;
288 };
289 
290 struct pci_nvme_softc {
291 	struct pci_devinst *nsc_pi;
292 
293 	pthread_mutex_t	mtx;
294 
295 	struct nvme_registers regs;
296 
297 	struct nvme_namespace_data  nsdata;
298 	struct nvme_controller_data ctrldata;
299 	struct nvme_error_information_entry err_log;
300 	struct nvme_health_information_page health_log;
301 	struct nvme_firmware_page fw_log;
302 	struct nvme_ns_list ns_log;
303 
304 	struct pci_nvme_blockstore nvstore;
305 
306 	uint16_t	max_qentries;	/* max entries per queue */
307 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
308 	uint32_t	num_cqueues;
309 	uint32_t	num_squeues;
310 	bool		num_q_is_set; /* Has host set Number of Queues */
311 
312 	struct pci_nvme_ioreq *ioreqs;
313 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
314 	uint32_t	pending_ios;
315 	uint32_t	ioslots;
316 	sem_t		iosemlock;
317 
318 	/*
319 	 * Memory mapped Submission and Completion queues
320 	 * Each array includes both Admin and IO queues
321 	 */
322 	struct nvme_completion_queue *compl_queues;
323 	struct nvme_submission_queue *submit_queues;
324 
325 	struct nvme_feature_obj feat[NVME_FID_MAX];
326 
327 	enum nvme_dsm_type dataset_management;
328 
329 	/* Accounting for SMART data */
330 	__uint128_t	read_data_units;
331 	__uint128_t	write_data_units;
332 	__uint128_t	read_commands;
333 	__uint128_t	write_commands;
334 	uint32_t	read_dunits_remainder;
335 	uint32_t	write_dunits_remainder;
336 
337 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
338 	pthread_mutex_t	aer_mtx;
339 	uint32_t	aer_count;
340 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
341 	pthread_t	aen_tid;
342 	pthread_mutex_t	aen_mtx;
343 	pthread_cond_t	aen_cond;
344 };
345 
346 
347 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
348     struct nvme_completion_queue *cq,
349     uint32_t cdw0,
350     uint16_t cid,
351     uint16_t sqid,
352     uint16_t status);
353 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
354 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
355 static void pci_nvme_io_done(struct blockif_req *, int);
356 
357 /* Controller Configuration utils */
358 #define	NVME_CC_GET_EN(cc) \
359 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
360 #define	NVME_CC_GET_CSS(cc) \
361 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
362 #define	NVME_CC_GET_SHN(cc) \
363 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
364 #define	NVME_CC_GET_IOSQES(cc) \
365 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
366 #define	NVME_CC_GET_IOCQES(cc) \
367 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
368 
369 #define	NVME_CC_WRITE_MASK \
370 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
371 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
372 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
373 
374 #define	NVME_CC_NEN_WRITE_MASK \
375 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
376 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
377 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
378 
379 /* Controller Status utils */
380 #define	NVME_CSTS_GET_RDY(sts) \
381 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
382 
383 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
384 
385 /* Completion Queue status word utils */
386 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
387 #define	NVME_STATUS_MASK \
388 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
389 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
390 
391 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
392 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
393 
394 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
395     struct nvme_feature_obj *,
396     struct nvme_command *,
397     struct nvme_completion *);
398 static void nvme_feature_num_queues(struct pci_nvme_softc *,
399     struct nvme_feature_obj *,
400     struct nvme_command *,
401     struct nvme_completion *);
402 static void nvme_feature_iv_config(struct pci_nvme_softc *,
403     struct nvme_feature_obj *,
404     struct nvme_command *,
405     struct nvme_completion *);
406 
407 static void *aen_thr(void *arg);
408 
409 static __inline void
410 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
411 {
412 	size_t len;
413 
414 	len = strnlen(src, dst_size);
415 	memset(dst, pad, dst_size);
416 	memcpy(dst, src, len);
417 }
418 
419 static __inline void
420 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
421 {
422 
423 	*status &= ~NVME_STATUS_MASK;
424 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
425 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
426 }
427 
428 static __inline void
429 pci_nvme_status_genc(uint16_t *status, uint16_t code)
430 {
431 
432 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
433 }
434 
435 /*
436  * Initialize the requested number of IO Submission and Completion Queues.
437  * Admin queues are allocated implicitly.
438  */
439 static void
440 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
441 {
442 	uint32_t i;
443 
444 	/*
445 	 * Allocate and initialize the Submission Queues
446 	 */
447 	if (nsq > NVME_QUEUES) {
448 		WPRINTF("%s: clamping number of SQ from %u to %u",
449 					__func__, nsq, NVME_QUEUES);
450 		nsq = NVME_QUEUES;
451 	}
452 
453 	sc->num_squeues = nsq;
454 
455 	sc->submit_queues = calloc(sc->num_squeues + 1,
456 				sizeof(struct nvme_submission_queue));
457 	if (sc->submit_queues == NULL) {
458 		WPRINTF("%s: SQ allocation failed", __func__);
459 		sc->num_squeues = 0;
460 	} else {
461 		struct nvme_submission_queue *sq = sc->submit_queues;
462 
463 		for (i = 0; i < sc->num_squeues; i++)
464 			pthread_mutex_init(&sq[i].mtx, NULL);
465 	}
466 
467 	/*
468 	 * Allocate and initialize the Completion Queues
469 	 */
470 	if (ncq > NVME_QUEUES) {
471 		WPRINTF("%s: clamping number of CQ from %u to %u",
472 					__func__, ncq, NVME_QUEUES);
473 		ncq = NVME_QUEUES;
474 	}
475 
476 	sc->num_cqueues = ncq;
477 
478 	sc->compl_queues = calloc(sc->num_cqueues + 1,
479 				sizeof(struct nvme_completion_queue));
480 	if (sc->compl_queues == NULL) {
481 		WPRINTF("%s: CQ allocation failed", __func__);
482 		sc->num_cqueues = 0;
483 	} else {
484 		struct nvme_completion_queue *cq = sc->compl_queues;
485 
486 		for (i = 0; i < sc->num_cqueues; i++)
487 			pthread_mutex_init(&cq[i].mtx, NULL);
488 	}
489 }
490 
491 static void
492 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
493 {
494 	struct nvme_controller_data *cd = &sc->ctrldata;
495 
496 	cd->vid = 0xFB5D;
497 	cd->ssvid = 0x0000;
498 
499 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
500 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
501 
502 	/* Num of submission commands that we can handle at a time (2^rab) */
503 	cd->rab   = 4;
504 
505 	/* FreeBSD OUI */
506 	cd->ieee[0] = 0x58;
507 	cd->ieee[1] = 0x9c;
508 	cd->ieee[2] = 0xfc;
509 
510 	cd->mic = 0;
511 
512 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
513 
514 	cd->ver = 0x00010300;
515 
516 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
517 #ifndef __FreeBSD__
518 	/*
519 	 * Reported upstream against https://reviews.freebsd.org/D32953
520 	 * which introduced support for the namespace attribute changed AEN
521 	 * and the corresponding changed namespace log page, without setting
522 	 * the bit in oaes. A future sync will likely include this
523 	 * definition in usr/src/contrib/bhyve/dev/nvme/nvme.h once it's
524 	 * fixed there.
525 	 */
526 #define	NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT	(8)
527 	cd->oaes = 1 << NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT;
528 #endif
529 	cd->acl = 2;
530 	cd->aerl = 4;
531 
532 	/* Advertise 1, Read-only firmware slot */
533 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
534 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
535 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
536 	cd->elpe = 0;	/* max error log page entries */
537 	cd->npss = 1;	/* number of power states supported */
538 
539 	/* Warning Composite Temperature Threshold */
540 	cd->wctemp = 0x0157;
541 
542 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
543 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
544 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
545 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
546 	cd->nn = 1;	/* number of namespaces */
547 
548 	cd->oncs = 0;
549 	switch (sc->dataset_management) {
550 	case NVME_DATASET_MANAGEMENT_AUTO:
551 		if (sc->nvstore.deallocate)
552 			cd->oncs |= NVME_ONCS_DSM;
553 		break;
554 	case NVME_DATASET_MANAGEMENT_ENABLE:
555 		cd->oncs |= NVME_ONCS_DSM;
556 		break;
557 	default:
558 		break;
559 	}
560 
561 	cd->fna = 0x03;
562 
563 	cd->power_state[0].mp = 10;
564 }
565 
566 /*
567  * Calculate the CRC-16 of the given buffer
568  * See copyright attribution at top of file
569  */
570 static uint16_t
571 crc16(uint16_t crc, const void *buffer, unsigned int len)
572 {
573 	const unsigned char *cp = buffer;
574 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
575 	static uint16_t const crc16_table[256] = {
576 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
577 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
578 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
579 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
580 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
581 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
582 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
583 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
584 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
585 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
586 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
587 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
588 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
589 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
590 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
591 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
592 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
593 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
594 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
595 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
596 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
597 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
598 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
599 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
600 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
601 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
602 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
603 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
604 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
605 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
606 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
607 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
608 	};
609 
610 	while (len--)
611 		crc = (((crc >> 8) & 0xffU) ^
612 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
613 	return crc;
614 }
615 
616 static void
617 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
618     struct nvme_namespace_data *nd)
619 {
620 
621 	/* Get capacity and block size information from backing store */
622 	nd->nsze = nvstore->size / nvstore->sectsz;
623 	nd->ncap = nd->nsze;
624 	nd->nuse = nd->nsze;
625 }
626 
627 static void
628 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
629     struct nvme_namespace_data *nd, uint32_t nsid,
630     struct pci_nvme_blockstore *nvstore)
631 {
632 
633 	pci_nvme_init_nsdata_size(nvstore, nd);
634 
635 	if (nvstore->type == NVME_STOR_BLOCKIF)
636 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
637 
638 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
639 	nd->flbas = 0;
640 
641 	/* Create an EUI-64 if user did not provide one */
642 	if (nvstore->eui64 == 0) {
643 		char *data = NULL;
644 		uint64_t eui64 = nvstore->eui64;
645 
646 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
647 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
648 		    sc->nsc_pi->pi_func);
649 
650 		if (data != NULL) {
651 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
652 			free(data);
653 		}
654 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
655 	}
656 	be64enc(nd->eui64, nvstore->eui64);
657 
658 	/* LBA data-sz = 2^lbads */
659 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
660 }
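/*
 * Sketch of the resulting auto-generated EUI-64 layout (assuming the low
 * 16 bits of OUI_FREEBSD_NVME_LOW are clear): the FreeBSD OUI-derived
 * prefix occupies the upper bytes, the CRC-16 of "<vmname><bus><slot><func>"
 * sits in bits 31:16, and the namespace ID fills bits 15:0.
 */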
661 
662 static void
663 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
664 {
665 
666 	memset(&sc->err_log, 0, sizeof(sc->err_log));
667 	memset(&sc->health_log, 0, sizeof(sc->health_log));
668 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
669 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
670 
671 	/* Set read/write remainder to round up according to spec */
672 	sc->read_dunits_remainder = 999;
673 	sc->write_dunits_remainder = 999;
674 
675 	/* Set nominal Health values checked by implementations */
676 	sc->health_log.temperature = 310;
677 	sc->health_log.available_spare = 100;
678 	sc->health_log.available_spare_threshold = 10;
679 }
680 
681 static void
682 pci_nvme_init_features(struct pci_nvme_softc *sc)
683 {
684 
685 	sc->feat[0].set = nvme_feature_invalid_cb;
686 	sc->feat[0].get = nvme_feature_invalid_cb;
687 
688 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
689 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
690 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
691 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
692 	    nvme_feature_iv_config;
693 	/* Enable all AENs by default */
694 	sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
695 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
696 	    nvme_feature_invalid_cb;
697 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
698 	    nvme_feature_invalid_cb;
699 }
700 
701 static void
702 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
703 {
704 
705 	STAILQ_INIT(&sc->aer_list);
706 	sc->aer_count = 0;
707 }
708 
709 static void
710 pci_nvme_aer_init(struct pci_nvme_softc *sc)
711 {
712 
713 	pthread_mutex_init(&sc->aer_mtx, NULL);
714 	pci_nvme_aer_reset(sc);
715 }
716 
717 static void
718 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
719 {
720 	struct pci_nvme_aer *aer = NULL;
721 
722 	pthread_mutex_lock(&sc->aer_mtx);
723 	while (!STAILQ_EMPTY(&sc->aer_list)) {
724 		aer = STAILQ_FIRST(&sc->aer_list);
725 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
726 		free(aer);
727 	}
728 	pthread_mutex_unlock(&sc->aer_mtx);
729 
730 	pci_nvme_aer_reset(sc);
731 }
732 
733 static bool
734 pci_nvme_aer_available(struct pci_nvme_softc *sc)
735 {
736 
737 	return (sc->aer_count != 0);
738 }
739 
740 static bool
741 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
742 {
743 	struct nvme_controller_data *cd = &sc->ctrldata;
744 
745 	/* AERL is a zero-based value while aer_count is one-based */
746 	return (sc->aer_count == (cd->aerl + 1));
747 }
748 
749 /*
750  * Add an Async Event Request
751  *
752  * Stores an AER to be returned later if the Controller needs to notify the
753  * host of an event.
754  * Note that while the NVMe spec doesn't require Controllers to return AER's
755  * in order, this implementation does preserve the order.
756  */
757 static int
758 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
759 {
760 	struct pci_nvme_aer *aer = NULL;
761 
762 	if (pci_nvme_aer_limit_reached(sc))
763 		return (-1);
764 
765 	aer = calloc(1, sizeof(struct pci_nvme_aer));
766 	if (aer == NULL)
767 		return (-1);
768 
769 	/* Save the Command ID for use in the completion message */
770 	aer->cid = cid;
771 
772 	pthread_mutex_lock(&sc->aer_mtx);
773 	sc->aer_count++;
774 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
775 	pthread_mutex_unlock(&sc->aer_mtx);
776 
777 	return (0);
778 }
779 
780 /*
781  * Get an Async Event Request structure
782  *
783  * Returns a pointer to an AER previously submitted by the host or NULL if
784  * no AER's exist. Caller is responsible for freeing the returned struct.
785  */
786 static struct pci_nvme_aer *
787 pci_nvme_aer_get(struct pci_nvme_softc *sc)
788 {
789 	struct pci_nvme_aer *aer = NULL;
790 
791 	pthread_mutex_lock(&sc->aer_mtx);
792 	aer = STAILQ_FIRST(&sc->aer_list);
793 	if (aer != NULL) {
794 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
795 		sc->aer_count--;
796 	}
797 	pthread_mutex_unlock(&sc->aer_mtx);
798 
799 	return (aer);
800 }
801 
802 static void
803 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
804 {
805 	uint32_t	atype;
806 
807 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
808 
809 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
810 		sc->aen[atype].atype = atype;
811 	}
812 }
813 
814 static void
815 pci_nvme_aen_init(struct pci_nvme_softc *sc)
816 {
817 	char nstr[80];
818 
819 	pci_nvme_aen_reset(sc);
820 
821 	pthread_mutex_init(&sc->aen_mtx, NULL);
822 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
823 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
824 	    sc->nsc_pi->pi_func);
825 	pthread_set_name_np(sc->aen_tid, nstr);
826 }
827 
828 static void
829 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
830 {
831 
832 	pci_nvme_aen_reset(sc);
833 }
834 
835 /* Notify the AEN thread of pending work */
836 static void
837 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
838 {
839 
840 	pthread_cond_signal(&sc->aen_cond);
841 }
842 
843 /*
844  * Post an Asynchronous Event Notification
845  */
846 static int32_t
847 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
848 		uint32_t event_data)
849 {
850 	struct pci_nvme_aen *aen;
851 
852 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
853 		return(EINVAL);
854 	}
855 
856 	pthread_mutex_lock(&sc->aen_mtx);
857 	aen = &sc->aen[atype];
858 
859 	/* Has the controller already posted an event of this type? */
860 	if (aen->posted) {
861 		pthread_mutex_unlock(&sc->aen_mtx);
862 		return(EALREADY);
863 	}
864 
865 	aen->event_data = event_data;
866 	aen->posted = true;
867 	pthread_mutex_unlock(&sc->aen_mtx);
868 
869 	pci_nvme_aen_notify(sc);
870 
871 	return(0);
872 }
873 
874 static void
875 pci_nvme_aen_process(struct pci_nvme_softc *sc)
876 {
877 	struct pci_nvme_aer *aer;
878 	struct pci_nvme_aen *aen;
879 	pci_nvme_async_type atype;
880 	uint32_t mask;
881 	uint16_t status;
882 	uint8_t lid;
883 
884 #ifndef __FreeBSD__
885 	lid = 0;
886 #endif
887 
888 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
889 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
890 		aen = &sc->aen[atype];
891 		/* Previous iterations may have depleted the available AER's */
892 		if (!pci_nvme_aer_available(sc)) {
893 			DPRINTF("%s: no AER", __func__);
894 			break;
895 		}
896 
897 		if (!aen->posted) {
898 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
899 			continue;
900 		}
901 
902 		status = NVME_SC_SUCCESS;
903 
904 		/* Is the event masked? */
905 		mask =
906 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
907 
908 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
909 		switch (atype) {
910 		case PCI_NVME_AE_TYPE_ERROR:
911 			lid = NVME_LOG_ERROR;
912 			break;
913 		case PCI_NVME_AE_TYPE_SMART:
914 			mask &= 0xff;
915 			if ((mask & aen->event_data) == 0)
916 				continue;
917 			lid = NVME_LOG_HEALTH_INFORMATION;
918 			break;
919 		case PCI_NVME_AE_TYPE_NOTICE:
920 			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
921 				EPRINTLN("%s unknown AEN notice type %u",
922 				    __func__, aen->event_data);
923 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
924 				break;
925 			}
926 			mask >>= 8;
927 			if (((1 << aen->event_data) & mask) == 0)
928 				continue;
929 			switch (aen->event_data) {
930 			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
931 				lid = NVME_LOG_CHANGED_NAMESPACE;
932 				break;
933 			case PCI_NVME_AE_INFO_FW_ACTIVATION:
934 				lid = NVME_LOG_FIRMWARE_SLOT;
935 				break;
936 			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
937 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
938 				break;
939 			case PCI_NVME_AE_INFO_ANA_CHANGE:
940 				lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
941 				break;
942 			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
943 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
944 				break;
945 			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
946 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
947 				break;
948 			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
949 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
950 				break;
951 			default:
952 				lid = 0;
953 			}
954 			break;
955 		default:
956 			/* bad type?!? */
957 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
958 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
959 			break;
960 		}
961 
962 		aer = pci_nvme_aer_get(sc);
963 		assert(aer != NULL);
964 
965 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
966 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
967 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
968 		    aer->cid,
969 		    0,		/* SQID */
970 		    status);
971 
972 		aen->event_data = 0;
973 		aen->posted = false;
974 
975 		pci_generate_msix(sc->nsc_pi, 0);
976 	}
977 }
978 
979 static void *
980 aen_thr(void *arg)
981 {
982 	struct pci_nvme_softc *sc;
983 
984 	sc = arg;
985 
986 	pthread_mutex_lock(&sc->aen_mtx);
987 	for (;;) {
988 		pci_nvme_aen_process(sc);
989 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
990 	}
991 #ifdef __FreeBSD__
992 	pthread_mutex_unlock(&sc->aen_mtx);
993 
994 	pthread_exit(NULL);
995 #endif
996 	return (NULL);
997 }
998 
999 static void
1000 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1001 {
1002 	uint32_t i;
1003 
1004 	DPRINTF("%s", __func__);
1005 
1006 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1007 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1008 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1009 
1010 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1011 
1012 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
1013 
1014 	sc->regs.cc = 0;
1015 	sc->regs.csts = 0;
1016 
1017 	assert(sc->submit_queues != NULL);
1018 
1019 	for (i = 0; i < sc->num_squeues + 1; i++) {
1020 		sc->submit_queues[i].qbase = NULL;
1021 		sc->submit_queues[i].size = 0;
1022 		sc->submit_queues[i].cqid = 0;
1023 		sc->submit_queues[i].tail = 0;
1024 		sc->submit_queues[i].head = 0;
1025 	}
1026 
1027 	assert(sc->compl_queues != NULL);
1028 
1029 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1030 		sc->compl_queues[i].qbase = NULL;
1031 		sc->compl_queues[i].size = 0;
1032 		sc->compl_queues[i].tail = 0;
1033 		sc->compl_queues[i].head = 0;
1034 	}
1035 
1036 	sc->num_q_is_set = false;
1037 
1038 	pci_nvme_aer_destroy(sc);
1039 	pci_nvme_aen_destroy(sc);
1040 }
1041 
1042 static void
1043 pci_nvme_reset(struct pci_nvme_softc *sc)
1044 {
1045 	pthread_mutex_lock(&sc->mtx);
1046 	pci_nvme_reset_locked(sc);
1047 	pthread_mutex_unlock(&sc->mtx);
1048 }
1049 
1050 static void
1051 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1052 {
1053 	uint16_t acqs, asqs;
1054 
1055 	DPRINTF("%s", __func__);
1056 
1057 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1058 	sc->submit_queues[0].size = asqs;
1059 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1060 	            sizeof(struct nvme_command) * asqs);
1061 
1062 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1063 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1064 
1065 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1066 	    NVME_AQA_REG_ACQS_MASK) + 1;
1067 	sc->compl_queues[0].size = acqs;
1068 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1069 	         sizeof(struct nvme_completion) * acqs);
1070 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1071 
1072 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1073 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1074 }
1075 
1076 static int
1077 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1078 	size_t len, enum nvme_copy_dir dir)
1079 {
1080 	uint8_t *p;
1081 	size_t bytes;
1082 
1083 	if (len > (8 * 1024)) {
1084 		return (-1);
1085 	}
1086 
1087 	/* Copy from the start of prp1 to the end of the physical page */
1088 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1089 	bytes = MIN(bytes, len);
1090 
1091 	p = vm_map_gpa(ctx, prp1, bytes);
1092 	if (p == NULL) {
1093 		return (-1);
1094 	}
1095 
1096 	if (dir == NVME_COPY_TO_PRP)
1097 		memcpy(p, b, bytes);
1098 	else
1099 		memcpy(b, p, bytes);
1100 
1101 	b += bytes;
1102 
1103 	len -= bytes;
1104 	if (len == 0) {
1105 		return (0);
1106 	}
1107 
1108 	len = MIN(len, PAGE_SIZE);
1109 
1110 	p = vm_map_gpa(ctx, prp2, len);
1111 	if (p == NULL) {
1112 		return (-1);
1113 	}
1114 
1115 	if (dir == NVME_COPY_TO_PRP)
1116 		memcpy(p, b, len);
1117 	else
1118 		memcpy(b, p, len);
1119 
1120 	return (0);
1121 }
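/*
 * Worked example (illustrative, assuming a 4 KiB page size): for
 * prp1 = 0x10000f00 and len = 1024, the first 256 bytes are copied at the
 * tail of the prp1 page and the remaining 768 bytes at the start of the
 * prp2 page.  Requests larger than 8 KiB are rejected above because this
 * helper only handles prp1/prp2 directly and never walks a PRP list.
 */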
1122 
1123 /*
1124  * Write a Completion Queue Entry update
1125  *
1126  * Write the completion and update the doorbell value
1127  */
1128 static void
1129 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1130 		struct nvme_completion_queue *cq,
1131 		uint32_t cdw0,
1132 		uint16_t cid,
1133 		uint16_t sqid,
1134 		uint16_t status)
1135 {
1136 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1137 	struct nvme_completion *cqe;
1138 
1139 	assert(cq->qbase != NULL);
1140 
1141 	pthread_mutex_lock(&cq->mtx);
1142 
1143 	cqe = &cq->qbase[cq->tail];
1144 
1145 	/* Flip the phase bit */
1146 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1147 
1148 	cqe->cdw0 = cdw0;
1149 	cqe->sqhd = sq->head;
1150 	cqe->sqid = sqid;
1151 	cqe->cid = cid;
1152 	cqe->status = status;
1153 
1154 	cq->tail++;
1155 	if (cq->tail >= cq->size) {
1156 		cq->tail = 0;
1157 	}
1158 
1159 	pthread_mutex_unlock(&cq->mtx);
1160 }
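/*
 * Phase bit sketch (guest-visible protocol, names below are placeholders):
 * each new entry is written with the Phase (P) bit inverted relative to
 * the stale entry it overwrites, so P alternates on every pass through the
 * queue and a host can detect fresh completions without a separate count:
 *
 *	while ((cqe[head].status & NVME_STATUS_P) == expected_phase) {
 *		consume(&cqe[head]);
 *		if (++head == cq_size) {
 *			head = 0;
 *			expected_phase ^= NVME_STATUS_P;
 *		}
 *	}
 */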
1161 
1162 static int
1163 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1164 	struct nvme_completion* compl)
1165 {
1166 	uint16_t qid = command->cdw10 & 0xffff;
1167 
1168 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1169 	if (qid == 0 || qid > sc->num_squeues ||
1170 	    (sc->submit_queues[qid].qbase == NULL)) {
1171 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1172 		        __func__, qid, sc->num_squeues);
1173 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1174 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1175 		return (1);
1176 	}
1177 
1178 	sc->submit_queues[qid].qbase = NULL;
1179 	sc->submit_queues[qid].cqid = 0;
1180 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1181 	return (1);
1182 }
1183 
1184 static int
1185 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1186 	struct nvme_completion* compl)
1187 {
1188 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1189 		uint16_t qid = command->cdw10 & 0xffff;
1190 		struct nvme_submission_queue *nsq;
1191 
1192 		if ((qid == 0) || (qid > sc->num_squeues) ||
1193 		    (sc->submit_queues[qid].qbase != NULL)) {
1194 			WPRINTF("%s queue index %u > num_squeues %u",
1195 			        __func__, qid, sc->num_squeues);
1196 			pci_nvme_status_tc(&compl->status,
1197 			    NVME_SCT_COMMAND_SPECIFIC,
1198 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1199 			return (1);
1200 		}
1201 
1202 		nsq = &sc->submit_queues[qid];
1203 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1204 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1205 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1206 			/*
1207 			 * Queues must specify at least two entries
1208 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1209 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1210 			 */
1211 			pci_nvme_status_tc(&compl->status,
1212 			    NVME_SCT_COMMAND_SPECIFIC,
1213 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1214 			return (1);
1215 		}
1216 		nsq->head = nsq->tail = 0;
1217 
1218 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1219 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1220 			pci_nvme_status_tc(&compl->status,
1221 			    NVME_SCT_COMMAND_SPECIFIC,
1222 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1223 			return (1);
1224 		}
1225 
1226 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1227 			pci_nvme_status_tc(&compl->status,
1228 			    NVME_SCT_COMMAND_SPECIFIC,
1229 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1230 			return (1);
1231 		}
1232 
1233 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1234 
1235 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1236 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1237 
1238 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1239 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1240 
1241 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1242 
1243 		DPRINTF("%s completed creating IOSQ qid %u",
1244 		         __func__, qid);
1245 	} else {
1246 		/*
1247 		 * Guest sent a non-contiguous submission queue request.
1248 		 * This setting is unsupported by this emulation.
1249 		 */
1250 		WPRINTF("%s unsupported non-contig (list-based) "
1251 		         "create i/o submission queue", __func__);
1252 
1253 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1254 	}
1255 	return (1);
1256 }
1257 
1258 static int
1259 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1260 	struct nvme_completion* compl)
1261 {
1262 	uint16_t qid = command->cdw10 & 0xffff;
1263 	uint16_t sqid;
1264 
1265 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1266 	if (qid == 0 || qid > sc->num_cqueues ||
1267 	    (sc->compl_queues[qid].qbase == NULL)) {
1268 		WPRINTF("%s queue index %u / num_cqueues %u",
1269 		        __func__, qid, sc->num_cqueues);
1270 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1271 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1272 		return (1);
1273 	}
1274 
1275 	/* Deleting an Active CQ is an error */
1276 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1277 		if (sc->submit_queues[sqid].cqid == qid) {
1278 			pci_nvme_status_tc(&compl->status,
1279 			    NVME_SCT_COMMAND_SPECIFIC,
1280 			    NVME_SC_INVALID_QUEUE_DELETION);
1281 			return (1);
1282 		}
1283 
1284 	sc->compl_queues[qid].qbase = NULL;
1285 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1286 	return (1);
1287 }
1288 
1289 static int
1290 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1291 	struct nvme_completion* compl)
1292 {
1293 	struct nvme_completion_queue *ncq;
1294 	uint16_t qid = command->cdw10 & 0xffff;
1295 
1296 	/* Only support Physically Contiguous queues */
1297 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1298 		WPRINTF("%s unsupported non-contig (list-based) "
1299 		         "create i/o completion queue",
1300 		         __func__);
1301 
1302 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1303 		return (1);
1304 	}
1305 
1306 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1307 	    (sc->compl_queues[qid].qbase != NULL)) {
1308 		WPRINTF("%s queue index %u > num_cqueues %u",
1309 			__func__, qid, sc->num_cqueues);
1310 		pci_nvme_status_tc(&compl->status,
1311 		    NVME_SCT_COMMAND_SPECIFIC,
1312 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1313 		return (1);
1314  	}
1315 
1316 	ncq = &sc->compl_queues[qid];
1317 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1318 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1319 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1320 		pci_nvme_status_tc(&compl->status,
1321 		    NVME_SCT_COMMAND_SPECIFIC,
1322 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1323 		return (1);
1324 	}
1325 
1326 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1327 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1328 		/*
1329 		 * Queues must specify at least two entries
1330 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1331 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1332 		 */
1333 		pci_nvme_status_tc(&compl->status,
1334 		    NVME_SCT_COMMAND_SPECIFIC,
1335 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1336 		return (1);
1337 	}
1338 	ncq->head = ncq->tail = 0;
1339 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1340 		     command->prp1,
1341 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1342 
1343 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1344 
1345 
1346 	return (1);
1347 }
1348 
1349 static int
1350 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1351 	struct nvme_completion* compl)
1352 {
1353 	uint32_t logsize;
1354 	uint8_t logpage = command->cdw10 & 0xFF;
1355 
1356 #ifndef __FreeBSD__
1357 	logsize = 0;
1358 #endif
1359 
1360 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1361 
1362 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1363 
1364 	/*
1365 	 * Command specifies the number of dwords to return in fields NUMDU
1366 	 * and NUMDL. This is a zero-based value.
1367 	 */
1368 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1369 	logsize *= sizeof(uint32_t);
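	/*
	 * Illustrative example: NUMDL = 0x3ff with NUMDU = 0 requests
	 * 0x400 dwords, i.e. logsize = 1024 * 4 = 4096 bytes.
	 */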
1370 
1371 	switch (logpage) {
1372 	case NVME_LOG_ERROR:
1373 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1374 		    command->prp2, (uint8_t *)&sc->err_log,
1375 		    MIN(logsize, sizeof(sc->err_log)),
1376 		    NVME_COPY_TO_PRP);
1377 		break;
1378 	case NVME_LOG_HEALTH_INFORMATION:
1379 		pthread_mutex_lock(&sc->mtx);
1380 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1381 		    sizeof(sc->health_log.data_units_read));
1382 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1383 		    sizeof(sc->health_log.data_units_written));
1384 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1385 		    sizeof(sc->health_log.host_read_commands));
1386 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1387 		    sizeof(sc->health_log.host_write_commands));
1388 		pthread_mutex_unlock(&sc->mtx);
1389 
1390 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1391 		    command->prp2, (uint8_t *)&sc->health_log,
1392 		    MIN(logsize, sizeof(sc->health_log)),
1393 		    NVME_COPY_TO_PRP);
1394 		break;
1395 	case NVME_LOG_FIRMWARE_SLOT:
1396 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1397 		    command->prp2, (uint8_t *)&sc->fw_log,
1398 		    MIN(logsize, sizeof(sc->fw_log)),
1399 		    NVME_COPY_TO_PRP);
1400 		break;
1401 	case NVME_LOG_CHANGED_NAMESPACE:
1402 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1403 		    command->prp2, (uint8_t *)&sc->ns_log,
1404 		    MIN(logsize, sizeof(sc->ns_log)),
1405 		    NVME_COPY_TO_PRP);
1406 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1407 		break;
1408 	default:
1409 		DPRINTF("%s get log page %x command not supported",
1410 		        __func__, logpage);
1411 
1412 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1413 		    NVME_SC_INVALID_LOG_PAGE);
1414 	}
1415 
1416 	return (1);
1417 }
1418 
1419 static int
1420 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1421 	struct nvme_completion* compl)
1422 {
1423 	void *dest;
1424 	uint16_t status;
1425 
1426 #ifndef __FreeBSD__
1427 	status = 0;
1428 #endif
1429 
1430 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1431 	        command->cdw10 & 0xFF, command->nsid);
1432 
1433 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1434 
1435 	switch (command->cdw10 & 0xFF) {
1436 	case 0x00: /* return Identify Namespace data structure */
1437 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1438 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1439 		    NVME_COPY_TO_PRP);
1440 		break;
1441 	case 0x01: /* return Identify Controller data structure */
1442 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1443 		    command->prp2, (uint8_t *)&sc->ctrldata,
1444 		    sizeof(sc->ctrldata),
1445 		    NVME_COPY_TO_PRP);
1446 		break;
1447 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1448 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1449 		                  sizeof(uint32_t) * 1024);
1450 		/* All unused entries shall be zero */
1451 		bzero(dest, sizeof(uint32_t) * 1024);
1452 		((uint32_t *)dest)[0] = 1;
1453 		break;
1454 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1455 		if (command->nsid != 1) {
1456 			pci_nvme_status_genc(&status,
1457 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1458 			break;
1459 		}
1460 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1461 		                  sizeof(uint32_t) * 1024);
1462 		/* All bytes after the descriptor shall be zero */
1463 		bzero(dest, sizeof(uint32_t) * 1024);
1464 
1465 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1466 		((uint8_t *)dest)[0] = 1;
1467 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1468 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1469 		break;
1470 	default:
1471 		DPRINTF("%s unsupported identify command requested 0x%x",
1472 		         __func__, command->cdw10 & 0xFF);
1473 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1474 		break;
1475 	}
1476 
1477 	compl->status = status;
1478 	return (1);
1479 }
1480 
1481 static const char *
1482 nvme_fid_to_name(uint8_t fid)
1483 {
1484 	const char *name;
1485 
1486 	switch (fid) {
1487 	case NVME_FEAT_ARBITRATION:
1488 		name = "Arbitration";
1489 		break;
1490 	case NVME_FEAT_POWER_MANAGEMENT:
1491 		name = "Power Management";
1492 		break;
1493 	case NVME_FEAT_LBA_RANGE_TYPE:
1494 		name = "LBA Range Type";
1495 		break;
1496 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1497 		name = "Temperature Threshold";
1498 		break;
1499 	case NVME_FEAT_ERROR_RECOVERY:
1500 		name = "Error Recovery";
1501 		break;
1502 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1503 		name = "Volatile Write Cache";
1504 		break;
1505 	case NVME_FEAT_NUMBER_OF_QUEUES:
1506 		name = "Number of Queues";
1507 		break;
1508 	case NVME_FEAT_INTERRUPT_COALESCING:
1509 		name = "Interrupt Coalescing";
1510 		break;
1511 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1512 		name = "Interrupt Vector Configuration";
1513 		break;
1514 	case NVME_FEAT_WRITE_ATOMICITY:
1515 		name = "Write Atomicity Normal";
1516 		break;
1517 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1518 		name = "Asynchronous Event Configuration";
1519 		break;
1520 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1521 		name = "Autonomous Power State Transition";
1522 		break;
1523 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1524 		name = "Host Memory Buffer";
1525 		break;
1526 	case NVME_FEAT_TIMESTAMP:
1527 		name = "Timestamp";
1528 		break;
1529 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1530 		name = "Keep Alive Timer";
1531 		break;
1532 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1533 		name = "Host Controlled Thermal Management";
1534 		break;
1535 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1536 		name = "Non-Operational Power State Config";
1537 		break;
1538 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1539 		name = "Read Recovery Level Config";
1540 		break;
1541 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1542 		name = "Predictable Latency Mode Config";
1543 		break;
1544 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1545 		name = "Predictable Latency Mode Window";
1546 		break;
1547 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1548 		name = "LBA Status Information Report Interval";
1549 		break;
1550 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1551 		name = "Host Behavior Support";
1552 		break;
1553 	case NVME_FEAT_SANITIZE_CONFIG:
1554 		name = "Sanitize Config";
1555 		break;
1556 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1557 		name = "Endurance Group Event Configuration";
1558 		break;
1559 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1560 		name = "Software Progress Marker";
1561 		break;
1562 	case NVME_FEAT_HOST_IDENTIFIER:
1563 		name = "Host Identifier";
1564 		break;
1565 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1566 		name = "Reservation Notification Mask";
1567 		break;
1568 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1569 		name = "Reservation Persistence";
1570 		break;
1571 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1572 		name = "Namespace Write Protection Config";
1573 		break;
1574 	default:
1575 		name = "Unknown";
1576 		break;
1577 	}
1578 
1579 	return (name);
1580 }
1581 
1582 static void
1583 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1584     struct nvme_feature_obj *feat,
1585     struct nvme_command *command,
1586     struct nvme_completion *compl)
1587 {
1588 
1589 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1590 }
1591 
1592 static void
1593 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1594     struct nvme_feature_obj *feat,
1595     struct nvme_command *command,
1596     struct nvme_completion *compl)
1597 {
1598 	uint32_t i;
1599 	uint32_t cdw11 = command->cdw11;
1600 	uint16_t iv;
1601 	bool cd;
1602 
1603 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1604 
1605 	iv = cdw11 & 0xffff;
1606 	cd = cdw11 & (1 << 16);
1607 
1608 	if (iv > (sc->max_queues + 1)) {
1609 		return;
1610 	}
1611 
1612 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1613 	if ((iv == 0) && !cd)
1614 		return;
1615 
1616 	/* Requested Interrupt Vector must be used by a CQ */
1617 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1618 		if (sc->compl_queues[i].intr_vec == iv) {
1619 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1620 		}
1621 	}
1622 
1623 }
1624 
1625 static void
1626 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1627     struct nvme_feature_obj *feat,
1628     struct nvme_command *command,
1629     struct nvme_completion *compl)
1630 {
1631 	uint16_t nqr;	/* Number of Queues Requested */
1632 
1633 	if (sc->num_q_is_set) {
1634 		WPRINTF("%s: Number of Queues already set", __func__);
1635 		pci_nvme_status_genc(&compl->status,
1636 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1637 		return;
1638 	}
1639 
1640 	nqr = command->cdw11 & 0xFFFF;
1641 	if (nqr == 0xffff) {
1642 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1643 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1644 		return;
1645 	}
1646 
1647 	sc->num_squeues = ONE_BASED(nqr);
1648 	if (sc->num_squeues > sc->max_queues) {
1649 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1650 					sc->max_queues);
1651 		sc->num_squeues = sc->max_queues;
1652 	}
1653 
1654 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1655 	if (nqr == 0xffff) {
1656 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1657 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1658 		return;
1659 	}
1660 
1661 	sc->num_cqueues = ONE_BASED(nqr);
1662 	if (sc->num_cqueues > sc->max_queues) {
1663 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1664 					sc->max_queues);
1665 		sc->num_cqueues = sc->max_queues;
1666 	}
1667 
1668 	/* Patch the command value which will be saved on callback's return */
1669 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1670 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1671 
1672 	sc->num_q_is_set = true;
1673 }
1674 
1675 static int
1676 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1677 	struct nvme_completion *compl)
1678 {
1679 	struct nvme_feature_obj *feat;
1680 	uint32_t nsid = command->nsid;
1681 	uint8_t fid = command->cdw10 & 0xFF;
1682 
1683 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1684 
1685 	if (fid >= NVME_FID_MAX) {
1686 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1687 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1688 		return (1);
1689 	}
1690 	feat = &sc->feat[fid];
1691 
1692 	if (!feat->namespace_specific &&
1693 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1694 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1695 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1696 		return (1);
1697 	}
1698 
1699 	compl->cdw0 = 0;
1700 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1701 
1702 	if (feat->set)
1703 		feat->set(sc, feat, command, compl);
1704 
1705 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1706 	if (compl->status == NVME_SC_SUCCESS) {
1707 		feat->cdw11 = command->cdw11;
1708 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1709 		    (command->cdw11 != 0))
1710 			pci_nvme_aen_notify(sc);
1711 	}
1712 
1713 	return (0);
1714 }
1715 
1716 static int
1717 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1718 	struct nvme_completion* compl)
1719 {
1720 	struct nvme_feature_obj *feat;
1721 	uint8_t fid = command->cdw10 & 0xFF;
1722 
1723 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1724 
1725 	if (fid >= NVME_FID_MAX) {
1726 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1727 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1728 		return (1);
1729 	}
1730 
1731 	compl->cdw0 = 0;
1732 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1733 
1734 	feat = &sc->feat[fid];
1735 	if (feat->get) {
1736 		feat->get(sc, feat, command, compl);
1737 	}
1738 
1739 	if (compl->status == NVME_SC_SUCCESS) {
1740 		compl->cdw0 = feat->cdw11;
1741 	}
1742 
1743 	return (0);
1744 }
1745 
1746 static int
1747 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1748 	struct nvme_completion* compl)
1749 {
1750 	uint8_t	ses, lbaf, pi;
1751 
1752 	/* Only supports Secure Erase Setting - User Data Erase */
1753 	ses = (command->cdw10 >> 9) & 0x7;
1754 	if (ses > 0x1) {
1755 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1756 		return (1);
1757 	}
1758 
1759 	/* Only supports a single LBA Format */
1760 	lbaf = command->cdw10 & 0xf;
1761 	if (lbaf != 0) {
1762 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1763 		    NVME_SC_INVALID_FORMAT);
1764 		return (1);
1765 	}
1766 
1767 	/* Doesn't support Protection Information */
1768 	pi = (command->cdw10 >> 5) & 0x7;
1769 	if (pi != 0) {
1770 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1771 		return (1);
1772 	}
1773 
1774 	if (sc->nvstore.type == NVME_STOR_RAM) {
1775 		if (sc->nvstore.ctx)
1776 			free(sc->nvstore.ctx);
1777 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1778 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1779 	} else {
1780 		struct pci_nvme_ioreq *req;
1781 		int err;
1782 
1783 		req = pci_nvme_get_ioreq(sc);
1784 		if (req == NULL) {
1785 			pci_nvme_status_genc(&compl->status,
1786 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1787 			WPRINTF("%s: unable to allocate IO req", __func__);
1788 			return (1);
1789 		}
1790 		req->nvme_sq = &sc->submit_queues[0];
1791 		req->sqid = 0;
1792 		req->opc = command->opc;
1793 		req->cid = command->cid;
1794 		req->nsid = command->nsid;
1795 
1796 		req->io_req.br_offset = 0;
1797 		req->io_req.br_resid = sc->nvstore.size;
1798 		req->io_req.br_callback = pci_nvme_io_done;
1799 
1800 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1801 		if (err) {
1802 			pci_nvme_status_genc(&compl->status,
1803 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1804 			pci_nvme_release_ioreq(sc, req);
1805 		}
1806 	}
1807 
1808 	return (1);
1809 }
1810 
1811 static int
1812 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1813 	struct nvme_completion* compl)
1814 {
1815 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1816 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1817 
1818 	/* TODO: search for the command ID and abort it */
1819 
1820 	compl->cdw0 = 1;
1821 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1822 	return (1);
1823 }
1824 
1825 static int
1826 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1827 	struct nvme_command* command, struct nvme_completion* compl)
1828 {
1829 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1830 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1831 
1832 	/* Don't exceed the Async Event Request Limit (AERL). */
1833 	if (pci_nvme_aer_limit_reached(sc)) {
1834 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1835 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1836 		return (1);
1837 	}
1838 
1839 	if (pci_nvme_aer_add(sc, command->cid)) {
1840 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1841 				NVME_SC_INTERNAL_DEVICE_ERROR);
1842 		return (1);
1843 	}
1844 
1845 	/*
1846 	 * Raise events as they happen, based on the Set Features cmd.
1847 	 * These events happen asynchronously, so defer the completion until
1848 	 * an event matching the request occurs.
1849 	 */
1850 	compl->status = NVME_NO_STATUS;
1851 	pci_nvme_aen_notify(sc);
1852 
1853 	return (0);
1854 }
1855 
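/*
 * Process new entries on the Admin Submission Queue (queue 0). Each command
 * is dispatched by opcode and, unless completion is deferred, posted to the
 * Admin Completion Queue. MSI-X vector 0 is asserted if the completion
 * queue is non-empty.
 */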
1856 static void
1857 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1858 {
1859 	struct nvme_completion compl;
1860 	struct nvme_command *cmd;
1861 	struct nvme_submission_queue *sq;
1862 	struct nvme_completion_queue *cq;
1863 	uint16_t sqhead;
1864 
1865 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1866 
1867 	sq = &sc->submit_queues[0];
1868 	cq = &sc->compl_queues[0];
1869 
1870 	pthread_mutex_lock(&sq->mtx);
1871 
1872 	sqhead = sq->head;
1873 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1874 
1875 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1876 		cmd = &(sq->qbase)[sqhead];
1877 		compl.cdw0 = 0;
1878 		compl.status = 0;
1879 
1880 		switch (cmd->opc) {
1881 		case NVME_OPC_DELETE_IO_SQ:
1882 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1883 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1884 			break;
1885 		case NVME_OPC_CREATE_IO_SQ:
1886 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1887 			nvme_opc_create_io_sq(sc, cmd, &compl);
1888 			break;
1889 		case NVME_OPC_DELETE_IO_CQ:
1890 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1891 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1892 			break;
1893 		case NVME_OPC_CREATE_IO_CQ:
1894 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1895 			nvme_opc_create_io_cq(sc, cmd, &compl);
1896 			break;
1897 		case NVME_OPC_GET_LOG_PAGE:
1898 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1899 			nvme_opc_get_log_page(sc, cmd, &compl);
1900 			break;
1901 		case NVME_OPC_IDENTIFY:
1902 			DPRINTF("%s command IDENTIFY", __func__);
1903 			nvme_opc_identify(sc, cmd, &compl);
1904 			break;
1905 		case NVME_OPC_ABORT:
1906 			DPRINTF("%s command ABORT", __func__);
1907 			nvme_opc_abort(sc, cmd, &compl);
1908 			break;
1909 		case NVME_OPC_SET_FEATURES:
1910 			DPRINTF("%s command SET_FEATURES", __func__);
1911 			nvme_opc_set_features(sc, cmd, &compl);
1912 			break;
1913 		case NVME_OPC_GET_FEATURES:
1914 			DPRINTF("%s command GET_FEATURES", __func__);
1915 			nvme_opc_get_features(sc, cmd, &compl);
1916 			break;
1917 		case NVME_OPC_FIRMWARE_ACTIVATE:
1918 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1919 			pci_nvme_status_tc(&compl.status,
1920 			    NVME_SCT_COMMAND_SPECIFIC,
1921 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1922 			break;
1923 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1924 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1925 			nvme_opc_async_event_req(sc, cmd, &compl);
1926 			break;
1927 		case NVME_OPC_FORMAT_NVM:
1928 			DPRINTF("%s command FORMAT_NVM", __func__);
1929 			if ((sc->ctrldata.oacs &
1930 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1931 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
1932 			}
1933 			compl.status = NVME_NO_STATUS;
1934 			nvme_opc_format_nvm(sc, cmd, &compl);
1935 			break;
1936 		default:
1937 			DPRINTF("0x%x command is not implemented",
1938 			    cmd->opc);
1939 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1940 		}
1941 		sqhead = (sqhead + 1) % sq->size;
1942 
1943 		if (NVME_COMPLETION_VALID(compl)) {
1944 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1945 			    compl.cdw0,
1946 			    cmd->cid,
1947 			    0,		/* SQID */
1948 			    compl.status);
1949 		}
1950 	}
1951 
1952 	DPRINTF("setting sqhead %u", sqhead);
1953 	sq->head = sqhead;
1954 
1955 	if (cq->head != cq->tail)
1956 		pci_generate_msix(sc->nsc_pi, 0);
1957 
1958 	pthread_mutex_unlock(&sq->mtx);
1959 }
1960 
1961 /*
1962  * Update the Write and Read statistics reported in SMART data
1963  *
1964  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1965  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1966  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1967  */
1968 static void
1969 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1970     size_t bytes, uint16_t status)
1971 {
1972 
1973 	pthread_mutex_lock(&sc->mtx);
1974 	switch (opc) {
1975 	case NVME_OPC_WRITE:
1976 		sc->write_commands++;
1977 		if (status != NVME_SC_SUCCESS)
1978 			break;
1979 		sc->write_dunits_remainder += (bytes / 512);
1980 		while (sc->write_dunits_remainder >= 1000) {
1981 			sc->write_data_units++;
1982 			sc->write_dunits_remainder -= 1000;
1983 		}
1984 		break;
1985 	case NVME_OPC_READ:
1986 		sc->read_commands++;
1987 		if (status != NVME_SC_SUCCESS)
1988 			break;
1989 		sc->read_dunits_remainder += (bytes / 512);
1990 		while (sc->read_dunits_remainder >= 1000) {
1991 			sc->read_data_units++;
1992 			sc->read_dunits_remainder -= 1000;
1993 		}
1994 		break;
1995 	default:
1996 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1997 		break;
1998 	}
1999 	pthread_mutex_unlock(&sc->mtx);
2000 }
2001 
2002 /*
2003  * Check if the combination of Starting LBA (slba) and Number of Logical
2004  * Blocks (nlb) exceeds the range of the underlying storage.
2005  *
2006  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2007  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2008  * overflow.
2009  */
2010 static bool
2011 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2012     uint32_t nlb)
2013 {
2014 	size_t	offset, bytes;
2015 
2016 	/* Overflow check of multiplying Starting LBA by the sector size */
2017 	if (slba >> (64 - nvstore->sectsz_bits))
2018 		return (true);
2019 
2020 	offset = slba << nvstore->sectsz_bits;
2021 	bytes = nlb << nvstore->sectsz_bits;
2022 
2023 	/* Overflow check of Number of Logical Blocks */
2024 	if ((nvstore->size - offset) < bytes)
2025 		return (true);
2026 
2027 	return (false);
2028 }
2029 
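/*
 * Append a guest physical address range to the request's iovec list, merging
 * with the previous entry when the new range is contiguous in guest physical
 * memory. Returns -1 if there is no request or the iovec list is full.
 */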
2030 static int
2031 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2032 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2033 {
2034 	int iovidx;
2035 
2036 	if (req == NULL)
2037 		return (-1);
2038 
2039 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2040 		return (-1);
2041 	}
2042 
2043 	/* concatenate contig block-iovs to minimize number of iovs */
2044 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2045 		iovidx = req->io_req.br_iovcnt - 1;
2046 
2047 		req->io_req.br_iov[iovidx].iov_base =
2048 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2049 				     req->prev_gpaddr, size);
2050 
2051 		req->prev_size += size;
2052 		req->io_req.br_resid += size;
2053 
2054 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2055 	} else {
2056 		iovidx = req->io_req.br_iovcnt;
2057 		if (iovidx == 0) {
2058 			req->io_req.br_offset = lba;
2059 			req->io_req.br_resid = 0;
2060 			req->io_req.br_param = req;
2061 		}
2062 
2063 		req->io_req.br_iov[iovidx].iov_base =
2064 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2065 				     gpaddr, size);
2066 
2067 		req->io_req.br_iov[iovidx].iov_len = size;
2068 
2069 		req->prev_gpaddr = gpaddr;
2070 		req->prev_size = size;
2071 		req->io_req.br_resid += size;
2072 
2073 		req->io_req.br_iovcnt++;
2074 	}
2075 
2076 	return (0);
2077 }
2078 
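/*
 * Post a completion entry to the submission queue's associated completion
 * queue and assert that queue's MSI-X vector if interrupts are enabled.
 */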
2079 static void
2080 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2081 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2082 	uint32_t cdw0, uint16_t status)
2083 {
2084 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2085 
2086 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2087 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2088 		 NVME_STATUS_GET_SC(status));
2089 
2090 	pci_nvme_cq_update(sc, cq,
2091 	    0,		/* CDW0 */
2092 	    cid,
2093 	    sqid,
2094 	    status);
2095 
2096 	if (cq->head != cq->tail) {
2097 		if (cq->intr_en & NVME_CQ_INTEN) {
2098 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2099 		} else {
2100 			DPRINTF("%s: CQ%u interrupt disabled",
2101 						__func__, sq->cqid);
2102 		}
2103 	}
2104 }
2105 
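/*
 * Return an I/O request structure to the free list and release an ioslot.
 */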
2106 static void
2107 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2108 {
2109 	req->sc = NULL;
2110 	req->nvme_sq = NULL;
2111 	req->sqid = 0;
2112 
2113 	pthread_mutex_lock(&sc->mtx);
2114 
2115 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2116 	sc->pending_ios--;
2117 
2118 	/* when no more IO is pending, mark the controller ready if it is enabled */
2119 	if (sc->pending_ios == 0 &&
2120 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2121 		sc->regs.csts |= NVME_CSTS_RDY;
2122 
2123 	pthread_mutex_unlock(&sc->mtx);
2124 
2125 	sem_post(&sc->iosemlock);
2126 }
2127 
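/*
 * Allocate an I/O request structure, blocking on the iosemlock semaphore
 * until a slot becomes available.
 */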
2128 static struct pci_nvme_ioreq *
2129 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2130 {
2131 	struct pci_nvme_ioreq *req = NULL;
2132 
2133 	sem_wait(&sc->iosemlock);
2134 	pthread_mutex_lock(&sc->mtx);
2135 
2136 	req = STAILQ_FIRST(&sc->ioreqs_free);
2137 	assert(req != NULL);
2138 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2139 
2140 	req->sc = sc;
2141 
2142 	sc->pending_ios++;
2143 
2144 	pthread_mutex_unlock(&sc->mtx);
2145 
2146 	req->io_req.br_iovcnt = 0;
2147 	req->io_req.br_offset = 0;
2148 	req->io_req.br_resid = 0;
2149 	req->io_req.br_param = req;
2150 	req->prev_gpaddr = 0;
2151 	req->prev_size = 0;
2152 
2153 	return req;
2154 }
2155 
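/*
 * blockif completion callback: translate the blockif error into an NVMe
 * status, post the completion, update the SMART read/write statistics, and
 * release the I/O request.
 */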
2156 static void
2157 pci_nvme_io_done(struct blockif_req *br, int err)
2158 {
2159 	struct pci_nvme_ioreq *req = br->br_param;
2160 	struct nvme_submission_queue *sq = req->nvme_sq;
2161 	uint16_t code, status;
2162 
2163 #ifndef __FreeBSD__
2164 	status = 0;
2165 #endif
2166 
2167 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2168 
2169 	/* TODO return correct error */
2170 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2171 	pci_nvme_status_genc(&status, code);
2172 
2173 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2174 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2175 	    req->bytes, status);
2176 	pci_nvme_release_ioreq(req->sc, req);
2177 }
2178 
2179 /*
2180  * Implements the Flush command. The specification states:
2181  *    If a volatile write cache is not present, Flush commands complete
2182  *    successfully and have no effect
2183  * in the description of the Volatile Write Cache (VWC) field of the Identify
2184  * Controller data. Therefore, set status to Success if the command is
2185  * not supported (i.e. RAM or as indicated by the blockif).
2186  */
2187 static bool
2188 nvme_opc_flush(struct pci_nvme_softc *sc,
2189     struct nvme_command *cmd,
2190     struct pci_nvme_blockstore *nvstore,
2191     struct pci_nvme_ioreq *req,
2192     uint16_t *status)
2193 {
2194 	bool pending = false;
2195 
2196 	if (nvstore->type == NVME_STOR_RAM) {
2197 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2198 	} else {
2199 		int err;
2200 
2201 		req->io_req.br_callback = pci_nvme_io_done;
2202 
2203 		err = blockif_flush(nvstore->ctx, &req->io_req);
2204 		switch (err) {
2205 		case 0:
2206 			pending = true;
2207 			break;
2208 		case EOPNOTSUPP:
2209 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2210 			break;
2211 		default:
2212 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2213 		}
2214 	}
2215 
2216 	return (pending);
2217 }
2218 
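/*
 * Perform a read or write of RAM-backed storage by copying directly between
 * the guest PRP entries and the backing buffer.
 */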
2219 static uint16_t
2220 nvme_write_read_ram(struct pci_nvme_softc *sc,
2221     struct pci_nvme_blockstore *nvstore,
2222     uint64_t prp1, uint64_t prp2,
2223     size_t offset, uint64_t bytes,
2224     bool is_write)
2225 {
2226 	uint8_t *buf = nvstore->ctx;
2227 	enum nvme_copy_dir dir;
2228 	uint16_t status;
2229 
2230 #ifndef __FreeBSD__
2231 	status = 0;
2232 #endif
2233 
2234 	if (is_write)
2235 		dir = NVME_COPY_TO_PRP;
2236 	else
2237 		dir = NVME_COPY_FROM_PRP;
2238 
2239 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2240 	    buf + offset, bytes, dir))
2241 		pci_nvme_status_genc(&status,
2242 		    NVME_SC_DATA_TRANSFER_ERROR);
2243 	else
2244 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2245 
2246 	return (status);
2247 }
2248 
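/*
 * Perform a read or write of blockif-backed storage. The guest data pointer
 * follows the NVMe PRP rules: PRP1 maps the first (possibly unaligned) page,
 * and PRP2 is either the second data page or a pointer to a PRP list when
 * the remainder of the transfer exceeds one page. Each entry is appended to
 * the blockif iovec list before the request is queued; the request completes
 * asynchronously via pci_nvme_io_done.
 */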
2249 static uint16_t
2250 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2251     struct pci_nvme_blockstore *nvstore,
2252     struct pci_nvme_ioreq *req,
2253     uint64_t prp1, uint64_t prp2,
2254     size_t offset, uint64_t bytes,
2255     bool is_write)
2256 {
2257 	uint64_t size;
2258 	int err;
2259 	uint16_t status = NVME_NO_STATUS;
2260 
2261 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2262 	if (pci_nvme_append_iov_req(sc, req, prp1,
2263 	    size, is_write, offset)) {
2264 		pci_nvme_status_genc(&status,
2265 		    NVME_SC_DATA_TRANSFER_ERROR);
2266 		goto out;
2267 	}
2268 
2269 	offset += size;
2270 	bytes  -= size;
2271 
2272 	if (bytes == 0) {
2273 		;
2274 	} else if (bytes <= PAGE_SIZE) {
2275 		size = bytes;
2276 		if (pci_nvme_append_iov_req(sc, req, prp2,
2277 		    size, is_write, offset)) {
2278 			pci_nvme_status_genc(&status,
2279 			    NVME_SC_DATA_TRANSFER_ERROR);
2280 			goto out;
2281 		}
2282 	} else {
2283 		void *vmctx = sc->nsc_pi->pi_vmctx;
2284 		uint64_t *prp_list = &prp2;
2285 		uint64_t *last = prp_list;
2286 
2287 		/* PRP2 is pointer to a physical region page list */
2288 		while (bytes) {
2289 			/* Last entry in list points to the next list */
2290 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2291 				uint64_t prp = *prp_list;
2292 
2293 				prp_list = paddr_guest2host(vmctx, prp,
2294 				    PAGE_SIZE - (prp % PAGE_SIZE));
2295 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2296 			}
2297 
2298 			size = MIN(bytes, PAGE_SIZE);
2299 
2300 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2301 			    size, is_write, offset)) {
2302 				pci_nvme_status_genc(&status,
2303 				    NVME_SC_DATA_TRANSFER_ERROR);
2304 				goto out;
2305 			}
2306 
2307 			offset += size;
2308 			bytes  -= size;
2309 
2310 			prp_list++;
2311 		}
2312 	}
2313 	req->io_req.br_callback = pci_nvme_io_done;
2314 	if (is_write)
2315 		err = blockif_write(nvstore->ctx, &req->io_req);
2316 	else
2317 		err = blockif_read(nvstore->ctx, &req->io_req);
2318 
2319 	if (err)
2320 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2321 out:
2322 	return (status);
2323 }
2324 
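/*
 * Handle NVMe Read and Write commands: validate the LBA range and transfer
 * size, then dispatch to the RAM or blockif backend. Returns true if the
 * command will complete asynchronously.
 */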
2325 static bool
2326 nvme_opc_write_read(struct pci_nvme_softc *sc,
2327     struct nvme_command *cmd,
2328     struct pci_nvme_blockstore *nvstore,
2329     struct pci_nvme_ioreq *req,
2330     uint16_t *status)
2331 {
2332 	uint64_t lba, nblocks, bytes;
2333 	size_t offset;
2334 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2335 	bool pending = false;
2336 
2337 #ifndef __FreeBSD__
2338 	bytes = 0;
2339 #endif
2340 
2341 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2342 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2343 
2344 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2345 		WPRINTF("%s command would exceed LBA range", __func__);
2346 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2347 		goto out;
2348 	}
2349 
2350 	bytes  = nblocks << nvstore->sectsz_bits;
2351 	if (bytes > NVME_MAX_DATA_SIZE) {
2352 		WPRINTF("%s command would exceed MDTS", __func__);
2353 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2354 		goto out;
2355 	}
2356 
2357 	offset = lba << nvstore->sectsz_bits;
2358 
2359 	req->bytes = bytes;
2360 	req->io_req.br_offset = lba;
2361 
2362 	/* PRP bits 1:0 must be zero */
2363 	cmd->prp1 &= ~0x3UL;
2364 	cmd->prp2 &= ~0x3UL;
2365 
2366 	if (nvstore->type == NVME_STOR_RAM) {
2367 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2368 		    cmd->prp2, offset, bytes, is_write);
2369 	} else {
2370 		*status = nvme_write_read_blockif(sc, nvstore, req,
2371 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2372 
2373 		if (*status == NVME_NO_STATUS)
2374 			pending = true;
2375 	}
2376 out:
2377 	if (!pending)
2378 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2379 
2380 	return (pending);
2381 }
2382 
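/*
 * Completion callback for multi-range Dataset Management deallocates.
 * prev_gpaddr tracks the index of the range just completed and prev_size the
 * total number of ranges; each callback issues the delete for the next range
 * and posts the NVMe completion once all ranges finish or an error occurs.
 */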
2383 static void
2384 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2385 {
2386 	struct pci_nvme_ioreq *req = br->br_param;
2387 	struct pci_nvme_softc *sc = req->sc;
2388 	bool done = true;
2389 	uint16_t status;
2390 
2391 #ifndef __FreeBSD__
2392 	status = 0;
2393 #endif
2394 
2395 	if (err) {
2396 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2397 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2398 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2399 	} else {
2400 		struct iovec *iov = req->io_req.br_iov;
2401 
2402 		req->prev_gpaddr++;
2403 		iov += req->prev_gpaddr;
2404 
2405 		/* The iov_* values already include the sector size */
2406 		req->io_req.br_offset = (off_t)iov->iov_base;
2407 		req->io_req.br_resid = iov->iov_len;
2408 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2409 			pci_nvme_status_genc(&status,
2410 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2411 		} else
2412 			done = false;
2413 	}
2414 
2415 	if (done) {
2416 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2417 		    req->cid, 0, status);
2418 		pci_nvme_release_ioreq(sc, req);
2419 	}
2420 }
2421 
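/*
 * Handle the Dataset Management command. Only the Deallocate attribute is
 * acted upon: the ranges are copied locally, validated, and issued to blockif
 * as delete (TRIM) requests, either as a single request or chained via
 * pci_nvme_dealloc_sm when multiple ranges are present. Other attributes and
 * all-zero-length range lists complete with Success.
 */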
2422 static bool
2423 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2424     struct nvme_command *cmd,
2425     struct pci_nvme_blockstore *nvstore,
2426     struct pci_nvme_ioreq *req,
2427     uint16_t *status)
2428 {
2429 	struct nvme_dsm_range *range;
2430 	uint32_t nr, r, non_zero, dr;
2431 	int err;
2432 	bool pending = false;
2433 
2434 #ifndef __FreeBSD__
2435 	range = NULL;
2436 #endif
2437 
2438 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2439 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2440 		goto out;
2441 	}
2442 
2443 	nr = cmd->cdw10 & 0xff;
2444 
2445 	/* copy locally because a range entry could straddle PRPs */
2446 	range = calloc(1, NVME_MAX_DSM_TRIM);
2447 	if (range == NULL) {
2448 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2449 		goto out;
2450 	}
2451 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2452 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2453 
2454 	/* Check for invalid ranges and the number of non-zero lengths */
2455 	non_zero = 0;
2456 	for (r = 0; r <= nr; r++) {
2457 		if (pci_nvme_out_of_range(nvstore,
2458 		    range[r].starting_lba, range[r].length)) {
2459 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2460 			goto out;
2461 		}
2462 		if (range[r].length != 0)
2463 			non_zero++;
2464 	}
2465 
2466 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2467 		size_t offset, bytes;
2468 		int sectsz_bits = sc->nvstore.sectsz_bits;
2469 
2470 		/*
2471 		 * DSM calls are advisory only, and compliant controllers
2472 		 * may choose to take no actions (i.e. return Success).
2473 		 */
2474 		if (!nvstore->deallocate) {
2475 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2476 			goto out;
2477 		}
2478 
2479 		/* If all ranges have a zero length, return Success */
2480 		if (non_zero == 0) {
2481 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2482 			goto out;
2483 		}
2484 
2485 		if (req == NULL) {
2486 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2487 			goto out;
2488 		}
2489 
2490 		offset = range[0].starting_lba << sectsz_bits;
2491 		bytes = range[0].length << sectsz_bits;
2492 
2493 		/*
2494 		 * If the request is for more than a single range, store
2495 		 * the ranges in the br_iov. Optimize for the common case
2496 		 * of a single range.
2497 		 *
2498 		 * Note that NVMe Number of Ranges is a zero based value
2499 		 */
2500 		req->io_req.br_iovcnt = 0;
2501 		req->io_req.br_offset = offset;
2502 		req->io_req.br_resid = bytes;
2503 
2504 		if (nr == 0) {
2505 			req->io_req.br_callback = pci_nvme_io_done;
2506 		} else {
2507 			struct iovec *iov = req->io_req.br_iov;
2508 
2509 			for (r = 0, dr = 0; r <= nr; r++) {
2510 				offset = range[r].starting_lba << sectsz_bits;
2511 				bytes = range[r].length << sectsz_bits;
2512 				if (bytes == 0)
2513 					continue;
2514 
2515 				if ((nvstore->size - offset) < bytes) {
2516 					pci_nvme_status_genc(status,
2517 					    NVME_SC_LBA_OUT_OF_RANGE);
2518 					goto out;
2519 				}
2520 				iov[dr].iov_base = (void *)offset;
2521 				iov[dr].iov_len = bytes;
2522 				dr++;
2523 			}
2524 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2525 
2526 			/*
2527 			 * Use prev_gpaddr to track the current entry and
2528 			 * prev_size to track the number of entries
2529 			 */
2530 			req->prev_gpaddr = 0;
2531 			req->prev_size = dr;
2532 		}
2533 
2534 		err = blockif_delete(nvstore->ctx, &req->io_req);
2535 		if (err)
2536 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2537 		else
2538 			pending = true;
2539 	}
2540 out:
2541 	free(range);
2542 	return (pending);
2543 }
2544 
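/*
 * Process new entries on an I/O Submission Queue: validate the namespace,
 * allocate an I/O request, and dispatch Flush, Read/Write, Write Zeroes, and
 * Dataset Management commands. Commands which do not complete asynchronously
 * are completed here.
 */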
2545 static void
2546 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2547 {
2548 	struct nvme_submission_queue *sq;
2549 	uint16_t status;
2550 	uint16_t sqhead;
2551 
2552 #ifndef __FreeBSD__
2553 	status = 0;
2554 #endif
2555 
2556 	/* handle all submissions up to sq->tail index */
2557 	sq = &sc->submit_queues[idx];
2558 
2559 	pthread_mutex_lock(&sq->mtx);
2560 
2561 	sqhead = sq->head;
2562 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2563 	         idx, sqhead, sq->tail, sq->qbase);
2564 
2565 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2566 		struct nvme_command *cmd;
2567 		struct pci_nvme_ioreq *req;
2568 		uint32_t nsid;
2569 		bool pending;
2570 
2571 		pending = false;
2572 		req = NULL;
2573 		status = 0;
2574 
2575 		cmd = &sq->qbase[sqhead];
2576 		sqhead = (sqhead + 1) % sq->size;
2577 
2578 		nsid = le32toh(cmd->nsid);
2579 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2580 			pci_nvme_status_genc(&status,
2581 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2582 			status |=
2583 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2584 			goto complete;
2585 		}
2586 
2587 		req = pci_nvme_get_ioreq(sc);
2588 		if (req == NULL) {
2589 			pci_nvme_status_genc(&status,
2590 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2591 			WPRINTF("%s: unable to allocate IO req", __func__);
2592 			goto complete;
2593 		}
2594 		req->nvme_sq = sq;
2595 		req->sqid = idx;
2596 		req->opc = cmd->opc;
2597 		req->cid = cmd->cid;
2598 		req->nsid = cmd->nsid;
2599 
2600 		switch (cmd->opc) {
2601 		case NVME_OPC_FLUSH:
2602 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2603 			    req, &status);
2604 			break;
2605 		case NVME_OPC_WRITE:
2606 		case NVME_OPC_READ:
2607 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2608 			    req, &status);
2609 			break;
2610 		case NVME_OPC_WRITE_ZEROES:
2611 			/* TODO: write zeroes
2612 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2613 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2614 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2615 			break;
2616 		case NVME_OPC_DATASET_MANAGEMENT:
2617 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2618 			    req, &status);
2619 			break;
2620 		default:
2621 			WPRINTF("%s unhandled io command 0x%x",
2622 			    __func__, cmd->opc);
2623 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2624 		}
2625 complete:
2626 		if (!pending) {
2627 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2628 			    status);
2629 			if (req != NULL)
2630 				pci_nvme_release_ioreq(sc, req);
2631 		}
2632 	}
2633 
2634 	sq->head = sqhead;
2635 
2636 	pthread_mutex_unlock(&sq->mtx);
2637 }
2638 
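/*
 * Handle a doorbell write. Submission queue doorbells update the queue's
 * tail pointer and kick command processing (the admin queue for index 0,
 * otherwise an I/O queue); completion queue doorbells update the head
 * pointer consumed by the guest. Out-of-range indices are rejected with a
 * warning.
 */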
2639 static void
2640 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2641 	uint64_t idx, int is_sq, uint64_t value)
2642 {
2643 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2644 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2645 
2646 	if (is_sq) {
2647 		if (idx > sc->num_squeues) {
2648 			WPRINTF("%s queue index %lu overflow from "
2649 			         "guest (max %u)",
2650 			         __func__, idx, sc->num_squeues);
2651 			return;
2652 		}
2653 
2654 		atomic_store_short(&sc->submit_queues[idx].tail,
2655 		                   (uint16_t)value);
2656 
2657 		if (idx == 0) {
2658 			pci_nvme_handle_admin_cmd(sc, value);
2659 		} else {
2660 			/* submission queue; handle new entries in SQ */
2661 			if (idx > sc->num_squeues) {
2662 				WPRINTF("%s SQ index %lu overflow from "
2663 				         "guest (max %u)",
2664 				         __func__, idx, sc->num_squeues);
2665 				return;
2666 			}
2667 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2668 		}
2669 	} else {
2670 		if (idx > sc->num_cqueues) {
2671 			WPRINTF("%s queue index %lu overflow from "
2672 			         "guest (max %u)",
2673 			         __func__, idx, sc->num_cqueues);
2674 			return;
2675 		}
2676 
2677 		atomic_store_short(&sc->compl_queues[idx].head,
2678 				(uint16_t)value);
2679 	}
2680 }
2681 
2682 static void
2683 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2684 {
2685 	const char *s = iswrite ? "WRITE" : "READ";
2686 
2687 	switch (offset) {
2688 	case NVME_CR_CAP_LOW:
2689 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2690 		break;
2691 	case NVME_CR_CAP_HI:
2692 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2693 		break;
2694 	case NVME_CR_VS:
2695 		DPRINTF("%s %s NVME_CR_VS", func, s);
2696 		break;
2697 	case NVME_CR_INTMS:
2698 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2699 		break;
2700 	case NVME_CR_INTMC:
2701 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2702 		break;
2703 	case NVME_CR_CC:
2704 		DPRINTF("%s %s NVME_CR_CC", func, s);
2705 		break;
2706 	case NVME_CR_CSTS:
2707 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2708 		break;
2709 	case NVME_CR_NSSR:
2710 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2711 		break;
2712 	case NVME_CR_AQA:
2713 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2714 		break;
2715 	case NVME_CR_ASQ_LOW:
2716 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2717 		break;
2718 	case NVME_CR_ASQ_HI:
2719 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2720 		break;
2721 	case NVME_CR_ACQ_LOW:
2722 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2723 		break;
2724 	case NVME_CR_ACQ_HI:
2725 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2726 		break;
2727 	default:
2728 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2729 	}
2730 
2731 }
2732 
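/*
 * Handle a guest write to BAR 0. Offsets at or above NVME_DOORBELL_OFFSET
 * are doorbell registers: each queue owns an 8 byte slot holding the 4 byte
 * SQ tail doorbell followed by the 4 byte CQ head doorbell, so the queue
 * index is (offset - NVME_DOORBELL_OFFSET) / 8. All other (controller
 * register) writes must be 4 bytes wide.
 */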
2733 static void
2734 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2735 	uint64_t offset, int size, uint64_t value)
2736 {
2737 	uint32_t ccreg;
2738 
2739 	if (offset >= NVME_DOORBELL_OFFSET) {
2740 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2741 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2742 		int is_sq = (belloffset % 8) < 4;
2743 
2744 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2745 			WPRINTF("guest attempted an overflow write offset "
2746 			         "0x%lx, val 0x%lx in %s",
2747 			         offset, value, __func__);
2748 			return;
2749 		}
2750 
2751 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2752 		return;
2753 	}
2754 
2755 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2756 	        offset, size, value);
2757 
2758 	if (size != 4) {
2759 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2760 		         "val 0x%lx) to bar0 in %s",
2761 		         size, offset, value, __func__);
2762 		/* TODO: shutdown device */
2763 		return;
2764 	}
2765 
2766 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2767 
2768 	pthread_mutex_lock(&sc->mtx);
2769 
2770 	switch (offset) {
2771 	case NVME_CR_CAP_LOW:
2772 	case NVME_CR_CAP_HI:
2773 		/* readonly */
2774 		break;
2775 	case NVME_CR_VS:
2776 		/* readonly */
2777 		break;
2778 	case NVME_CR_INTMS:
2779 		/* MSI-X, so ignore */
2780 		break;
2781 	case NVME_CR_INTMC:
2782 		/* MSI-X, so ignore */
2783 		break;
2784 	case NVME_CR_CC:
2785 		ccreg = (uint32_t)value;
2786 
2787 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2788 		         "iocqes %u",
2789 		        __func__,
2790 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2791 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2792 			 NVME_CC_GET_IOCQES(ccreg));
2793 
2794 		if (NVME_CC_GET_SHN(ccreg)) {
2795 			/* perform shutdown - flush out data to backend */
2796 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2797 			    NVME_CSTS_REG_SHST_SHIFT);
2798 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2799 			    NVME_CSTS_REG_SHST_SHIFT;
2800 		}
2801 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2802 			if (NVME_CC_GET_EN(ccreg) == 0)
2803 				/* transition 1->0 causes controller reset */
2804 				pci_nvme_reset_locked(sc);
2805 			else
2806 				pci_nvme_init_controller(ctx, sc);
2807 		}
2808 
2809 		/* Insert the iocqes, iosqes and en bits from the write */
2810 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2811 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2812 		if (NVME_CC_GET_EN(ccreg) == 0) {
2813 			/* Insert the ams, mps and css bit fields */
2814 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2815 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2816 			sc->regs.csts &= ~NVME_CSTS_RDY;
2817 		} else if (sc->pending_ios == 0) {
2818 			sc->regs.csts |= NVME_CSTS_RDY;
2819 		}
2820 		break;
2821 	case NVME_CR_CSTS:
2822 		break;
2823 	case NVME_CR_NSSR:
2824 		/* ignore writes; don't support subsystem reset */
2825 		break;
2826 	case NVME_CR_AQA:
2827 		sc->regs.aqa = (uint32_t)value;
2828 		break;
2829 	case NVME_CR_ASQ_LOW:
2830 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2831 		               (0xFFFFF000 & value);
2832 		break;
2833 	case NVME_CR_ASQ_HI:
2834 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2835 		               (value << 32);
2836 		break;
2837 	case NVME_CR_ACQ_LOW:
2838 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2839 		               (0xFFFFF000 & value);
2840 		break;
2841 	case NVME_CR_ACQ_HI:
2842 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2843 		               (value << 32);
2844 		break;
2845 	default:
2846 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2847 		         __func__, offset, value, size);
2848 	}
2849 	pthread_mutex_unlock(&sc->mtx);
2850 }
2851 
2852 static void
2853 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2854                 int baridx, uint64_t offset, int size, uint64_t value)
2855 {
2856 	struct pci_nvme_softc* sc = pi->pi_arg;
2857 
2858 	if (baridx == pci_msix_table_bar(pi) ||
2859 	    baridx == pci_msix_pba_bar(pi)) {
2860 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2861 		         " value 0x%lx", baridx, offset, size, value);
2862 
2863 		pci_emul_msix_twrite(pi, offset, size, value);
2864 		return;
2865 	}
2866 
2867 	switch (baridx) {
2868 	case 0:
2869 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2870 		break;
2871 
2872 	default:
2873 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2874 		         __func__, baridx, value);
2875 	}
2876 }
2877 
2878 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2879 	uint64_t offset, int size)
2880 {
2881 	uint64_t value;
2882 
2883 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2884 
2885 	if (offset < NVME_DOORBELL_OFFSET) {
2886 		void *p = &(sc->regs);
2887 		pthread_mutex_lock(&sc->mtx);
2888 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2889 		pthread_mutex_unlock(&sc->mtx);
2890 	} else {
2891 		value = 0;
2892 		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2893 	}
2894 
2895 	switch (size) {
2896 	case 1:
2897 		value &= 0xFF;
2898 		break;
2899 	case 2:
2900 		value &= 0xFFFF;
2901 		break;
2902 	case 4:
2903 		value &= 0xFFFFFFFF;
2904 		break;
2905 	}
2906 
2907 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2908 	         offset, size, (uint32_t)value);
2909 
2910 	return (value);
2911 }
2912 
2913 
2914 
2915 static uint64_t
2916 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2917     uint64_t offset, int size)
2918 {
2919 	struct pci_nvme_softc* sc = pi->pi_arg;
2920 
2921 	if (baridx == pci_msix_table_bar(pi) ||
2922 	    baridx == pci_msix_pba_bar(pi)) {
2923 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2924 		        baridx, offset, size);
2925 
2926 		return pci_emul_msix_tread(pi, offset, size);
2927 	}
2928 
2929 	switch (baridx) {
2930 	case 0:
2931 		return pci_nvme_read_bar_0(sc, offset, size);
2932 
2933 	default:
2934 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2935 	}
2936 
2937 	return (0);
2938 }
2939 
2940 static int
2941 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2942 {
2943 	char bident[sizeof("XX:X:X")];
2944 	const char *value;
2945 	uint32_t sectsz;
2946 
2947 	sc->max_queues = NVME_QUEUES;
2948 	sc->max_qentries = NVME_MAX_QENTRIES;
2949 	sc->ioslots = NVME_IOSLOTS;
2950 	sc->num_squeues = sc->max_queues;
2951 	sc->num_cqueues = sc->max_queues;
2952 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2953 	sectsz = 0;
2954 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2955 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2956 
2957 	value = get_config_value_node(nvl, "maxq");
2958 	if (value != NULL)
2959 		sc->max_queues = atoi(value);
2960 	value = get_config_value_node(nvl, "qsz");
2961 	if (value != NULL) {
2962 		sc->max_qentries = atoi(value);
2963 		if (sc->max_qentries <= 0) {
2964 			EPRINTLN("nvme: Invalid qsz option %d",
2965 			    sc->max_qentries);
2966 			return (-1);
2967 		}
2968 	}
2969 	value = get_config_value_node(nvl, "ioslots");
2970 	if (value != NULL) {
2971 		sc->ioslots = atoi(value);
2972 		if (sc->ioslots <= 0) {
2973 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2974 			return (-1);
2975 		}
2976 	}
2977 	value = get_config_value_node(nvl, "sectsz");
2978 	if (value != NULL)
2979 		sectsz = atoi(value);
2980 	value = get_config_value_node(nvl, "ser");
2981 	if (value != NULL) {
2982 		/*
2983 		 * This field indicates the Product Serial Number in
2984 		 * 7-bit ASCII, unused bytes should be space characters.
2985 		 * Ref: NVMe v1.3c.
2986 		 */
2987 		cpywithpad((char *)sc->ctrldata.sn,
2988 		    sizeof(sc->ctrldata.sn), value, ' ');
2989 	}
2990 	value = get_config_value_node(nvl, "eui64");
2991 	if (value != NULL)
2992 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2993 	value = get_config_value_node(nvl, "dsm");
2994 	if (value != NULL) {
2995 		if (strcmp(value, "auto") == 0)
2996 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2997 		else if (strcmp(value, "enable") == 0)
2998 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2999 		else if (strcmp(value, "disable") == 0)
3000 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3001 	}
3002 
3003 	value = get_config_value_node(nvl, "ram");
3004 	if (value != NULL) {
3005 		uint64_t sz = strtoull(value, NULL, 10);
3006 
3007 		sc->nvstore.type = NVME_STOR_RAM;
3008 		sc->nvstore.size = sz * 1024 * 1024;
3009 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3010 		sc->nvstore.sectsz = 4096;
3011 		sc->nvstore.sectsz_bits = 12;
3012 		if (sc->nvstore.ctx == NULL) {
3013 			EPRINTLN("nvme: Unable to allocate RAM");
3014 			return (-1);
3015 		}
3016 	} else {
3017 		snprintf(bident, sizeof(bident), "%d:%d",
3018 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3019 		sc->nvstore.ctx = blockif_open(nvl, bident);
3020 		if (sc->nvstore.ctx == NULL) {
3021 			EPRINTLN("nvme: Could not open backing file: %s",
3022 			    strerror(errno));
3023 			return (-1);
3024 		}
3025 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3026 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3027 	}
3028 
3029 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3030 		sc->nvstore.sectsz = sectsz;
3031 	else if (sc->nvstore.type != NVME_STOR_RAM)
3032 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3033 	for (sc->nvstore.sectsz_bits = 9;
3034 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3035 	     sc->nvstore.sectsz_bits++);
3036 
3037 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3038 		sc->max_queues = NVME_QUEUES;
3039 
3040 	return (0);
3041 }
3042 
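/*
 * blockif resize callback: update the namespace size, record NSID 1 in the
 * Changed Namespace List log page, and post a Namespace Attribute Changed
 * asynchronous event.
 */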
3043 static void
3044 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3045 {
3046 	struct pci_nvme_softc *sc;
3047 	struct pci_nvme_blockstore *nvstore;
3048 	struct nvme_namespace_data *nd;
3049 
3050 	sc = arg;
3051 	nvstore = &sc->nvstore;
3052 	nd = &sc->nsdata;
3053 
3054 	nvstore->size = new_size;
3055 	pci_nvme_init_nsdata_size(nvstore, nd);
3056 
3057 	/* Add changed NSID to list */
3058 	sc->ns_log.ns[0] = 1;
3059 	sc->ns_log.ns[1] = 0;
3060 
3061 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3062 	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
3063 }
3064 
3065 static int
3066 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3067 {
3068 	struct pci_nvme_softc *sc;
3069 	uint32_t pci_membar_sz;
3070 	int	error;
3071 
3072 	error = 0;
3073 
3074 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3075 	pi->pi_arg = sc;
3076 	sc->nsc_pi = pi;
3077 
3078 	error = pci_nvme_parse_config(sc, nvl);
3079 	if (error < 0)
3080 		goto done;
3081 	else
3082 		error = 0;
3083 
3084 	STAILQ_INIT(&sc->ioreqs_free);
3085 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3086 	for (int i = 0; i < sc->ioslots; i++) {
3087 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3088 	}
3089 
3090 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3091 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3092 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3093 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3094 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3095 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3096 
3097 	/*
3098 	 * Allocate size of NVMe registers + doorbell space for all queues.
3099 	 *
3100 	 * The specification requires a minimum memory I/O window size of 16K.
3101 	 * The Windows driver will refuse to start a device with a smaller
3102 	 * window.
3103 	 */
3104 	pci_membar_sz = sizeof(struct nvme_registers) +
3105 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3106 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3107 
3108 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3109 
3110 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3111 	if (error) {
3112 		WPRINTF("%s pci alloc mem bar failed", __func__);
3113 		goto done;
3114 	}
3115 
3116 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3117 	if (error) {
3118 		WPRINTF("%s pci add msixcap failed", __func__);
3119 		goto done;
3120 	}
3121 
3122 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3123 	if (error) {
3124 		WPRINTF("%s pci add Express capability failed", __func__);
3125 		goto done;
3126 	}
3127 
3128 	pthread_mutex_init(&sc->mtx, NULL);
3129 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3130 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3131 
3132 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3133 	/*
3134 	 * Controller data depends on Namespace data so initialize Namespace
3135 	 * data first.
3136 	 */
3137 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3138 	pci_nvme_init_ctrldata(sc);
3139 	pci_nvme_init_logpages(sc);
3140 	pci_nvme_init_features(sc);
3141 
3142 	pci_nvme_aer_init(sc);
3143 	pci_nvme_aen_init(sc);
3144 
3145 	pci_nvme_reset(sc);
3146 
3147 	pci_lintr_request(pi);
3148 
3149 done:
3150 	return (error);
3151 }
3152 
3153 static int
3154 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3155 {
3156 	char *cp, *ram;
3157 
3158 	if (opts == NULL)
3159 		return (0);
3160 
3161 	if (strncmp(opts, "ram=", 4) == 0) {
3162 		cp = strchr(opts, ',');
3163 		if (cp == NULL) {
3164 			set_config_value_node(nvl, "ram", opts + 4);
3165 			return (0);
3166 		}
3167 		ram = strndup(opts + 4, cp - opts - 4);
3168 		set_config_value_node(nvl, "ram", ram);
3169 		free(ram);
3170 		return (pci_parse_legacy_config(nvl, cp + 1));
3171 	} else
3172 		return (blockif_legacy_config(nvl, opts));
3173 }
3174 
3175 struct pci_devemu pci_de_nvme = {
3176 	.pe_emu =	"nvme",
3177 	.pe_init =	pci_nvme_init,
3178 	.pe_legacy_config = pci_nvme_legacy_config,
3179 	.pe_barwrite =	pci_nvme_write,
3180 	.pe_barread =	pci_nvme_read
3181 };
3182 PCI_EMUL_SET(pci_de_nvme);
3183