xref: /freebsd/usr.sbin/bhyve/pci_nvme.c (revision cc426dd31990b8b50b210efc450e404596548ca1)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * bhyve PCIe-NVMe device emulation.
31  *
32  * options:
33  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z
34  *
35  *  accepted devpath:
36  *    /dev/blockdev
37  *    /path/to/image
38  *    ram=size_in_MiB
39  *
40  *  maxq    = max number of queues
41  *  qsz     = max elements in each queue
42  *  ioslots = max number of concurrent io requests
43  *  sectsz  = sector size (defaults to blockif sector size)
44  *  ser     = serial number (20-chars max)
45  *
46  */
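/*
 * Example invocation (hypothetical slot number and backing device, shown for
 * illustration only):
 *
 *  -s 4,nvme,/dev/zvol/tank/nvme0,maxq=4,qsz=512,ioslots=16,sectsz=512,ser=NVME0001
 */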
47 
48 /* TODO:
49     - create async event for smart and log
50     - intr coalesce
51  */
52 
53 #include <sys/cdefs.h>
54 __FBSDID("$FreeBSD$");
55 
56 #include <sys/types.h>
57 
58 #include <assert.h>
59 #include <pthread.h>
60 #include <semaphore.h>
61 #include <stdbool.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <string.h>
67 
68 #include <machine/atomic.h>
69 #include <machine/vmm.h>
70 #include <vmmapi.h>
71 
72 #include <dev/nvme/nvme.h>
73 
74 #include "bhyverun.h"
75 #include "block_if.h"
76 #include "pci_emul.h"
77 
78 
79 static int nvme_debug = 0;
80 #define	DPRINTF(params) do { if (nvme_debug) printf params; } while (0)
81 #define	WPRINTF(params) printf params
82 
83 /* defaults; can be overridden */
84 #define	NVME_MSIX_BAR		4
85 
86 #define	NVME_IOSLOTS		8
87 
88 #define	NVME_QUEUES		16
89 #define	NVME_MAX_QENTRIES	2048
90 
91 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
92 #define	NVME_MAX_BLOCKIOVS	512
93 
94 /* helpers */
95 
96 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
97 
98 enum nvme_controller_register_offsets {
99 	NVME_CR_CAP_LOW = 0x00,
100 	NVME_CR_CAP_HI  = 0x04,
101 	NVME_CR_VS      = 0x08,
102 	NVME_CR_INTMS   = 0x0c,
103 	NVME_CR_INTMC   = 0x10,
104 	NVME_CR_CC      = 0x14,
105 	NVME_CR_CSTS    = 0x1c,
106 	NVME_CR_NSSR    = 0x20,
107 	NVME_CR_AQA     = 0x24,
108 	NVME_CR_ASQ_LOW = 0x28,
109 	NVME_CR_ASQ_HI  = 0x2c,
110 	NVME_CR_ACQ_LOW = 0x30,
111 	NVME_CR_ACQ_HI  = 0x34,
112 };
113 
114 enum nvme_cmd_cdw11 {
115 	NVME_CMD_CDW11_PC  = 0x0001,
116 	NVME_CMD_CDW11_IEN = 0x0002,
117 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
118 };
119 
120 #define	NVME_CQ_INTEN	0x01
121 #define	NVME_CQ_INTCOAL	0x02
122 
123 struct nvme_completion_queue {
124 	struct nvme_completion *qbase;
125 	uint32_t	size;
126 	uint16_t	tail; /* nvme progress */
127 	uint16_t	head; /* guest progress */
128 	uint16_t	intr_vec;
129 	uint32_t	intr_en;
130 	pthread_mutex_t	mtx;
131 };
132 
133 struct nvme_submission_queue {
134 	struct nvme_command *qbase;
135 	uint32_t	size;
136 	uint16_t	head; /* nvme progress */
137 	uint16_t	tail; /* guest progress */
138 	uint16_t	cqid; /* completion queue id */
139 	int		busy; /* queue is being processed */
140 	int		qpriority;
141 };
142 
143 enum nvme_storage_type {
144 	NVME_STOR_BLOCKIF = 0,
145 	NVME_STOR_RAM = 1,
146 };
147 
148 struct pci_nvme_blockstore {
149 	enum nvme_storage_type type;
150 	void		*ctx;
151 	uint64_t	size;
152 	uint32_t	sectsz;
153 	uint32_t	sectsz_bits;
154 };
155 
156 struct pci_nvme_ioreq {
157 	struct pci_nvme_softc *sc;
158 	struct pci_nvme_ioreq *next;
159 	struct nvme_submission_queue *nvme_sq;
160 	uint16_t	sqid;
161 
162 	/* command information */
163 	uint16_t	opc;
164 	uint16_t	cid;
165 	uint32_t	nsid;
166 
167 	uint64_t	prev_gpaddr;
168 	size_t		prev_size;
169 
170 	/*
171 	 * lock if all iovs consumed (big IO);
172 	 * complete transaction before continuing
173 	 */
174 	pthread_mutex_t	mtx;
175 	pthread_cond_t	cv;
176 
177 	struct blockif_req io_req;
178 
179 	/* pad to fit up to 512 page descriptors from guest IO request */
180 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
181 };
182 
183 struct pci_nvme_softc {
184 	struct pci_devinst *nsc_pi;
185 
186 	pthread_mutex_t	mtx;
187 
188 	struct nvme_registers regs;
189 
190 	struct nvme_namespace_data  nsdata;
191 	struct nvme_controller_data ctrldata;
192 
193 	struct pci_nvme_blockstore nvstore;
194 
195 	uint16_t	max_qentries; /* max entries per queue */
196 	uint32_t	max_queues;
197 	uint32_t	num_cqueues;
198 	uint32_t	num_squeues;
199 
200 	struct pci_nvme_ioreq *ioreqs;
201 	struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */
202 	uint32_t	pending_ios;
203 	uint32_t	ioslots;
204 	sem_t		iosemlock;
205 
206 	/* status and guest memory mapped queues */
207 	struct nvme_completion_queue *compl_queues;
208 	struct nvme_submission_queue *submit_queues;
209 
210 	/* controller features */
211 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
212 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
213 	uint32_t	async_ev_config;         /* 0x0B: async event config */
214 };
215 
216 
217 static void pci_nvme_io_partial(struct blockif_req *br, int err);
218 
219 /* Controller Configuration utils */
220 #define	NVME_CC_GET_EN(cc) \
221 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
222 #define	NVME_CC_GET_CSS(cc) \
223 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
224 #define	NVME_CC_GET_SHN(cc) \
225 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
226 #define	NVME_CC_GET_IOSQES(cc) \
227 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
228 #define	NVME_CC_GET_IOCQES(cc) \
229 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
230 
231 #define	NVME_CC_WRITE_MASK \
232 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
233 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
234 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
235 
236 #define	NVME_CC_NEN_WRITE_MASK \
237 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
238 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
239 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
240 
241 /* Controller Status utils */
242 #define	NVME_CSTS_GET_RDY(sts) \
243 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
244 
245 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
246 
247 /* Completion Queue status word utils */
248 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
249 #define	NVME_STATUS_MASK \
250 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
251 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
252 
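/*
 * Copy 'src' into 'dst' and fill the remainder of 'dst' with 'pad'.  NVMe
 * identify strings (model, firmware revision, serial) are fixed-width and
 * padded rather than NUL-terminated.
 */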
253 static __inline void
254 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
255 {
256 	size_t len;
257 
258 	len = strnlen(src, dst_size);
259 	memset(dst, pad, dst_size);
260 	memcpy(dst, src, len);
261 }
262 
263 static __inline void
264 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
265 {
266 
267 	*status &= ~NVME_STATUS_MASK;
268 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
269 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
270 }
271 
272 static __inline void
273 pci_nvme_status_genc(uint16_t *status, uint16_t code)
274 {
275 
276 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
277 }
278 
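/*
 * Flip the Phase Tag bit of a completion status word.  The guest compares
 * the phase against its expected value to detect newly posted entries as
 * the completion queue wraps.
 */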
279 static __inline void
280 pci_nvme_toggle_phase(uint16_t *status, int prev)
281 {
282 
283 	if (prev)
284 		*status &= ~NVME_STATUS_P;
285 	else
286 		*status |= NVME_STATUS_P;
287 }
288 
289 static void
290 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
291 {
292 	struct nvme_controller_data *cd = &sc->ctrldata;
293 
294 	cd->vid = 0xFB5D;
295 	cd->ssvid = 0x0000;
296 
297 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
298 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
299 
300 	/* Recommended Arbitration Burst: up to 2^rab commands fetched at a time */
301 	cd->rab   = 4;
302 
303 	/* FreeBSD OUI */
304 	cd->ieee[0] = 0x58;
305 	cd->ieee[1] = 0x9c;
306 	cd->ieee[2] = 0xfc;
307 
308 	cd->mic = 0;
309 
310 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
311 
312 	cd->ver = 0x00010300;
313 
314 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
315 	cd->acl = 2;
316 	cd->aerl = 4;
317 
318 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
319 	cd->elpe = 0;	/* max error log page entries */
320 	cd->npss = 1;	/* number of power states supported */
321 
322 	/* Warning Composite Temperature Threshold */
323 	cd->wctemp = 0x0157;
324 
325 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
326 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
327 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
328 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
329 	cd->nn = 1;	/* number of namespaces */
330 
331 	cd->fna = 0x03;
332 
333 	cd->power_state[0].mp = 10;
334 }
335 
336 static void
337 pci_nvme_init_nsdata(struct pci_nvme_softc *sc)
338 {
339 	struct nvme_namespace_data *nd;
340 
341 	nd = &sc->nsdata;
342 
343 	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
344 	nd->ncap = nd->nsze;
345 	nd->nuse = nd->nsze;
346 
347 	/* Derive LBA format information from the backing store */
348 	nd->nlbaf = 0; /* NLBAF is a 0-based count of supported LBA formats */
349 	/* LBA data-sz = 2^lbads */
350 	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
351 
352 	nd->flbas = 0;
353 }
354 
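/*
 * Reset controller register and queue state.  The Admin queue mappings are
 * preserved so the emulation stays in sync with the guest; callers hold
 * sc->mtx.
 */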
355 static void
356 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
357 {
358 	DPRINTF(("%s\r\n", __func__));
359 
360 	sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) |
361 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
362 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
363 
364 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
365 
366 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
367 
368 	sc->regs.cc = 0;
369 	sc->regs.csts = 0;
370 
371 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
372 	if (sc->submit_queues != NULL) {
373 		for (int i = 0; i <= sc->max_queues; i++) {
374 			/*
375 			 * The Admin Submission Queue is at index 0.
376 			 * It must not be changed at reset otherwise the
377 			 * emulation will be out of sync with the guest.
378 			 */
379 			if (i != 0) {
380 				sc->submit_queues[i].qbase = NULL;
381 				sc->submit_queues[i].size = 0;
382 				sc->submit_queues[i].cqid = 0;
383 
384 				sc->compl_queues[i].qbase = NULL;
385 				sc->compl_queues[i].size = 0;
386 			}
387 			sc->submit_queues[i].tail = 0;
388 			sc->submit_queues[i].head = 0;
389 			sc->submit_queues[i].busy = 0;
390 
391 			sc->compl_queues[i].tail = 0;
392 			sc->compl_queues[i].head = 0;
393 		}
394 	} else
395 		sc->submit_queues = calloc(sc->max_queues + 1,
396 		                        sizeof(struct nvme_submission_queue));
397 
398 	if (sc->compl_queues == NULL) {
399 		sc->compl_queues = calloc(sc->max_queues + 1,
400 		                        sizeof(struct nvme_completion_queue));
401 
402 		for (int i = 0; i <= sc->num_cqueues; i++)
403 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
404 	}
405 }
406 
407 static void
408 pci_nvme_reset(struct pci_nvme_softc *sc)
409 {
410 	pthread_mutex_lock(&sc->mtx);
411 	pci_nvme_reset_locked(sc);
412 	pthread_mutex_unlock(&sc->mtx);
413 }
414 
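/*
 * Called when the guest enables the controller: map the Admin SQ and CQ
 * using the sizes from AQA and the guest addresses in ASQ/ACQ.
 */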
415 static void
416 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
417 {
418 	uint16_t acqs, asqs;
419 
420 	DPRINTF(("%s\r\n", __func__));
421 
422 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
423 	sc->submit_queues[0].size = asqs;
424 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
425 	            sizeof(struct nvme_command) * asqs);
426 
427 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n",
428 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
429 
430 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
431 	    NVME_AQA_REG_ACQS_MASK) + 1;
432 	sc->compl_queues[0].size = acqs;
433 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
434 	         sizeof(struct nvme_completion) * acqs);
435 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n",
436 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
437 }
438 
439 static int
440 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
441 	struct nvme_completion* compl)
442 {
443 	uint16_t qid = command->cdw10 & 0xffff;
444 
445 	DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid));
446 	if (qid == 0 || qid > sc->num_squeues) {
447 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n",
448 		        __func__, qid, sc->num_squeues));
449 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
450 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
451 		return (1);
452 	}
453 
454 	sc->submit_queues[qid].qbase = NULL;
455 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
456 	return (1);
457 }
458 
459 static int
460 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
461 	struct nvme_completion* compl)
462 {
463 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
464 		uint16_t qid = command->cdw10 & 0xffff;
465 		struct nvme_submission_queue *nsq;
466 
467 		if (qid > sc->num_squeues) {
468 			WPRINTF(("%s queue index %u > num_squeues %u\r\n",
469 			        __func__, qid, sc->num_squeues));
470 			pci_nvme_status_tc(&compl->status,
471 			    NVME_SCT_COMMAND_SPECIFIC,
472 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
473 			return (1);
474 		}
475 
476 		nsq = &sc->submit_queues[qid];
477 		nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
478 
479 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
480 		              sizeof(struct nvme_command) * (size_t)nsq->size);
481 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
482 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
483 
484 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
485 		        qid, nsq->size, nsq->qbase, nsq->cqid));
486 
487 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
488 
489 		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
490 		         __func__, qid));
491 	} else {
492 		/*
493 		 * Guest requested a non-contiguous submission queue.  This
494 		 * setting is unsupported by this emulation.
495 		 */
496 		WPRINTF(("%s unsupported non-contig (list-based) "
497 		         "create i/o submission queue\r\n", __func__));
498 
499 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
500 	}
501 	return (1);
502 }
503 
504 static int
505 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
506 	struct nvme_completion* compl)
507 {
508 	uint16_t qid = command->cdw10 & 0xffff;
509 
510 	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
511 	if (qid == 0 || qid > sc->num_cqueues) {
512 		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
513 		        __func__, qid, sc->num_cqueues));
514 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
515 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
516 		return (1);
517 	}
518 
519 	sc->compl_queues[qid].qbase = NULL;
520 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
521 	return (1);
522 }
523 
524 static int
525 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
526 	struct nvme_completion* compl)
527 {
528 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
529 		uint16_t qid = command->cdw10 & 0xffff;
530 		struct nvme_completion_queue *ncq;
531 
532 		if (qid > sc->num_cqueues) {
533 			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
534 			        __func__, qid, sc->num_cqueues));
535 			pci_nvme_status_tc(&compl->status,
536 			    NVME_SCT_COMMAND_SPECIFIC,
537 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
538 			return (1);
539 		}
540 
541 		ncq = &sc->compl_queues[qid];
542 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
543 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
544 		ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1;
545 
546 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
547 		             command->prp1,
548 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
549 
550 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
551 	} else {
552 		/*
553 		 * Non-contig completion queue unsupported.
554 		 */
555 		WPRINTF(("%s unsupported non-contig (list-based) "
556 		         "create i/o completion queue\r\n",
557 		         __func__));
558 
559 		/* 0x12 = Invalid Use of Controller Memory Buffer */
560 		pci_nvme_status_genc(&compl->status, 0x12);
561 	}
562 
563 	return (1);
564 }
565 
566 static int
567 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
568 	struct nvme_completion* compl)
569 {
570 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4; /* NUMD is in dwords */
571 	uint8_t logpage = command->cdw10 & 0xFF;
572 	void *data;
573 
574 	DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize));
575 
576 	if (logpage >= 1 && logpage <= 3)
577 		data = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
578 		                  PAGE_SIZE);
579 
580 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
581 
582 	switch (logpage) {
583 	case 0x01: /* Error information */
584 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
585 		break;
586 	case 0x02: /* SMART/Health information */
587 		/* TODO: present some smart info */
588 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
589 		break;
590 	case 0x03: /* Firmware slot information */
591 		memset(data, 0, logsize > PAGE_SIZE ? PAGE_SIZE : logsize);
592 		break;
593 	default:
594 		WPRINTF(("%s get log page %x command not supported\r\n",
595 		        __func__, logpage));
596 
597 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
598 		    NVME_SC_INVALID_LOG_PAGE);
599 	}
600 
601 	return (1);
602 }
603 
604 static int
605 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
606 	struct nvme_completion* compl)
607 {
608 	void *dest;
609 
610 	DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__,
611 	        command->cdw10 & 0xFF, command->nsid));
612 
613 	switch (command->cdw10 & 0xFF) {
614 	case 0x00: /* return Identify Namespace data structure */
615 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
616 		                  sizeof(sc->nsdata));
617 		memcpy(dest, &sc->nsdata, sizeof(sc->nsdata));
618 		break;
619 	case 0x01: /* return Identify Controller data structure */
620 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
621 		                  sizeof(sc->ctrldata));
622 		memcpy(dest, &sc->ctrldata, sizeof(sc->ctrldata));
623 		break;
624 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
625 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
626 		                  sizeof(uint32_t) * 1024);
627 		((uint32_t *)dest)[0] = 1;
628 		((uint32_t *)dest)[1] = 0;
629 		break;
630 	case 0x11:
631 		pci_nvme_status_genc(&compl->status,
632 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
633 		return (1);
634 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
635 	case 0x10:
636 	case 0x12:
637 	case 0x13:
638 	case 0x14:
639 	case 0x15:
640 	default:
641 		DPRINTF(("%s unsupported identify command requested 0x%x\r\n",
642 		         __func__, command->cdw10 & 0xFF));
643 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
644 		return (1);
645 	}
646 
647 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
648 	return (1);
649 }
650 
651 static int
652 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
653 	struct nvme_completion* compl)
654 {
655 	int feature = command->cdw10 & 0xFF;
656 	uint32_t iv;
657 
658 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
659 	compl->cdw0 = 0;
660 
661 	switch (feature) {
662 	case NVME_FEAT_ARBITRATION:
663 		DPRINTF(("  arbitration 0x%x\r\n", command->cdw11));
664 		break;
665 	case NVME_FEAT_POWER_MANAGEMENT:
666 		DPRINTF(("  power management 0x%x\r\n", command->cdw11));
667 		break;
668 	case NVME_FEAT_LBA_RANGE_TYPE:
669 		DPRINTF(("  lba range 0x%x\r\n", command->cdw11));
670 		break;
671 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
672 		DPRINTF(("  temperature threshold 0x%x\r\n", command->cdw11));
673 		break;
674 	case NVME_FEAT_ERROR_RECOVERY:
675 		DPRINTF(("  error recovery 0x%x\r\n", command->cdw11));
676 		break;
677 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
678 		DPRINTF(("  volatile write cache 0x%x\r\n", command->cdw11));
679 		break;
680 	case NVME_FEAT_NUMBER_OF_QUEUES:
681 		sc->num_squeues = command->cdw11 & 0xFFFF;
682 		sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF;
683 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
684 		        sc->num_squeues, sc->num_cqueues));
685 
686 		if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues)
687 			sc->num_squeues = sc->max_queues;
688 		if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues)
689 			sc->num_cqueues = sc->max_queues;
690 
691 		compl->cdw0 = (sc->num_squeues & 0xFFFF) |
692 		              ((sc->num_cqueues & 0xFFFF) << 16);
693 
694 		break;
695 	case NVME_FEAT_INTERRUPT_COALESCING:
696 		DPRINTF(("  interrupt coalescing 0x%x\r\n", command->cdw11));
697 
698 		/* in uS */
699 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
700 
701 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
702 		break;
703 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
704 		iv = command->cdw11 & 0xFFFF;
705 
706 		DPRINTF(("  interrupt vector configuration 0x%x\r\n",
707 		        command->cdw11));
708 
709 		for (uint32_t i = 0; i <= sc->num_cqueues; i++) {
710 			if (sc->compl_queues[i].intr_vec == iv) {
711 				if (command->cdw11 & (1 << 16))
712 					sc->compl_queues[i].intr_en |=
713 					                      NVME_CQ_INTCOAL;
714 				else
715 					sc->compl_queues[i].intr_en &=
716 					                     ~NVME_CQ_INTCOAL;
717 			}
718 		}
719 		break;
720 	case NVME_FEAT_WRITE_ATOMICITY:
721 		DPRINTF(("  write atomicity 0x%x\r\n", command->cdw11));
722 		break;
723 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
724 		DPRINTF(("  async event configuration 0x%x\r\n",
725 		        command->cdw11));
726 		sc->async_ev_config = command->cdw11;
727 		break;
728 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
729 		DPRINTF(("  software progress marker 0x%x\r\n",
730 		        command->cdw11));
731 		break;
732 	case 0x0C:
733 		DPRINTF(("  autonomous power state transition 0x%x\r\n",
734 		        command->cdw11));
735 		break;
736 	default:
737 		WPRINTF(("%s invalid feature\r\n", __func__));
738 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
739 		return (1);
740 	}
741 
742 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
743 	return (1);
744 }
745 
746 static int
747 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
748 	struct nvme_completion* compl)
749 {
750 	int feature = command->cdw10 & 0xFF;
751 
752 	DPRINTF(("%s feature 0x%x\r\n", __func__, feature));
753 
754 	compl->cdw0 = 0;
755 
756 	switch (feature) {
757 	case NVME_FEAT_ARBITRATION:
758 		DPRINTF(("  arbitration\r\n"));
759 		break;
760 	case NVME_FEAT_POWER_MANAGEMENT:
761 		DPRINTF(("  power management\r\n"));
762 		break;
763 	case NVME_FEAT_LBA_RANGE_TYPE:
764 		DPRINTF(("  lba range\r\n"));
765 		break;
766 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
767 		DPRINTF(("  temperature threshold\r\n"));
768 		switch ((command->cdw11 >> 20) & 0x3) {
769 		case 0:
770 			/* Over temp threshold */
771 			compl->cdw0 = 0xFFFF;
772 			break;
773 		case 1:
774 			/* Under temp threshold */
775 			compl->cdw0 = 0;
776 			break;
777 		default:
778 			WPRINTF(("  invalid threshold type select\r\n"));
779 			pci_nvme_status_genc(&compl->status,
780 			    NVME_SC_INVALID_FIELD);
781 			return (1);
782 		}
783 		break;
784 	case NVME_FEAT_ERROR_RECOVERY:
785 		DPRINTF(("  error recovery\r\n"));
786 		break;
787 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
788 		DPRINTF(("  volatile write cache\r\n"));
789 		break;
790 	case NVME_FEAT_NUMBER_OF_QUEUES:
791 		compl->cdw0 = 0;
792 		if (sc->num_squeues == 0)
793 			compl->cdw0 |= sc->max_queues & 0xFFFF;
794 		else
795 			compl->cdw0 |= sc->num_squeues & 0xFFFF;
796 
797 		if (sc->num_cqueues == 0)
798 			compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16;
799 		else
800 			compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16;
801 
802 		DPRINTF(("  number of queues (submit %u, completion %u)\r\n",
803 		        compl->cdw0 & 0xFFFF,
804 		        (compl->cdw0 >> 16) & 0xFFFF));
805 
806 		break;
807 	case NVME_FEAT_INTERRUPT_COALESCING:
808 		DPRINTF(("  interrupt coalescing\r\n"));
809 		break;
810 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
811 		DPRINTF(("  interrupt vector configuration\r\n"));
812 		break;
813 	case NVME_FEAT_WRITE_ATOMICITY:
814 		DPRINTF(("  write atomicity\r\n"));
815 		break;
816 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
817 		DPRINTF(("  async event configuration\r\n"));
818 		sc->async_ev_config = command->cdw11;
819 		break;
820 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
821 		DPRINTF(("  software progress marker\r\n"));
822 		break;
823 	case 0x0C:
824 		DPRINTF(("  autonomous power state transition\r\n"));
825 		break;
826 	default:
827 		WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature));
828 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
829 		return (1);
830 	}
831 
832 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
833 	return (1);
834 }
835 
836 static int
837 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
838 	struct nvme_completion* compl)
839 {
840 	DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__,
841 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
842 
843 	/* TODO: search for the command ID and abort it */
844 
845 	compl->cdw0 = 1;
846 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
847 	return (1);
848 }
849 
850 static int
851 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
852 	struct nvme_command* command, struct nvme_completion* compl)
853 {
854 	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));
855 
856 	/*
857 	 * TODO: raise events when they happen based on the Set Features cmd.
858 	 * These events happen asynchronously, so only post a successful
859 	 * completion once an event matching the request actually occurs.
860 	 */
861 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
862 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
863 	return (0);
864 }
865 
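/*
 * Consume Admin SQ entries up to the doorbell tail, posting a completion to
 * the Admin CQ for each command (async event requests are currently left
 * without a completion).
 */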
866 static void
867 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
868 {
869 	struct nvme_completion compl;
870 	struct nvme_command *cmd;
871 	struct nvme_submission_queue *sq;
872 	struct nvme_completion_queue *cq;
873 	int do_intr = 0;
874 	uint16_t sqhead;
875 
876 	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));
877 
878 	sq = &sc->submit_queues[0];
879 
880 	sqhead = atomic_load_acq_short(&sq->head);
881 
882 	if (atomic_testandset_int(&sq->busy, 1)) {
883 		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
884 		        __func__, sqhead, sq->tail));
885 		return;
886 	}
887 
888 	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));
889 
890 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
891 		cmd = &(sq->qbase)[sqhead];
892 		compl.status = 0;
893 
894 		switch (cmd->opc) {
895 		case NVME_OPC_DELETE_IO_SQ:
896 			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
897 			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
898 			break;
899 		case NVME_OPC_CREATE_IO_SQ:
900 			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
901 			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
902 			break;
903 		case NVME_OPC_DELETE_IO_CQ:
904 			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
905 			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
906 			break;
907 		case NVME_OPC_CREATE_IO_CQ:
908 			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
909 			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
910 			break;
911 		case NVME_OPC_GET_LOG_PAGE:
912 			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
913 			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
914 			break;
915 		case NVME_OPC_IDENTIFY:
916 			DPRINTF(("%s command IDENTIFY\r\n", __func__));
917 			do_intr |= nvme_opc_identify(sc, cmd, &compl);
918 			break;
919 		case NVME_OPC_ABORT:
920 			DPRINTF(("%s command ABORT\r\n", __func__));
921 			do_intr |= nvme_opc_abort(sc, cmd, &compl);
922 			break;
923 		case NVME_OPC_SET_FEATURES:
924 			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
925 			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
926 			break;
927 		case NVME_OPC_GET_FEATURES:
928 			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
929 			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
930 			break;
931 		case NVME_OPC_ASYNC_EVENT_REQUEST:
932 			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
933 			/* XXX don't care, unhandled for now
934 			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
935 			*/
936 			break;
937 		default:
938 			WPRINTF(("0x%x command is not implemented\r\n",
939 			    cmd->opc));
940 		}
941 
942 		/* for now skip async event generation */
943 		if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
944 			struct nvme_completion *cp;
945 			int phase;
946 
947 			cq = &sc->compl_queues[0];
948 
949 			cp = &(cq->qbase)[cq->tail];
950 			cp->sqid = 0;
951 			cp->sqhd = sqhead;
952 			cp->cid = cmd->cid;
953 
954 			phase = NVME_STATUS_GET_P(cp->status);
955 			cp->status = compl.status;
956 			pci_nvme_toggle_phase(&cp->status, phase);
957 
958 			cq->tail = (cq->tail + 1) % cq->size;
959 		}
960 		sqhead = (sqhead + 1) % sq->size;
961 	}
962 
963 	DPRINTF(("setting sqhead %u\r\n", sqhead));
964 	atomic_store_short(&sq->head, sqhead);
965 	atomic_store_int(&sq->busy, 0);
966 
967 	if (do_intr)
968 		pci_generate_msix(sc->nsc_pi, 0);
969 
970 }
971 
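/*
 * Append a guest physical data region to the request's iov list, merging it
 * with the previous entry when the regions are contiguous.  If the list
 * fills up (NVME_MAX_BLOCKIOVS), the accumulated I/O is issued as a partial
 * request and the list is restarted.  For a RAM-backed store the data is
 * copied immediately instead.
 */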
972 static int
973 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
974 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
975 {
976 	int iovidx;
977 
978 	if (req != NULL) {
979 		/* concatenate contig block-iovs to minimize number of iovs */
980 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
981 			iovidx = req->io_req.br_iovcnt - 1;
982 
983 			req->io_req.br_iov[iovidx].iov_base =
984 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
985 			                     req->prev_gpaddr, size);
986 
987 			req->prev_size += size;
988 			req->io_req.br_resid += size;
989 
990 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
991 		} else {
992 			pthread_mutex_lock(&req->mtx);
993 
994 			iovidx = req->io_req.br_iovcnt;
995 			if (iovidx == NVME_MAX_BLOCKIOVS) {
996 				int err = 0;
997 
998 				DPRINTF(("large I/O, doing partial req\r\n"));
999 
1000 				iovidx = 0;
1001 				req->io_req.br_iovcnt = 0;
1002 
1003 				req->io_req.br_callback = pci_nvme_io_partial;
1004 
1005 				if (!do_write)
1006 					err = blockif_read(sc->nvstore.ctx,
1007 					                   &req->io_req);
1008 				else
1009 					err = blockif_write(sc->nvstore.ctx,
1010 					                    &req->io_req);
1011 
1012 				/* wait until req completes before cont */
1013 				if (err == 0)
1014 					pthread_cond_wait(&req->cv, &req->mtx);
1015 			}
1016 			if (iovidx == 0) {
1017 				req->io_req.br_offset = lba;
1018 				req->io_req.br_resid = 0;
1019 				req->io_req.br_param = req;
1020 			}
1021 
1022 			req->io_req.br_iov[iovidx].iov_base =
1023 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1024 			                     gpaddr, size);
1025 
1026 			req->io_req.br_iov[iovidx].iov_len = size;
1027 
1028 			req->prev_gpaddr = gpaddr;
1029 			req->prev_size = size;
1030 			req->io_req.br_resid += size;
1031 
1032 			req->io_req.br_iovcnt++;
1033 
1034 			pthread_mutex_unlock(&req->mtx);
1035 		}
1036 	} else {
1037 		/* RAM buffer: read/write directly */
1038 		void *p = sc->nvstore.ctx;
1039 		void *gptr;
1040 
1041 		if ((lba + size) > sc->nvstore.size) {
1042 			WPRINTF(("%s I/O would overflow RAM backing store\r\n", __func__));
1043 			return (-1);
1044 		}
1045 
1046 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1047 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1048 		if (do_write)
1049 			memcpy(p, gptr, size);
1050 		else
1051 			memcpy(gptr, p, size);
1052 	}
1053 	return (0);
1054 }
1055 
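/*
 * Post a completion to the CQ paired with the given SQ: record sqhd, sqid
 * and cid, toggle the phase bit, advance the CQ tail, and generate MSI-X if
 * the queue has interrupts enabled and the SQ is not still being processed
 * (unless ignore_busy is set).
 */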
1056 static void
1057 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1058 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1059 	uint32_t cdw0, uint16_t status, int ignore_busy)
1060 {
1061 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1062 	struct nvme_completion *compl;
1063 	int do_intr = 0;
1064 	int phase;
1065 
1066 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
1067 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1068 		 NVME_STATUS_GET_SC(status)));
1069 
1070 	pthread_mutex_lock(&cq->mtx);
1071 
1072 	assert(cq->qbase != NULL);
1073 
1074 	compl = &cq->qbase[cq->tail];
1075 
1076 	compl->sqhd = atomic_load_acq_short(&sq->head);
1077 	compl->sqid = sqid;
1078 	compl->cid = cid;
1079 
1080 	// toggle phase
1081 	phase = NVME_STATUS_GET_P(compl->status);
1082 	compl->status = status;
1083 	pci_nvme_toggle_phase(&compl->status, phase);
1084 
1085 	cq->tail = (cq->tail + 1) % cq->size;
1086 
1087 	if (cq->intr_en & NVME_CQ_INTEN)
1088 		do_intr = 1;
1089 
1090 	pthread_mutex_unlock(&cq->mtx);
1091 
1092 	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
1093 		if (do_intr)
1094 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1095 }
1096 
1097 static void
1098 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1099 {
1100 	req->sc = NULL;
1101 	req->nvme_sq = NULL;
1102 	req->sqid = 0;
1103 
1104 	pthread_mutex_lock(&sc->mtx);
1105 
1106 	req->next = sc->ioreqs_free;
1107 	sc->ioreqs_free = req;
1108 	sc->pending_ios--;
1109 
1110 	/* when no more IO pending, can set to ready if device reset/enabled */
1111 	if (sc->pending_ios == 0 &&
1112 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1113 		sc->regs.csts |= NVME_CSTS_RDY;
1114 
1115 	pthread_mutex_unlock(&sc->mtx);
1116 
1117 	sem_post(&sc->iosemlock);
1118 }
1119 
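/* Take an ioreq from the free list, waiting on iosemlock for a free slot. */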
1120 static struct pci_nvme_ioreq *
1121 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1122 {
1123 	struct pci_nvme_ioreq *req = NULL;
1124 
1125 	sem_wait(&sc->iosemlock);
1126 	pthread_mutex_lock(&sc->mtx);
1127 
1128 	req = sc->ioreqs_free;
1129 	assert(req != NULL);
1130 
1131 	sc->ioreqs_free = req->next;
1132 
1133 	req->next = NULL;
1134 	req->sc = sc;
1135 
1136 	sc->pending_ios++;
1137 
1138 	pthread_mutex_unlock(&sc->mtx);
1139 
1140 	req->io_req.br_iovcnt = 0;
1141 	req->io_req.br_offset = 0;
1142 	req->io_req.br_resid = 0;
1143 	req->io_req.br_param = req;
1144 	req->prev_gpaddr = 0;
1145 	req->prev_size = 0;
1146 
1147 	return req;
1148 }
1149 
1150 static void
1151 pci_nvme_io_done(struct blockif_req *br, int err)
1152 {
1153 	struct pci_nvme_ioreq *req = br->br_param;
1154 	struct nvme_submission_queue *sq = req->nvme_sq;
1155 	uint16_t code, status;
1156 
1157 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1158 
1159 	/* TODO return correct error */
1160 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1161 	pci_nvme_status_genc(&status, code);
1162 
1163 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1164 	pci_nvme_release_ioreq(req->sc, req);
1165 }
1166 
1167 static void
1168 pci_nvme_io_partial(struct blockif_req *br, int err)
1169 {
1170 	struct pci_nvme_ioreq *req = br->br_param;
1171 
1172 	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));
1173 
1174 	pthread_cond_signal(&req->cv);
1175 }
1176 
1177 
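/*
 * Process new I/O SQ entries up to the doorbell tail.  PRP1 covers the first
 * data page; PRP2 is either the second page (for transfers spanning at most
 * two pages) or the guest address of a PRP list, whose last slot may chain
 * to a further list for larger transfers.
 */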
1178 static void
1179 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1180 {
1181 	struct nvme_submission_queue *sq;
1182 	uint16_t status;
1183 	uint16_t sqhead;
1184 	int err;
1185 
1186 	/* handle all submissions up to sq->tail index */
1187 	sq = &sc->submit_queues[idx];
1188 
1189 	if (atomic_testandset_int(&sq->busy, 1)) {
1190 		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
1191 		return;
1192 	}
1193 
1194 	sqhead = atomic_load_acq_short(&sq->head);
1195 
1196 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
1197 	         idx, sqhead, sq->tail, sq->qbase));
1198 
1199 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1200 		struct nvme_command *cmd;
1201 		struct pci_nvme_ioreq *req = NULL;
1202 		uint64_t lba;
1203 		uint64_t nblocks, bytes, size, cpsz;
1204 
1205 		/* TODO: support scatter gather list handling */
1206 
1207 		cmd = &sq->qbase[sqhead];
1208 		sqhead = (sqhead + 1) % sq->size;
1209 
1210 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1211 
1212 		if (cmd->opc == NVME_OPC_FLUSH) {
1213 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1214 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1215 			                        status, 1);
1216 
1217 			continue;
1218 		} else if (cmd->opc == 0x08) {
1219 			/* TODO: write zeroes */
1220 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
1221 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1222 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1223 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1224 			                        status, 1);
1225 
1226 			continue;
1227 		}
1228 
1229 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1230 
1231 		bytes = nblocks * sc->nvstore.sectsz;
1232 
1233 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1234 			req = pci_nvme_get_ioreq(sc);
1235 			req->nvme_sq = sq;
1236 			req->sqid = idx;
1237 		}
1238 
1239 		/*
1240 		 * If data starts mid-page and flows into the next page, then
1241 		 * increase page count
1242 		 */
1243 
1244 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1245 		         "(%lu-bytes)\r\n",
1246 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1247 		         cmd->opc == NVME_OPC_WRITE ?
1248 			     "WRITE" : "READ",
1249 		         lba, nblocks, bytes));
1250 
1251 		cmd->prp1 &= ~(0x03UL);
1252 		cmd->prp2 &= ~(0x03UL);
1253 
1254 		DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2));
1255 
1256 		size = bytes;
1257 		lba *= sc->nvstore.sectsz;
1258 
1259 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1260 
1261 		if (cpsz > bytes)
1262 			cpsz = bytes;
1263 
1264 		if (req != NULL) {
1265 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1266 			                        cmd->cdw10;
1267 			req->opc = cmd->opc;
1268 			req->cid = cmd->cid;
1269 			req->nsid = cmd->nsid;
1270 		}
1271 
1272 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1273 		    cmd->opc == NVME_OPC_WRITE, lba);
1274 		lba += cpsz;
1275 		size -= cpsz;
1276 
1277 		if (size == 0)
1278 			goto iodone;
1279 
1280 		if (size <= PAGE_SIZE) {
1281 			/* prp2 is second (and final) page in transfer */
1282 
1283 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1284 			    size,
1285 			    cmd->opc == NVME_OPC_WRITE,
1286 			    lba);
1287 		} else {
1288 			uint64_t *prp_list;
1289 			int i;
1290 
1291 			/* prp2 is pointer to a physical region page list */
1292 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1293 			                            cmd->prp2, PAGE_SIZE);
1294 
1295 			i = 0;
1296 			while (size != 0) {
1297 				cpsz = MIN(size, PAGE_SIZE);
1298 
1299 				/*
1300 				 * Move to linked physical region page list
1301 				 * in last item.
1302 				 */
1303 				if (i == (NVME_PRP2_ITEMS-1) &&
1304 				    size > PAGE_SIZE) {
1305 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1306 					prp_list = paddr_guest2host(
1307 					              sc->nsc_pi->pi_vmctx,
1308 					              prp_list[i], PAGE_SIZE);
1309 					i = 0;
1310 				}
1311 				if (prp_list[i] == 0) {
1312 					WPRINTF(("PRP2[%d] = 0 !!!\r\n", i));
1313 					err = 1;
1314 					break;
1315 				}
1316 
1317 				err = pci_nvme_append_iov_req(sc, req,
1318 				    prp_list[i], cpsz,
1319 				    cmd->opc == NVME_OPC_WRITE, lba);
1320 				if (err)
1321 					break;
1322 
1323 				lba += cpsz;
1324 				size -= cpsz;
1325 				i++;
1326 			}
1327 		}
1328 
1329 iodone:
1330 		if (sc->nvstore.type == NVME_STOR_RAM) {
1331 			uint16_t code, status;
1332 
1333 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1334 			    NVME_SC_SUCCESS;
1335 			pci_nvme_status_genc(&status, code);
1336 
1337 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1338 			                        status, 1);
1339 
1340 			continue;
1341 		}
1342 
1343 
1344 		if (err)
1345 			goto do_error;
1346 
1347 		req->io_req.br_callback = pci_nvme_io_done;
1348 
1349 		err = 0;
1350 		switch (cmd->opc) {
1351 		case NVME_OPC_READ:
1352 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1353 			break;
1354 		case NVME_OPC_WRITE:
1355 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1356 			break;
1357 		default:
1358 			WPRINTF(("%s unhandled io command 0x%x\r\n",
1359 				 __func__, cmd->opc));
1360 			err = 1;
1361 		}
1362 
1363 do_error:
1364 		if (err) {
1365 			uint16_t status;
1366 
1367 			pci_nvme_status_genc(&status,
1368 			    NVME_SC_DATA_TRANSFER_ERROR);
1369 
1370 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1371 			                        status, 1);
1372 			pci_nvme_release_ioreq(sc, req);
1373 		}
1374 	}
1375 
1376 	atomic_store_short(&sq->head, sqhead);
1377 	atomic_store_int(&sq->busy, 0);
1378 }
1379 
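/*
 * Doorbell write: an SQ doorbell records the new tail and kicks the admin or
 * I/O command handler; a CQ doorbell records the new head.
 */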
1380 static void
1381 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1382 	uint64_t idx, int is_sq, uint64_t value)
1383 {
1384 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n",
1385 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1386 
1387 	if (is_sq) {
1388 		atomic_store_short(&sc->submit_queues[idx].tail,
1389 		                   (uint16_t)value);
1390 
1391 		if (idx == 0) {
1392 			pci_nvme_handle_admin_cmd(sc, value);
1393 		} else {
1394 			/* submission queue; handle new entries in SQ */
1395 			if (idx > sc->num_squeues) {
1396 				WPRINTF(("%s SQ index %lu overflow from "
1397 				         "guest (max %u)\r\n",
1398 				         __func__, idx, sc->num_squeues));
1399 				return;
1400 			}
1401 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1402 		}
1403 	} else {
1404 		if (idx > sc->num_cqueues) {
1405 			WPRINTF(("%s queue index %lu overflow from "
1406 			         "guest (max %u)\r\n",
1407 			         __func__, idx, sc->num_cqueues));
1408 			return;
1409 		}
1410 
1411 		sc->compl_queues[idx].head = (uint16_t)value;
1412 	}
1413 }
1414 
1415 static void
1416 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1417 {
1418 	const char *s = iswrite ? "WRITE" : "READ";
1419 
1420 	switch (offset) {
1421 	case NVME_CR_CAP_LOW:
1422 		DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s));
1423 		break;
1424 	case NVME_CR_CAP_HI:
1425 		DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s));
1426 		break;
1427 	case NVME_CR_VS:
1428 		DPRINTF(("%s %s NVME_CR_VS\r\n", func, s));
1429 		break;
1430 	case NVME_CR_INTMS:
1431 		DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s));
1432 		break;
1433 	case NVME_CR_INTMC:
1434 		DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s));
1435 		break;
1436 	case NVME_CR_CC:
1437 		DPRINTF(("%s %s NVME_CR_CC\r\n", func, s));
1438 		break;
1439 	case NVME_CR_CSTS:
1440 		DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s));
1441 		break;
1442 	case NVME_CR_NSSR:
1443 		DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s));
1444 		break;
1445 	case NVME_CR_AQA:
1446 		DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s));
1447 		break;
1448 	case NVME_CR_ASQ_LOW:
1449 		DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s));
1450 		break;
1451 	case NVME_CR_ASQ_HI:
1452 		DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s));
1453 		break;
1454 	case NVME_CR_ACQ_LOW:
1455 		DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s));
1456 		break;
1457 	case NVME_CR_ACQ_HI:
1458 		DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s));
1459 		break;
1460 	default:
1461 		DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset));
1462 	}
1463 
1464 }
1465 
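/*
 * BAR0 write handler: writes at or above the doorbell region are dispatched
 * to pci_nvme_handle_doorbell(); everything else is treated as a 4-byte
 * register write.
 */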
1466 static void
1467 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1468 	uint64_t offset, int size, uint64_t value)
1469 {
1470 	uint32_t ccreg;
1471 
1472 	if (offset >= NVME_DOORBELL_OFFSET) {
1473 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1474 		uint64_t idx = belloffset / 8; /* 2 x 4-byte doorbells (SQ tail, CQ head) per queue */
1475 		int is_sq = (belloffset % 8) < 4;
1476 
1477 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1478 			WPRINTF(("guest attempted an overflow write offset "
1479 			         "0x%lx, val 0x%lx in %s",
1480 			         offset, value, __func__));
1481 			return;
1482 		}
1483 
1484 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1485 		return;
1486 	}
1487 
1488 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n",
1489 	        offset, size, value));
1490 
1491 	if (size != 4) {
1492 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1493 		         "val 0x%lx) to bar0 in %s",
1494 		         size, offset, value, __func__));
1495 		/* TODO: shutdown device */
1496 		return;
1497 	}
1498 
1499 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1500 
1501 	pthread_mutex_lock(&sc->mtx);
1502 
1503 	switch (offset) {
1504 	case NVME_CR_CAP_LOW:
1505 	case NVME_CR_CAP_HI:
1506 		/* readonly */
1507 		break;
1508 	case NVME_CR_VS:
1509 		/* readonly */
1510 		break;
1511 	case NVME_CR_INTMS:
1512 		/* MSI-X, so ignore */
1513 		break;
1514 	case NVME_CR_INTMC:
1515 		/* MSI-X, so ignore */
1516 		break;
1517 	case NVME_CR_CC:
1518 		ccreg = (uint32_t)value;
1519 
1520 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1521 		         "iocqes %u\r\n",
1522 		        __func__,
1523 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1524 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1525 			 NVME_CC_GET_IOCQES(ccreg)));
1526 
1527 		if (NVME_CC_GET_SHN(ccreg)) {
1528 			/* perform shutdown - flush out data to backend */
1529 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1530 			    NVME_CSTS_REG_SHST_SHIFT);
1531 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1532 			    NVME_CSTS_REG_SHST_SHIFT;
1533 		}
1534 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1535 			if (NVME_CC_GET_EN(ccreg) == 0)
1536 				/* transition 1->0 causes controller reset */
1537 				pci_nvme_reset_locked(sc);
1538 			else
1539 				pci_nvme_init_controller(ctx, sc);
1540 		}
1541 
1542 		/* Insert the iocqes, iosqes and en bits from the write */
1543 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1544 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1545 		if (NVME_CC_GET_EN(ccreg) == 0) {
1546 			/* Insert the ams, mps and css bit fields */
1547 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1548 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1549 			sc->regs.csts &= ~NVME_CSTS_RDY;
1550 		} else if (sc->pending_ios == 0) {
1551 			sc->regs.csts |= NVME_CSTS_RDY;
1552 		}
1553 		break;
1554 	case NVME_CR_CSTS:
1555 		break;
1556 	case NVME_CR_NSSR:
1557 		/* ignore writes; don't support subsystem reset */
1558 		break;
1559 	case NVME_CR_AQA:
1560 		sc->regs.aqa = (uint32_t)value;
1561 		break;
1562 	case NVME_CR_ASQ_LOW:
1563 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1564 		               (0xFFFFF000 & value);
1565 		break;
1566 	case NVME_CR_ASQ_HI:
1567 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1568 		               (value << 32);
1569 		break;
1570 	case NVME_CR_ACQ_LOW:
1571 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1572 		               (0xFFFFF000 & value);
1573 		break;
1574 	case NVME_CR_ACQ_HI:
1575 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1576 		               (value << 32);
1577 		break;
1578 	default:
1579 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
1580 		         __func__, offset, value, size));
1581 	}
1582 	pthread_mutex_unlock(&sc->mtx);
1583 }
1584 
1585 static void
1586 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1587                 int baridx, uint64_t offset, int size, uint64_t value)
1588 {
1589 	struct pci_nvme_softc* sc = pi->pi_arg;
1590 
1591 	if (baridx == pci_msix_table_bar(pi) ||
1592 	    baridx == pci_msix_pba_bar(pi)) {
1593 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1594 		         " value 0x%lx\r\n", baridx, offset, size, value));
1595 
1596 		pci_emul_msix_twrite(pi, offset, size, value);
1597 		return;
1598 	}
1599 
1600 	switch (baridx) {
1601 	case 0:
1602 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1603 		break;
1604 
1605 	default:
1606 		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
1607 		         __func__, baridx, value));
1608 	}
1609 }
1610 
1611 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
1612 	uint64_t offset, int size)
1613 {
1614 	uint64_t value;
1615 
1616 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1617 
1618 	if (offset < NVME_DOORBELL_OFFSET) {
1619 		void *p = &(sc->regs);
1620 		pthread_mutex_lock(&sc->mtx);
1621 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1622 		pthread_mutex_unlock(&sc->mtx);
1623 	} else {
1624 		value = 0;
1625 		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
1626 	}
1627 
1628 	switch (size) {
1629 	case 1:
1630 		value &= 0xFF;
1631 		break;
1632 	case 2:
1633 		value &= 0xFFFF;
1634 		break;
1635 	case 4:
1636 		value &= 0xFFFFFFFF;
1637 		break;
1638 	}
1639 
1640 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x\r\n",
1641 	         offset, size, (uint32_t)value));
1642 
1643 	return (value);
1644 }
1645 
1646 
1647 
1648 static uint64_t
1649 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
1650     uint64_t offset, int size)
1651 {
1652 	struct pci_nvme_softc* sc = pi->pi_arg;
1653 
1654 	if (baridx == pci_msix_table_bar(pi) ||
1655 	    baridx == pci_msix_pba_bar(pi)) {
1656 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n",
1657 		        baridx, offset, size));
1658 
1659 		return pci_emul_msix_tread(pi, offset, size);
1660 	}
1661 
1662 	switch (baridx) {
1663 	case 0:
1664 		return pci_nvme_read_bar_0(sc, offset, size);
1665 
1666 	default:
1667 		DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset));
1668 	}
1669 
1670 	return (0);
1671 }
1672 
1673 
1674 static int
1675 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
1676 {
1677 	char bident[sizeof("XX:X:X")];
1678 	char	*uopt, *xopts, *config;
1679 	uint32_t sectsz;
1680 	int optidx;
1681 
1682 	sc->max_queues = NVME_QUEUES;
1683 	sc->max_qentries = NVME_MAX_QENTRIES;
1684 	sc->ioslots = NVME_IOSLOTS;
1685 	sc->num_squeues = sc->max_queues;
1686 	sc->num_cqueues = sc->max_queues;
1687 	sectsz = 0;
1688 
1689 	uopt = strdup(opts);
1690 	optidx = 0;
1691 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
1692 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1693 	for (xopts = strtok(uopt, ",");
1694 	     xopts != NULL;
1695 	     xopts = strtok(NULL, ",")) {
1696 
1697 		if ((config = strchr(xopts, '=')) != NULL)
1698 			*config++ = '\0';
1699 
1700 		if (!strcmp("maxq", xopts)) {
1701 			sc->max_queues = atoi(config);
1702 		} else if (!strcmp("qsz", xopts)) {
1703 			sc->max_qentries = atoi(config);
1704 		} else if (!strcmp("ioslots", xopts)) {
1705 			sc->ioslots = atoi(config);
1706 		} else if (!strcmp("sectsz", xopts)) {
1707 			sectsz = atoi(config);
1708 		} else if (!strcmp("ser", xopts)) {
1709 			/*
1710 			 * This field indicates the Product Serial Number in
1711 			 * 7-bit ASCII, unused bytes should be space characters.
1712 			 * Ref: NVMe v1.3c.
1713 			 */
1714 			cpywithpad((char *)sc->ctrldata.sn,
1715 			           sizeof(sc->ctrldata.sn), config, ' ');
1716 		} else if (!strcmp("ram", xopts)) {
1717 			uint64_t sz = strtoull(&xopts[4], NULL, 10);
1718 
1719 			sc->nvstore.type = NVME_STOR_RAM;
1720 			sc->nvstore.size = sz * 1024 * 1024;
1721 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1722 			sc->nvstore.sectsz = 4096;
1723 			sc->nvstore.sectsz_bits = 12;
1724 			if (sc->nvstore.ctx == NULL) {
1725 				perror("Unable to allocate RAM");
1726 				free(uopt);
1727 				return (-1);
1728 			}
1729 		} else if (optidx == 0) {
1730 			snprintf(bident, sizeof(bident), "%d:%d",
1731 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
1732 			sc->nvstore.ctx = blockif_open(xopts, bident);
1733 			if (sc->nvstore.ctx == NULL) {
1734 				perror("Could not open backing file");
1735 				free(uopt);
1736 				return (-1);
1737 			}
1738 			sc->nvstore.type = NVME_STOR_BLOCKIF;
1739 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
1740 		} else {
1741 			fprintf(stderr, "Invalid option %s\n", xopts);
1742 			free(uopt);
1743 			return (-1);
1744 		}
1745 
1746 		optidx++;
1747 	}
1748 	free(uopt);
1749 
1750 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
1751 		fprintf(stderr, "backing store not specified\n");
1752 		return (-1);
1753 	}
1754 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
1755 		sc->nvstore.sectsz = sectsz;
1756 	else if (sc->nvstore.type != NVME_STOR_RAM)
1757 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
1758 	for (sc->nvstore.sectsz_bits = 9;
1759 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
1760 	     sc->nvstore.sectsz_bits++);
1761 
1762 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
1763 		sc->max_queues = NVME_QUEUES;
1764 
1765 	if (sc->max_qentries <= 0) {
1766 		fprintf(stderr, "Invalid qsz option\n");
1767 		return (-1);
1768 	}
1769 	if (sc->ioslots <= 0) {
1770 		fprintf(stderr, "Invalid ioslots option\n");
1771 		return (-1);
1772 	}
1773 
1774 	return (0);
1775 }
1776 
1777 static int
1778 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
1779 {
1780 	struct pci_nvme_softc *sc;
1781 	uint32_t pci_membar_sz;
1782 	int	error;
1783 
1784 	error = 0;
1785 
1786 	sc = calloc(1, sizeof(struct pci_nvme_softc));
1787 	pi->pi_arg = sc;
1788 	sc->nsc_pi = pi;
1789 
1790 	error = pci_nvme_parse_opts(sc, opts);
1791 	if (error < 0)
1792 		goto done;
1793 	else
1794 		error = 0;
1795 
1796 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
1797 	for (int i = 0; i < sc->ioslots; i++) {
1798 		if (i < (sc->ioslots-1))
1799 			sc->ioreqs[i].next = &sc->ioreqs[i+1];
1800 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
1801 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
1802 	}
1803 	sc->ioreqs_free = sc->ioreqs;
1804 	sc->intr_coales_aggr_thresh = 1;
1805 
1806 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
1807 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
1808 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
1809 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
1810 	pci_set_cfgdata8(pi, PCIR_PROGIF,
1811 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
1812 
1813 	/* allocate size of nvme registers + doorbell space for all queues */
1814 	pci_membar_sz = sizeof(struct nvme_registers) +
1815 	                2*sizeof(uint32_t)*(sc->max_queues);
1816 
1817 	DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz));
1818 
1819 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
1820 	if (error) {
1821 		WPRINTF(("%s pci alloc mem bar failed\r\n", __func__));
1822 		goto done;
1823 	}
1824 
1825 	error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR);
1826 	if (error) {
1827 		WPRINTF(("%s pci add msixcap failed\r\n", __func__));
1828 		goto done;
1829 	}
1830 
1831 	pthread_mutex_init(&sc->mtx, NULL);
1832 	sem_init(&sc->iosemlock, 0, sc->ioslots);
1833 
1834 	pci_nvme_reset(sc);
1835 	pci_nvme_init_ctrldata(sc);
1836 	pci_nvme_init_nsdata(sc);
1837 
1838 	pci_lintr_request(pi);
1839 
1840 done:
1841 	return (error);
1842 }
1843 
1844 
1845 struct pci_devemu pci_de_nvme = {
1846 	.pe_emu =	"nvme",
1847 	.pe_init =	pci_nvme_init,
1848 	.pe_barwrite =	pci_nvme_write,
1849 	.pe_barread =	pci_nvme_read
1850 };
1851 PCI_EMUL_SET(pci_de_nvme);
1852