1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * 7 * Function crc16 Copyright (c) 2017, Fedor Uporov 8 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * bhyve PCIe-NVMe device emulation. 34 * 35 * options: 36 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=# 37 * 38 * accepted devpath: 39 * /dev/blockdev 40 * /path/to/image 41 * ram=size_in_MiB 42 * 43 * maxq = max number of queues 44 * qsz = max elements in each queue 45 * ioslots = max number of concurrent io requests 46 * sectsz = sector size (defaults to blockif sector size) 47 * ser = serial number (20-chars max) 48 * eui64 = IEEE Extended Unique Identifier (8 byte value) 49 * 50 */ 51 52 /* TODO: 53 - create async event for smart and log 54 - intr coalesce 55 */ 56 57 #include <sys/cdefs.h> 58 __FBSDID("$FreeBSD$"); 59 60 #include <sys/types.h> 61 #include <net/ieee_oui.h> 62 63 #include <assert.h> 64 #include <pthread.h> 65 #include <semaphore.h> 66 #include <stdbool.h> 67 #include <stddef.h> 68 #include <stdint.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <string.h> 72 73 #include <machine/atomic.h> 74 #include <machine/vmm.h> 75 #include <vmmapi.h> 76 77 #include <dev/nvme/nvme.h> 78 79 #include "bhyverun.h" 80 #include "block_if.h" 81 #include "debug.h" 82 #include "pci_emul.h" 83 84 85 static int nvme_debug = 0; 86 #define DPRINTF(params) if (nvme_debug) PRINTLN params 87 #define WPRINTF(params) PRINTLN params 88 89 /* defaults; can be overridden */ 90 #define NVME_MSIX_BAR 4 91 92 #define NVME_IOSLOTS 8 93 94 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 95 #define NVME_MMIO_SPACE_MIN (1 << 14) 96 97 #define NVME_QUEUES 16 98 #define NVME_MAX_QENTRIES 2048 99 100 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 101 #define NVME_MAX_BLOCKIOVS 512 102 103 /* This is a synthetic status code to indicate there is no status */ 104 #define NVME_NO_STATUS 0xffff 105 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 106 107 /* helpers */ 108 109 /* Convert a zero-based value into a one-based value */ 110 #define ONE_BASED(zero) ((zero) + 1) 111 /* Convert a 
one-based value into a zero-based value */ 112 #define ZERO_BASED(one) ((one) - 1) 113 114 /* Encode number of SQ's and CQ's for Set/Get Features */ 115 #define NVME_FEATURE_NUM_QUEUES(sc) \ 116 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 117 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 118 119 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 120 121 enum nvme_controller_register_offsets { 122 NVME_CR_CAP_LOW = 0x00, 123 NVME_CR_CAP_HI = 0x04, 124 NVME_CR_VS = 0x08, 125 NVME_CR_INTMS = 0x0c, 126 NVME_CR_INTMC = 0x10, 127 NVME_CR_CC = 0x14, 128 NVME_CR_CSTS = 0x1c, 129 NVME_CR_NSSR = 0x20, 130 NVME_CR_AQA = 0x24, 131 NVME_CR_ASQ_LOW = 0x28, 132 NVME_CR_ASQ_HI = 0x2c, 133 NVME_CR_ACQ_LOW = 0x30, 134 NVME_CR_ACQ_HI = 0x34, 135 }; 136 137 enum nvme_cmd_cdw11 { 138 NVME_CMD_CDW11_PC = 0x0001, 139 NVME_CMD_CDW11_IEN = 0x0002, 140 NVME_CMD_CDW11_IV = 0xFFFF0000, 141 }; 142 143 #define NVME_CQ_INTEN 0x01 144 #define NVME_CQ_INTCOAL 0x02 145 146 struct nvme_completion_queue { 147 struct nvme_completion *qbase; 148 uint32_t size; 149 uint16_t tail; /* nvme progress */ 150 uint16_t head; /* guest progress */ 151 uint16_t intr_vec; 152 uint32_t intr_en; 153 pthread_mutex_t mtx; 154 }; 155 156 struct nvme_submission_queue { 157 struct nvme_command *qbase; 158 uint32_t size; 159 uint16_t head; /* nvme progress */ 160 uint16_t tail; /* guest progress */ 161 uint16_t cqid; /* completion queue id */ 162 int busy; /* queue is being processed */ 163 int qpriority; 164 }; 165 166 enum nvme_storage_type { 167 NVME_STOR_BLOCKIF = 0, 168 NVME_STOR_RAM = 1, 169 }; 170 171 struct pci_nvme_blockstore { 172 enum nvme_storage_type type; 173 void *ctx; 174 uint64_t size; 175 uint32_t sectsz; 176 uint32_t sectsz_bits; 177 uint64_t eui64; 178 }; 179 180 struct pci_nvme_ioreq { 181 struct pci_nvme_softc *sc; 182 struct pci_nvme_ioreq *next; 183 struct nvme_submission_queue *nvme_sq; 184 uint16_t sqid; 185 186 /* command information */ 187 uint16_t opc; 188 uint16_t cid; 189 uint32_t nsid; 190 191 uint64_t prev_gpaddr; 192 size_t prev_size; 193 194 /* 195 * lock if all iovs consumed (big IO); 196 * complete transaction before continuing 197 */ 198 pthread_mutex_t mtx; 199 pthread_cond_t cv; 200 201 struct blockif_req io_req; 202 203 /* pad to fit up to 512 page descriptors from guest IO request */ 204 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; 205 }; 206 207 struct pci_nvme_softc { 208 struct pci_devinst *nsc_pi; 209 210 pthread_mutex_t mtx; 211 212 struct nvme_registers regs; 213 214 struct nvme_namespace_data nsdata; 215 struct nvme_controller_data ctrldata; 216 struct nvme_error_information_entry err_log; 217 struct nvme_health_information_page health_log; 218 struct nvme_firmware_page fw_log; 219 220 struct pci_nvme_blockstore nvstore; 221 222 uint16_t max_qentries; /* max entries per queue */ 223 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 224 uint32_t num_cqueues; 225 uint32_t num_squeues; 226 227 struct pci_nvme_ioreq *ioreqs; 228 struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ 229 uint32_t pending_ios; 230 uint32_t ioslots; 231 sem_t iosemlock; 232 233 /* 234 * Memory mapped Submission and Completion queues 235 * Each array includes both Admin and IO queues 236 */ 237 struct nvme_completion_queue *compl_queues; 238 struct nvme_submission_queue *submit_queues; 239 240 /* controller features */ 241 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ 242 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ 243 uint32_t 
async_ev_config; /* 0x0B: async event config */ 244 }; 245 246 247 static void pci_nvme_io_partial(struct blockif_req *br, int err); 248 249 /* Controller Configuration utils */ 250 #define NVME_CC_GET_EN(cc) \ 251 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 252 #define NVME_CC_GET_CSS(cc) \ 253 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 254 #define NVME_CC_GET_SHN(cc) \ 255 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 256 #define NVME_CC_GET_IOSQES(cc) \ 257 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 258 #define NVME_CC_GET_IOCQES(cc) \ 259 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 260 261 #define NVME_CC_WRITE_MASK \ 262 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 263 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 264 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 265 266 #define NVME_CC_NEN_WRITE_MASK \ 267 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 268 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 269 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 270 271 /* Controller Status utils */ 272 #define NVME_CSTS_GET_RDY(sts) \ 273 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 274 275 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 276 277 /* Completion Queue status word utils */ 278 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 279 #define NVME_STATUS_MASK \ 280 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 281 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 282 283 static __inline void 284 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 285 { 286 size_t len; 287 288 len = strnlen(src, dst_size); 289 memset(dst, pad, dst_size); 290 memcpy(dst, src, len); 291 } 292 293 static __inline void 294 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 295 { 296 297 *status &= ~NVME_STATUS_MASK; 298 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 299 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 300 } 301 302 static __inline void 303 pci_nvme_status_genc(uint16_t *status, uint16_t code) 304 { 305 306 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 307 } 308 309 static __inline void 310 pci_nvme_toggle_phase(uint16_t *status, int prev) 311 { 312 313 if (prev) 314 *status &= ~NVME_STATUS_P; 315 else 316 *status |= NVME_STATUS_P; 317 } 318 319 static void 320 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 321 { 322 struct nvme_controller_data *cd = &sc->ctrldata; 323 324 cd->vid = 0xFB5D; 325 cd->ssvid = 0x0000; 326 327 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 328 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 329 330 /* Num of submission commands that we can handle at a time (2^rab) */ 331 cd->rab = 4; 332 333 /* FreeBSD OUI */ 334 cd->ieee[0] = 0x58; 335 cd->ieee[1] = 0x9c; 336 cd->ieee[2] = 0xfc; 337 338 cd->mic = 0; 339 340 cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 341 342 cd->ver = 0x00010300; 343 344 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 345 cd->acl = 2; 346 cd->aerl = 4; 347 348 cd->lpa = 0; /* TODO: support some simple things like SMART */ 349 cd->elpe = 0; /* max error log page entries */ 350 cd->npss = 1; /* number of power states support */ 351 352 /* Warning Composite Temperature Threshold */ 353 cd->wctemp = 0x0157; 354 355 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 356 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 357 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 358 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 359 
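	/*
	 * SQES/CQES above are log2 encodings of the fixed NVMe entry sizes:
	 * 2^6 = 64-byte submission queue entries and 2^4 = 16-byte
	 * completion queue entries.
	 */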
cd->nn = 1; /* number of namespaces */ 360 361 cd->fna = 0x03; 362 363 cd->power_state[0].mp = 10; 364 } 365 366 /* 367 * Calculate the CRC-16 of the given buffer 368 * See copyright attribution at top of file 369 */ 370 static uint16_t 371 crc16(uint16_t crc, const void *buffer, unsigned int len) 372 { 373 const unsigned char *cp = buffer; 374 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 375 static uint16_t const crc16_table[256] = { 376 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 377 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 378 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 379 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 380 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 381 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 382 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 383 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 384 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 385 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 386 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 387 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 388 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 389 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 390 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 391 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 392 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 393 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 394 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 395 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 396 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 397 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 398 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 399 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 400 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 401 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 402 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 403 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 404 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 405 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 406 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 407 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 408 }; 409 410 while (len--) 411 crc = (((crc >> 8) & 0xffU) ^ 412 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 413 return crc; 414 } 415 416 static void 417 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 418 struct nvme_namespace_data *nd, uint32_t nsid, 419 uint64_t eui64) 420 { 421 422 nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; 423 nd->ncap = nd->nsze; 424 nd->nuse = nd->nsze; 425 426 /* Get LBA and backstore information from backing store */ 427 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ 428 nd->flbas = 0; 429 430 /* Create an EUI-64 if user did not provide one */ 431 if (eui64 == 0) { 432 char *data = NULL; 433 434 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, 435 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 436 437 if (data != NULL) { 438 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 439 free(data); 440 } 441 eui64 = (eui64 << 16) | (nsid & 0xffff); 442 } 443 be64enc(nd->eui64, eui64); 444 445 /* LBA data-sz = 2^lbads */ 446 nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 447 } 448 449 static void 450 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 451 { 452 453 memset(&sc->err_log, 0, sizeof(sc->err_log)); 454 memset(&sc->health_log, 0, sizeof(sc->health_log)); 455 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 456 } 457 458 static void 459 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 460 { 461 DPRINTF(("%s", __func__)); 462 463 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 464 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 465 (60 << NVME_CAP_LO_REG_TO_SHIFT); 466 467 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 468 469 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 470 471 sc->regs.cc = 0; 472 sc->regs.csts = 0; 473 474 sc->num_cqueues = sc->num_squeues = sc->max_queues; 475 if (sc->submit_queues != NULL) { 476 for (int i = 0; i < sc->num_squeues + 1; i++) { 477 /* 478 * The Admin Submission Queue is at index 0. 479 * It must not be changed at reset otherwise the 480 * emulation will be out of sync with the guest. 481 */ 482 if (i != 0) { 483 sc->submit_queues[i].qbase = NULL; 484 sc->submit_queues[i].size = 0; 485 sc->submit_queues[i].cqid = 0; 486 } 487 sc->submit_queues[i].tail = 0; 488 sc->submit_queues[i].head = 0; 489 sc->submit_queues[i].busy = 0; 490 } 491 } else 492 sc->submit_queues = calloc(sc->num_squeues + 1, 493 sizeof(struct nvme_submission_queue)); 494 495 if (sc->compl_queues != NULL) { 496 for (int i = 0; i < sc->num_cqueues + 1; i++) { 497 /* See Admin Submission Queue note above */ 498 if (i != 0) { 499 sc->compl_queues[i].qbase = NULL; 500 sc->compl_queues[i].size = 0; 501 } 502 503 sc->compl_queues[i].tail = 0; 504 sc->compl_queues[i].head = 0; 505 } 506 } else { 507 sc->compl_queues = calloc(sc->num_cqueues + 1, 508 sizeof(struct nvme_completion_queue)); 509 510 for (int i = 0; i < sc->num_cqueues + 1; i++) 511 pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); 512 } 513 } 514 515 static void 516 pci_nvme_reset(struct pci_nvme_softc *sc) 517 { 518 pthread_mutex_lock(&sc->mtx); 519 pci_nvme_reset_locked(sc); 520 pthread_mutex_unlock(&sc->mtx); 521 } 522 523 static void 524 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 525 { 526 uint16_t acqs, asqs; 527 528 DPRINTF(("%s", __func__)); 529 530 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 531 sc->submit_queues[0].size = asqs; 532 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 533 sizeof(struct nvme_command) * asqs); 534 535 DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p", 536 __func__, sc->regs.asq, sc->submit_queues[0].qbase)); 537 538 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 539 NVME_AQA_REG_ACQS_MASK) + 1; 540 sc->compl_queues[0].size = acqs; 541 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 542 sizeof(struct nvme_completion) * acqs); 543 DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p", 544 __func__, sc->regs.acq, sc->compl_queues[0].qbase)); 545 } 546 547 static int 548 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t 
prp2, uint8_t *src, 549 size_t len) 550 { 551 uint8_t *dst; 552 size_t bytes; 553 554 if (len > (8 * 1024)) { 555 return (-1); 556 } 557 558 /* Copy from the start of prp1 to the end of the physical page */ 559 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 560 bytes = MIN(bytes, len); 561 562 dst = vm_map_gpa(ctx, prp1, bytes); 563 if (dst == NULL) { 564 return (-1); 565 } 566 567 memcpy(dst, src, bytes); 568 569 src += bytes; 570 571 len -= bytes; 572 if (len == 0) { 573 return (0); 574 } 575 576 len = MIN(len, PAGE_SIZE); 577 578 dst = vm_map_gpa(ctx, prp2, len); 579 if (dst == NULL) { 580 return (-1); 581 } 582 583 memcpy(dst, src, len); 584 585 return (0); 586 } 587 588 static int 589 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 590 struct nvme_completion* compl) 591 { 592 uint16_t qid = command->cdw10 & 0xffff; 593 594 DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid)); 595 if (qid == 0 || qid > sc->num_squeues) { 596 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u", 597 __func__, qid, sc->num_squeues)); 598 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 599 NVME_SC_INVALID_QUEUE_IDENTIFIER); 600 return (1); 601 } 602 603 sc->submit_queues[qid].qbase = NULL; 604 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 605 return (1); 606 } 607 608 static int 609 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 610 struct nvme_completion* compl) 611 { 612 if (command->cdw11 & NVME_CMD_CDW11_PC) { 613 uint16_t qid = command->cdw10 & 0xffff; 614 struct nvme_submission_queue *nsq; 615 616 if ((qid == 0) || (qid > sc->num_squeues)) { 617 WPRINTF(("%s queue index %u > num_squeues %u", 618 __func__, qid, sc->num_squeues)); 619 pci_nvme_status_tc(&compl->status, 620 NVME_SCT_COMMAND_SPECIFIC, 621 NVME_SC_INVALID_QUEUE_IDENTIFIER); 622 return (1); 623 } 624 625 nsq = &sc->submit_queues[qid]; 626 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 627 628 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 629 sizeof(struct nvme_command) * (size_t)nsq->size); 630 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 631 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 632 633 DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__, 634 qid, nsq->size, nsq->qbase, nsq->cqid)); 635 636 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 637 638 DPRINTF(("%s completed creating IOSQ qid %u", 639 __func__, qid)); 640 } else { 641 /* 642 * Guest sent non-cont submission queue request. 643 * This setting is unsupported by this emulation. 
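		 * (CDW11.PC is clear here: the guest is describing the queue
		 * with a PRP list rather than one physically contiguous
		 * buffer, and this code does not walk PRP lists for queues.)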
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u",
			    __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
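		 * (As above for submission queues: only physically contiguous
		 * queues, CDW11.PC = 1, are handled here.)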
703 */ 704 WPRINTF(("%s unsupported non-contig (list-based) " 705 "create i/o completion queue", 706 __func__)); 707 708 /* 0x12 = Invalid Use of Controller Memory Buffer */ 709 pci_nvme_status_genc(&compl->status, 0x12); 710 } 711 712 return (1); 713 } 714 715 static int 716 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 717 struct nvme_completion* compl) 718 { 719 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 720 uint8_t logpage = command->cdw10 & 0xFF; 721 722 DPRINTF(("%s log page %u len %u", __func__, logpage, logsize)); 723 724 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 725 726 switch (logpage) { 727 case NVME_LOG_ERROR: 728 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 729 command->prp2, (uint8_t *)&sc->err_log, logsize); 730 break; 731 case NVME_LOG_HEALTH_INFORMATION: 732 /* TODO: present some smart info */ 733 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 734 command->prp2, (uint8_t *)&sc->health_log, logsize); 735 break; 736 case NVME_LOG_FIRMWARE_SLOT: 737 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 738 command->prp2, (uint8_t *)&sc->fw_log, logsize); 739 break; 740 default: 741 WPRINTF(("%s get log page %x command not supported", 742 __func__, logpage)); 743 744 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 745 NVME_SC_INVALID_LOG_PAGE); 746 } 747 748 return (1); 749 } 750 751 static int 752 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 753 struct nvme_completion* compl) 754 { 755 void *dest; 756 757 DPRINTF(("%s identify 0x%x nsid 0x%x", __func__, 758 command->cdw10 & 0xFF, command->nsid)); 759 760 switch (command->cdw10 & 0xFF) { 761 case 0x00: /* return Identify Namespace data structure */ 762 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 763 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); 764 break; 765 case 0x01: /* return Identify Controller data structure */ 766 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 767 command->prp2, (uint8_t *)&sc->ctrldata, 768 sizeof(sc->ctrldata)); 769 break; 770 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 771 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 772 sizeof(uint32_t) * 1024); 773 ((uint32_t *)dest)[0] = 1; 774 ((uint32_t *)dest)[1] = 0; 775 break; 776 case 0x11: 777 pci_nvme_status_genc(&compl->status, 778 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 779 return (1); 780 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 781 case 0x10: 782 case 0x12: 783 case 0x13: 784 case 0x14: 785 case 0x15: 786 default: 787 DPRINTF(("%s unsupported identify command requested 0x%x", 788 __func__, command->cdw10 & 0xFF)); 789 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 790 return (1); 791 } 792 793 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 794 return (1); 795 } 796 797 static int 798 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 799 struct nvme_completion* compl) 800 { 801 uint16_t nqr; /* Number of Queues Requested */ 802 803 nqr = command->cdw11 & 0xFFFF; 804 if (nqr == 0xffff) { 805 WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr)); 806 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 807 return (-1); 808 } 809 810 sc->num_squeues = ONE_BASED(nqr); 811 if (sc->num_squeues > sc->max_queues) { 812 DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues, 813 sc->max_queues)); 814 sc->num_squeues = sc->max_queues; 815 } 816 817 nqr = (command->cdw11 >> 16) & 0xFFFF; 818 if (nqr == 
0xffff) { 819 WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr)); 820 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 821 return (-1); 822 } 823 824 sc->num_cqueues = ONE_BASED(nqr); 825 if (sc->num_cqueues > sc->max_queues) { 826 DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues, 827 sc->max_queues)); 828 sc->num_cqueues = sc->max_queues; 829 } 830 831 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 832 833 return (0); 834 } 835 836 static int 837 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 838 struct nvme_completion* compl) 839 { 840 int feature = command->cdw10 & 0xFF; 841 uint32_t iv; 842 843 DPRINTF(("%s feature 0x%x", __func__, feature)); 844 compl->cdw0 = 0; 845 846 switch (feature) { 847 case NVME_FEAT_ARBITRATION: 848 DPRINTF((" arbitration 0x%x", command->cdw11)); 849 break; 850 case NVME_FEAT_POWER_MANAGEMENT: 851 DPRINTF((" power management 0x%x", command->cdw11)); 852 break; 853 case NVME_FEAT_LBA_RANGE_TYPE: 854 DPRINTF((" lba range 0x%x", command->cdw11)); 855 break; 856 case NVME_FEAT_TEMPERATURE_THRESHOLD: 857 DPRINTF((" temperature threshold 0x%x", command->cdw11)); 858 break; 859 case NVME_FEAT_ERROR_RECOVERY: 860 DPRINTF((" error recovery 0x%x", command->cdw11)); 861 break; 862 case NVME_FEAT_VOLATILE_WRITE_CACHE: 863 DPRINTF((" volatile write cache 0x%x", command->cdw11)); 864 break; 865 case NVME_FEAT_NUMBER_OF_QUEUES: 866 nvme_set_feature_queues(sc, command, compl); 867 break; 868 case NVME_FEAT_INTERRUPT_COALESCING: 869 DPRINTF((" interrupt coalescing 0x%x", command->cdw11)); 870 871 /* in uS */ 872 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 873 874 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 875 break; 876 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 877 iv = command->cdw11 & 0xFFFF; 878 879 DPRINTF((" interrupt vector configuration 0x%x", 880 command->cdw11)); 881 882 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { 883 if (sc->compl_queues[i].intr_vec == iv) { 884 if (command->cdw11 & (1 << 16)) 885 sc->compl_queues[i].intr_en |= 886 NVME_CQ_INTCOAL; 887 else 888 sc->compl_queues[i].intr_en &= 889 ~NVME_CQ_INTCOAL; 890 } 891 } 892 break; 893 case NVME_FEAT_WRITE_ATOMICITY: 894 DPRINTF((" write atomicity 0x%x", command->cdw11)); 895 break; 896 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 897 DPRINTF((" async event configuration 0x%x", 898 command->cdw11)); 899 sc->async_ev_config = command->cdw11; 900 break; 901 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 902 DPRINTF((" software progress marker 0x%x", 903 command->cdw11)); 904 break; 905 case 0x0C: 906 DPRINTF((" autonomous power state transition 0x%x", 907 command->cdw11)); 908 break; 909 default: 910 WPRINTF(("%s invalid feature", __func__)); 911 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 912 return (1); 913 } 914 915 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 916 return (1); 917 } 918 919 static int 920 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 921 struct nvme_completion* compl) 922 { 923 int feature = command->cdw10 & 0xFF; 924 925 DPRINTF(("%s feature 0x%x", __func__, feature)); 926 927 compl->cdw0 = 0; 928 929 switch (feature) { 930 case NVME_FEAT_ARBITRATION: 931 DPRINTF((" arbitration")); 932 break; 933 case NVME_FEAT_POWER_MANAGEMENT: 934 DPRINTF((" power management")); 935 break; 936 case NVME_FEAT_LBA_RANGE_TYPE: 937 DPRINTF((" lba range")); 938 break; 939 case NVME_FEAT_TEMPERATURE_THRESHOLD: 940 DPRINTF((" temperature threshold")); 941 switch 
((command->cdw11 >> 20) & 0x3) { 942 case 0: 943 /* Over temp threshold */ 944 compl->cdw0 = 0xFFFF; 945 break; 946 case 1: 947 /* Under temp threshold */ 948 compl->cdw0 = 0; 949 break; 950 default: 951 WPRINTF((" invalid threshold type select")); 952 pci_nvme_status_genc(&compl->status, 953 NVME_SC_INVALID_FIELD); 954 return (1); 955 } 956 break; 957 case NVME_FEAT_ERROR_RECOVERY: 958 DPRINTF((" error recovery")); 959 break; 960 case NVME_FEAT_VOLATILE_WRITE_CACHE: 961 DPRINTF((" volatile write cache")); 962 break; 963 case NVME_FEAT_NUMBER_OF_QUEUES: 964 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 965 966 DPRINTF((" number of queues (submit %u, completion %u)", 967 compl->cdw0 & 0xFFFF, 968 (compl->cdw0 >> 16) & 0xFFFF)); 969 970 break; 971 case NVME_FEAT_INTERRUPT_COALESCING: 972 DPRINTF((" interrupt coalescing")); 973 break; 974 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 975 DPRINTF((" interrupt vector configuration")); 976 break; 977 case NVME_FEAT_WRITE_ATOMICITY: 978 DPRINTF((" write atomicity")); 979 break; 980 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 981 DPRINTF((" async event configuration")); 982 sc->async_ev_config = command->cdw11; 983 break; 984 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 985 DPRINTF((" software progress marker")); 986 break; 987 case 0x0C: 988 DPRINTF((" autonomous power state transition")); 989 break; 990 default: 991 WPRINTF(("%s invalid feature 0x%x", __func__, feature)); 992 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 993 return (1); 994 } 995 996 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 997 return (1); 998 } 999 1000 static int 1001 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1002 struct nvme_completion* compl) 1003 { 1004 DPRINTF(("%s submission queue %u, command ID 0x%x", __func__, 1005 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); 1006 1007 /* TODO: search for the command ID and abort it */ 1008 1009 compl->cdw0 = 1; 1010 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1011 return (1); 1012 } 1013 1014 static int 1015 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1016 struct nvme_command* command, struct nvme_completion* compl) 1017 { 1018 DPRINTF(("%s async event request 0x%x", __func__, command->cdw11)); 1019 1020 /* 1021 * TODO: raise events when they happen based on the Set Features cmd. 1022 * These events happen async, so only set completion successful if 1023 * there is an event reflective of the request to get event. 
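	 * Until then, this handler only sets an Async Event Request Limit
	 * Exceeded status and returns 0, so no interrupt is generated for it.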
1024 */ 1025 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1026 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1027 return (0); 1028 } 1029 1030 static void 1031 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1032 { 1033 struct nvme_completion compl; 1034 struct nvme_command *cmd; 1035 struct nvme_submission_queue *sq; 1036 struct nvme_completion_queue *cq; 1037 int do_intr = 0; 1038 uint16_t sqhead; 1039 1040 DPRINTF(("%s index %u", __func__, (uint32_t)value)); 1041 1042 sq = &sc->submit_queues[0]; 1043 1044 sqhead = atomic_load_acq_short(&sq->head); 1045 1046 if (atomic_testandset_int(&sq->busy, 1)) { 1047 DPRINTF(("%s SQ busy, head %u, tail %u", 1048 __func__, sqhead, sq->tail)); 1049 return; 1050 } 1051 1052 DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail)); 1053 1054 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1055 cmd = &(sq->qbase)[sqhead]; 1056 compl.cdw0 = 0; 1057 compl.status = 0; 1058 1059 switch (cmd->opc) { 1060 case NVME_OPC_DELETE_IO_SQ: 1061 DPRINTF(("%s command DELETE_IO_SQ", __func__)); 1062 do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); 1063 break; 1064 case NVME_OPC_CREATE_IO_SQ: 1065 DPRINTF(("%s command CREATE_IO_SQ", __func__)); 1066 do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); 1067 break; 1068 case NVME_OPC_DELETE_IO_CQ: 1069 DPRINTF(("%s command DELETE_IO_CQ", __func__)); 1070 do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); 1071 break; 1072 case NVME_OPC_CREATE_IO_CQ: 1073 DPRINTF(("%s command CREATE_IO_CQ", __func__)); 1074 do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); 1075 break; 1076 case NVME_OPC_GET_LOG_PAGE: 1077 DPRINTF(("%s command GET_LOG_PAGE", __func__)); 1078 do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); 1079 break; 1080 case NVME_OPC_IDENTIFY: 1081 DPRINTF(("%s command IDENTIFY", __func__)); 1082 do_intr |= nvme_opc_identify(sc, cmd, &compl); 1083 break; 1084 case NVME_OPC_ABORT: 1085 DPRINTF(("%s command ABORT", __func__)); 1086 do_intr |= nvme_opc_abort(sc, cmd, &compl); 1087 break; 1088 case NVME_OPC_SET_FEATURES: 1089 DPRINTF(("%s command SET_FEATURES", __func__)); 1090 do_intr |= nvme_opc_set_features(sc, cmd, &compl); 1091 break; 1092 case NVME_OPC_GET_FEATURES: 1093 DPRINTF(("%s command GET_FEATURES", __func__)); 1094 do_intr |= nvme_opc_get_features(sc, cmd, &compl); 1095 break; 1096 case NVME_OPC_ASYNC_EVENT_REQUEST: 1097 DPRINTF(("%s command ASYNC_EVENT_REQ", __func__)); 1098 /* XXX dont care, unhandled for now 1099 do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); 1100 */ 1101 compl.status = NVME_NO_STATUS; 1102 break; 1103 default: 1104 WPRINTF(("0x%x command is not implemented", 1105 cmd->opc)); 1106 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1107 do_intr |= 1; 1108 } 1109 1110 if (NVME_COMPLETION_VALID(compl)) { 1111 struct nvme_completion *cp; 1112 int phase; 1113 1114 cq = &sc->compl_queues[0]; 1115 1116 cp = &(cq->qbase)[cq->tail]; 1117 cp->cdw0 = compl.cdw0; 1118 cp->sqid = 0; 1119 cp->sqhd = sqhead; 1120 cp->cid = cmd->cid; 1121 1122 phase = NVME_STATUS_GET_P(cp->status); 1123 cp->status = compl.status; 1124 pci_nvme_toggle_phase(&cp->status, phase); 1125 1126 cq->tail = (cq->tail + 1) % cq->size; 1127 } 1128 sqhead = (sqhead + 1) % sq->size; 1129 } 1130 1131 DPRINTF(("setting sqhead %u", sqhead)); 1132 atomic_store_short(&sq->head, sqhead); 1133 atomic_store_int(&sq->busy, 0); 1134 1135 if (do_intr) 1136 pci_generate_msix(sc->nsc_pi, 0); 1137 1138 } 1139 1140 static int 1141 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct 
pci_nvme_ioreq *req, 1142 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1143 { 1144 int iovidx; 1145 1146 if (req != NULL) { 1147 /* concatenate contig block-iovs to minimize number of iovs */ 1148 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1149 iovidx = req->io_req.br_iovcnt - 1; 1150 1151 req->io_req.br_iov[iovidx].iov_base = 1152 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1153 req->prev_gpaddr, size); 1154 1155 req->prev_size += size; 1156 req->io_req.br_resid += size; 1157 1158 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1159 } else { 1160 pthread_mutex_lock(&req->mtx); 1161 1162 iovidx = req->io_req.br_iovcnt; 1163 if (iovidx == NVME_MAX_BLOCKIOVS) { 1164 int err = 0; 1165 1166 DPRINTF(("large I/O, doing partial req")); 1167 1168 iovidx = 0; 1169 req->io_req.br_iovcnt = 0; 1170 1171 req->io_req.br_callback = pci_nvme_io_partial; 1172 1173 if (!do_write) 1174 err = blockif_read(sc->nvstore.ctx, 1175 &req->io_req); 1176 else 1177 err = blockif_write(sc->nvstore.ctx, 1178 &req->io_req); 1179 1180 /* wait until req completes before cont */ 1181 if (err == 0) 1182 pthread_cond_wait(&req->cv, &req->mtx); 1183 } 1184 if (iovidx == 0) { 1185 req->io_req.br_offset = lba; 1186 req->io_req.br_resid = 0; 1187 req->io_req.br_param = req; 1188 } 1189 1190 req->io_req.br_iov[iovidx].iov_base = 1191 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1192 gpaddr, size); 1193 1194 req->io_req.br_iov[iovidx].iov_len = size; 1195 1196 req->prev_gpaddr = gpaddr; 1197 req->prev_size = size; 1198 req->io_req.br_resid += size; 1199 1200 req->io_req.br_iovcnt++; 1201 1202 pthread_mutex_unlock(&req->mtx); 1203 } 1204 } else { 1205 /* RAM buffer: read/write directly */ 1206 void *p = sc->nvstore.ctx; 1207 void *gptr; 1208 1209 if ((lba + size) > sc->nvstore.size) { 1210 WPRINTF(("%s write would overflow RAM", __func__)); 1211 return (-1); 1212 } 1213 1214 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1215 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1216 if (do_write) 1217 memcpy(p, gptr, size); 1218 else 1219 memcpy(gptr, p, size); 1220 } 1221 return (0); 1222 } 1223 1224 static void 1225 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1226 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1227 uint32_t cdw0, uint16_t status, int ignore_busy) 1228 { 1229 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1230 struct nvme_completion *compl; 1231 int do_intr = 0; 1232 int phase; 1233 1234 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 1235 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1236 NVME_STATUS_GET_SC(status))); 1237 1238 pthread_mutex_lock(&cq->mtx); 1239 1240 assert(cq->qbase != NULL); 1241 1242 compl = &cq->qbase[cq->tail]; 1243 1244 compl->sqhd = atomic_load_acq_short(&sq->head); 1245 compl->sqid = sqid; 1246 compl->cid = cid; 1247 1248 // toggle phase 1249 phase = NVME_STATUS_GET_P(compl->status); 1250 compl->status = status; 1251 pci_nvme_toggle_phase(&compl->status, phase); 1252 1253 cq->tail = (cq->tail + 1) % cq->size; 1254 1255 if (cq->intr_en & NVME_CQ_INTEN) 1256 do_intr = 1; 1257 1258 pthread_mutex_unlock(&cq->mtx); 1259 1260 if (ignore_busy || !atomic_load_acq_int(&sq->busy)) 1261 if (do_intr) 1262 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1263 } 1264 1265 static void 1266 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1267 { 1268 req->sc = NULL; 1269 req->nvme_sq = NULL; 1270 req->sqid = 0; 1271 1272 pthread_mutex_lock(&sc->mtx); 1273 1274 req->next = sc->ioreqs_free; 
1275 sc->ioreqs_free = req; 1276 sc->pending_ios--; 1277 1278 /* when no more IO pending, can set to ready if device reset/enabled */ 1279 if (sc->pending_ios == 0 && 1280 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1281 sc->regs.csts |= NVME_CSTS_RDY; 1282 1283 pthread_mutex_unlock(&sc->mtx); 1284 1285 sem_post(&sc->iosemlock); 1286 } 1287 1288 static struct pci_nvme_ioreq * 1289 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1290 { 1291 struct pci_nvme_ioreq *req = NULL;; 1292 1293 sem_wait(&sc->iosemlock); 1294 pthread_mutex_lock(&sc->mtx); 1295 1296 req = sc->ioreqs_free; 1297 assert(req != NULL); 1298 1299 sc->ioreqs_free = req->next; 1300 1301 req->next = NULL; 1302 req->sc = sc; 1303 1304 sc->pending_ios++; 1305 1306 pthread_mutex_unlock(&sc->mtx); 1307 1308 req->io_req.br_iovcnt = 0; 1309 req->io_req.br_offset = 0; 1310 req->io_req.br_resid = 0; 1311 req->io_req.br_param = req; 1312 req->prev_gpaddr = 0; 1313 req->prev_size = 0; 1314 1315 return req; 1316 } 1317 1318 static void 1319 pci_nvme_io_done(struct blockif_req *br, int err) 1320 { 1321 struct pci_nvme_ioreq *req = br->br_param; 1322 struct nvme_submission_queue *sq = req->nvme_sq; 1323 uint16_t code, status; 1324 1325 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1326 1327 /* TODO return correct error */ 1328 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1329 pci_nvme_status_genc(&status, code); 1330 1331 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); 1332 pci_nvme_release_ioreq(req->sc, req); 1333 } 1334 1335 static void 1336 pci_nvme_io_partial(struct blockif_req *br, int err) 1337 { 1338 struct pci_nvme_ioreq *req = br->br_param; 1339 1340 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1341 1342 pthread_cond_signal(&req->cv); 1343 } 1344 1345 1346 static void 1347 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 1348 { 1349 struct nvme_submission_queue *sq; 1350 uint16_t status; 1351 uint16_t sqhead; 1352 int err; 1353 1354 /* handle all submissions up to sq->tail index */ 1355 sq = &sc->submit_queues[idx]; 1356 1357 if (atomic_testandset_int(&sq->busy, 1)) { 1358 DPRINTF(("%s sqid %u busy", __func__, idx)); 1359 return; 1360 } 1361 1362 sqhead = atomic_load_acq_short(&sq->head); 1363 1364 DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p", 1365 idx, sqhead, sq->tail, sq->qbase)); 1366 1367 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1368 struct nvme_command *cmd; 1369 struct pci_nvme_ioreq *req = NULL; 1370 uint64_t lba; 1371 uint64_t nblocks, bytes, size, cpsz; 1372 1373 /* TODO: support scatter gather list handling */ 1374 1375 cmd = &sq->qbase[sqhead]; 1376 sqhead = (sqhead + 1) % sq->size; 1377 1378 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 1379 1380 if (cmd->opc == NVME_OPC_FLUSH) { 1381 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1382 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1383 status, 1); 1384 1385 continue; 1386 } else if (cmd->opc == 0x08) { 1387 /* TODO: write zeroes */ 1388 WPRINTF(("%s write zeroes lba 0x%lx blocks %u", 1389 __func__, lba, cmd->cdw12 & 0xFFFF)); 1390 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1391 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1392 status, 1); 1393 1394 continue; 1395 } 1396 1397 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 1398 1399 bytes = nblocks * sc->nvstore.sectsz; 1400 1401 if (sc->nvstore.type == NVME_STOR_BLOCKIF) { 1402 req = pci_nvme_get_ioreq(sc); 1403 req->nvme_sq = sq; 1404 req->sqid = idx; 1405 } 1406 1407 /* 1408 * If 
data starts mid-page and flows into the next page, then 1409 * increase page count 1410 */ 1411 1412 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " 1413 "(%lu-bytes)", 1414 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, 1415 cmd->opc == NVME_OPC_WRITE ? 1416 "WRITE" : "READ", 1417 lba, nblocks, bytes)); 1418 1419 cmd->prp1 &= ~(0x03UL); 1420 cmd->prp2 &= ~(0x03UL); 1421 1422 DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2)); 1423 1424 size = bytes; 1425 lba *= sc->nvstore.sectsz; 1426 1427 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); 1428 1429 if (cpsz > bytes) 1430 cpsz = bytes; 1431 1432 if (req != NULL) { 1433 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | 1434 cmd->cdw10; 1435 req->opc = cmd->opc; 1436 req->cid = cmd->cid; 1437 req->nsid = cmd->nsid; 1438 } 1439 1440 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, 1441 cmd->opc == NVME_OPC_WRITE, lba); 1442 lba += cpsz; 1443 size -= cpsz; 1444 1445 if (size == 0) 1446 goto iodone; 1447 1448 if (size <= PAGE_SIZE) { 1449 /* prp2 is second (and final) page in transfer */ 1450 1451 err = pci_nvme_append_iov_req(sc, req, cmd->prp2, 1452 size, 1453 cmd->opc == NVME_OPC_WRITE, 1454 lba); 1455 } else { 1456 uint64_t *prp_list; 1457 int i; 1458 1459 /* prp2 is pointer to a physical region page list */ 1460 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, 1461 cmd->prp2, PAGE_SIZE); 1462 1463 i = 0; 1464 while (size != 0) { 1465 cpsz = MIN(size, PAGE_SIZE); 1466 1467 /* 1468 * Move to linked physical region page list 1469 * in last item. 1470 */ 1471 if (i == (NVME_PRP2_ITEMS-1) && 1472 size > PAGE_SIZE) { 1473 assert((prp_list[i] & (PAGE_SIZE-1)) == 0); 1474 prp_list = paddr_guest2host( 1475 sc->nsc_pi->pi_vmctx, 1476 prp_list[i], PAGE_SIZE); 1477 i = 0; 1478 } 1479 if (prp_list[i] == 0) { 1480 WPRINTF(("PRP2[%d] = 0 !!!", i)); 1481 err = 1; 1482 break; 1483 } 1484 1485 err = pci_nvme_append_iov_req(sc, req, 1486 prp_list[i], cpsz, 1487 cmd->opc == NVME_OPC_WRITE, lba); 1488 if (err) 1489 break; 1490 1491 lba += cpsz; 1492 size -= cpsz; 1493 i++; 1494 } 1495 } 1496 1497 iodone: 1498 if (sc->nvstore.type == NVME_STOR_RAM) { 1499 uint16_t code, status; 1500 1501 code = err ? NVME_SC_LBA_OUT_OF_RANGE : 1502 NVME_SC_SUCCESS; 1503 pci_nvme_status_genc(&status, code); 1504 1505 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1506 status, 1); 1507 1508 continue; 1509 } 1510 1511 1512 if (err) 1513 goto do_error; 1514 1515 req->io_req.br_callback = pci_nvme_io_done; 1516 1517 err = 0; 1518 switch (cmd->opc) { 1519 case NVME_OPC_READ: 1520 err = blockif_read(sc->nvstore.ctx, &req->io_req); 1521 break; 1522 case NVME_OPC_WRITE: 1523 err = blockif_write(sc->nvstore.ctx, &req->io_req); 1524 break; 1525 default: 1526 WPRINTF(("%s unhandled io command 0x%x", 1527 __func__, cmd->opc)); 1528 err = 1; 1529 } 1530 1531 do_error: 1532 if (err) { 1533 uint16_t status; 1534 1535 pci_nvme_status_genc(&status, 1536 NVME_SC_DATA_TRANSFER_ERROR); 1537 1538 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1539 status, 1); 1540 pci_nvme_release_ioreq(sc, req); 1541 } 1542 } 1543 1544 atomic_store_short(&sq->head, sqhead); 1545 atomic_store_int(&sq->busy, 0); 1546 } 1547 1548 static void 1549 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 1550 uint64_t idx, int is_sq, uint64_t value) 1551 { 1552 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx", 1553 idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); 1554 1555 if (is_sq) { 1556 atomic_store_short(&sc->submit_queues[idx].tail, 1557 (uint16_t)value); 1558 1559 if (idx == 0) { 1560 pci_nvme_handle_admin_cmd(sc, value); 1561 } else { 1562 /* submission queue; handle new entries in SQ */ 1563 if (idx > sc->num_squeues) { 1564 WPRINTF(("%s SQ index %lu overflow from " 1565 "guest (max %u)", 1566 __func__, idx, sc->num_squeues)); 1567 return; 1568 } 1569 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1570 } 1571 } else { 1572 if (idx > sc->num_cqueues) { 1573 WPRINTF(("%s queue index %lu overflow from " 1574 "guest (max %u)", 1575 __func__, idx, sc->num_cqueues)); 1576 return; 1577 } 1578 1579 sc->compl_queues[idx].head = (uint16_t)value; 1580 } 1581 } 1582 1583 static void 1584 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1585 { 1586 const char *s = iswrite ? "WRITE" : "READ"; 1587 1588 switch (offset) { 1589 case NVME_CR_CAP_LOW: 1590 DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s)); 1591 break; 1592 case NVME_CR_CAP_HI: 1593 DPRINTF(("%s %s NVME_CR_CAP_HI", func, s)); 1594 break; 1595 case NVME_CR_VS: 1596 DPRINTF(("%s %s NVME_CR_VS", func, s)); 1597 break; 1598 case NVME_CR_INTMS: 1599 DPRINTF(("%s %s NVME_CR_INTMS", func, s)); 1600 break; 1601 case NVME_CR_INTMC: 1602 DPRINTF(("%s %s NVME_CR_INTMC", func, s)); 1603 break; 1604 case NVME_CR_CC: 1605 DPRINTF(("%s %s NVME_CR_CC", func, s)); 1606 break; 1607 case NVME_CR_CSTS: 1608 DPRINTF(("%s %s NVME_CR_CSTS", func, s)); 1609 break; 1610 case NVME_CR_NSSR: 1611 DPRINTF(("%s %s NVME_CR_NSSR", func, s)); 1612 break; 1613 case NVME_CR_AQA: 1614 DPRINTF(("%s %s NVME_CR_AQA", func, s)); 1615 break; 1616 case NVME_CR_ASQ_LOW: 1617 DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s)); 1618 break; 1619 case NVME_CR_ASQ_HI: 1620 DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s)); 1621 break; 1622 case NVME_CR_ACQ_LOW: 1623 DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s)); 1624 break; 1625 case NVME_CR_ACQ_HI: 1626 DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s)); 1627 break; 1628 default: 1629 DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset)); 1630 } 1631 1632 } 1633 1634 static void 1635 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 1636 uint64_t offset, int size, uint64_t value) 1637 { 1638 uint32_t ccreg; 1639 1640 if (offset >= NVME_DOORBELL_OFFSET) { 1641 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 1642 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 1643 int is_sq = (belloffset % 8) < 4; 1644 1645 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 1646 WPRINTF(("guest attempted an overflow write offset " 1647 "0x%lx, val 0x%lx in %s", 1648 offset, value, __func__)); 1649 return; 1650 } 1651 1652 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 1653 return; 1654 } 1655 1656 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx", 1657 offset, size, value)); 1658 1659 if (size != 4) { 1660 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " 1661 "val 0x%lx) to bar0 in %s", 1662 size, offset, value, __func__)); 1663 /* TODO: shutdown device */ 1664 return; 1665 } 1666 1667 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 1668 1669 pthread_mutex_lock(&sc->mtx); 1670 1671 switch (offset) { 1672 case NVME_CR_CAP_LOW: 1673 case NVME_CR_CAP_HI: 1674 /* readonly */ 1675 break; 1676 case NVME_CR_VS: 1677 /* readonly */ 1678 break; 1679 case NVME_CR_INTMS: 1680 /* MSI-X, so ignore */ 1681 break; 1682 case NVME_CR_INTMC: 1683 /* MSI-X, so ignore */ 1684 break; 1685 case NVME_CR_CC: 1686 ccreg = (uint32_t)value; 1687 1688 DPRINTF(("%s 
NVME_CR_CC en %x css %x shn %x iosqes %u " 1689 "iocqes %u", 1690 __func__, 1691 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 1692 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 1693 NVME_CC_GET_IOCQES(ccreg))); 1694 1695 if (NVME_CC_GET_SHN(ccreg)) { 1696 /* perform shutdown - flush out data to backend */ 1697 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 1698 NVME_CSTS_REG_SHST_SHIFT); 1699 sc->regs.csts |= NVME_SHST_COMPLETE << 1700 NVME_CSTS_REG_SHST_SHIFT; 1701 } 1702 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 1703 if (NVME_CC_GET_EN(ccreg) == 0) 1704 /* transition 1-> causes controller reset */ 1705 pci_nvme_reset_locked(sc); 1706 else 1707 pci_nvme_init_controller(ctx, sc); 1708 } 1709 1710 /* Insert the iocqes, iosqes and en bits from the write */ 1711 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 1712 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 1713 if (NVME_CC_GET_EN(ccreg) == 0) { 1714 /* Insert the ams, mps and css bit fields */ 1715 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 1716 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 1717 sc->regs.csts &= ~NVME_CSTS_RDY; 1718 } else if (sc->pending_ios == 0) { 1719 sc->regs.csts |= NVME_CSTS_RDY; 1720 } 1721 break; 1722 case NVME_CR_CSTS: 1723 break; 1724 case NVME_CR_NSSR: 1725 /* ignore writes; don't support subsystem reset */ 1726 break; 1727 case NVME_CR_AQA: 1728 sc->regs.aqa = (uint32_t)value; 1729 break; 1730 case NVME_CR_ASQ_LOW: 1731 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 1732 (0xFFFFF000 & value); 1733 break; 1734 case NVME_CR_ASQ_HI: 1735 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 1736 (value << 32); 1737 break; 1738 case NVME_CR_ACQ_LOW: 1739 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 1740 (0xFFFFF000 & value); 1741 break; 1742 case NVME_CR_ACQ_HI: 1743 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 1744 (value << 32); 1745 break; 1746 default: 1747 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d", 1748 __func__, offset, value, size)); 1749 } 1750 pthread_mutex_unlock(&sc->mtx); 1751 } 1752 1753 static void 1754 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 1755 int baridx, uint64_t offset, int size, uint64_t value) 1756 { 1757 struct pci_nvme_softc* sc = pi->pi_arg; 1758 1759 if (baridx == pci_msix_table_bar(pi) || 1760 baridx == pci_msix_pba_bar(pi)) { 1761 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " 1762 " value 0x%lx", baridx, offset, size, value)); 1763 1764 pci_emul_msix_twrite(pi, offset, size, value); 1765 return; 1766 } 1767 1768 switch (baridx) { 1769 case 0: 1770 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 1771 break; 1772 1773 default: 1774 DPRINTF(("%s unknown baridx %d, val 0x%lx", 1775 __func__, baridx, value)); 1776 } 1777 } 1778 1779 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 1780 uint64_t offset, int size) 1781 { 1782 uint64_t value; 1783 1784 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 1785 1786 if (offset < NVME_DOORBELL_OFFSET) { 1787 void *p = &(sc->regs); 1788 pthread_mutex_lock(&sc->mtx); 1789 memcpy(&value, (void *)((uintptr_t)p + offset), size); 1790 pthread_mutex_unlock(&sc->mtx); 1791 } else { 1792 value = 0; 1793 WPRINTF(("pci_nvme: read invalid offset %ld", offset)); 1794 } 1795 1796 switch (size) { 1797 case 1: 1798 value &= 0xFF; 1799 break; 1800 case 2: 1801 value &= 0xFFFF; 1802 break; 1803 case 4: 1804 value &= 0xFFFFFFFF; 1805 break; 1806 } 1807 1808 DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x", 1809 offset, size, (uint32_t)value)); 1810 1811 
return (value); 1812 } 1813 1814 1815 1816 static uint64_t 1817 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 1818 uint64_t offset, int size) 1819 { 1820 struct pci_nvme_softc* sc = pi->pi_arg; 1821 1822 if (baridx == pci_msix_table_bar(pi) || 1823 baridx == pci_msix_pba_bar(pi)) { 1824 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 1825 baridx, offset, size)); 1826 1827 return pci_emul_msix_tread(pi, offset, size); 1828 } 1829 1830 switch (baridx) { 1831 case 0: 1832 return pci_nvme_read_bar_0(sc, offset, size); 1833 1834 default: 1835 DPRINTF(("unknown bar %d, 0x%lx", baridx, offset)); 1836 } 1837 1838 return (0); 1839 } 1840 1841 1842 static int 1843 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 1844 { 1845 char bident[sizeof("XX:X:X")]; 1846 char *uopt, *xopts, *config; 1847 uint32_t sectsz; 1848 int optidx; 1849 1850 sc->max_queues = NVME_QUEUES; 1851 sc->max_qentries = NVME_MAX_QENTRIES; 1852 sc->ioslots = NVME_IOSLOTS; 1853 sc->num_squeues = sc->max_queues; 1854 sc->num_cqueues = sc->max_queues; 1855 sectsz = 0; 1856 1857 uopt = strdup(opts); 1858 optidx = 0; 1859 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 1860 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1861 for (xopts = strtok(uopt, ","); 1862 xopts != NULL; 1863 xopts = strtok(NULL, ",")) { 1864 1865 if ((config = strchr(xopts, '=')) != NULL) 1866 *config++ = '\0'; 1867 1868 if (!strcmp("maxq", xopts)) { 1869 sc->max_queues = atoi(config); 1870 } else if (!strcmp("qsz", xopts)) { 1871 sc->max_qentries = atoi(config); 1872 } else if (!strcmp("ioslots", xopts)) { 1873 sc->ioslots = atoi(config); 1874 } else if (!strcmp("sectsz", xopts)) { 1875 sectsz = atoi(config); 1876 } else if (!strcmp("ser", xopts)) { 1877 /* 1878 * This field indicates the Product Serial Number in 1879 * 7-bit ASCII, unused bytes should be space characters. 1880 * Ref: NVMe v1.3c. 
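			 * As an illustrative example, ser=BHYVE0001 would be
			 * stored in sn with the remaining bytes padded with
			 * spaces by cpywithpad() below.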
1881 */ 1882 cpywithpad((char *)sc->ctrldata.sn, 1883 sizeof(sc->ctrldata.sn), config, ' '); 1884 } else if (!strcmp("ram", xopts)) { 1885 uint64_t sz = strtoull(&xopts[4], NULL, 10); 1886 1887 sc->nvstore.type = NVME_STOR_RAM; 1888 sc->nvstore.size = sz * 1024 * 1024; 1889 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1890 sc->nvstore.sectsz = 4096; 1891 sc->nvstore.sectsz_bits = 12; 1892 if (sc->nvstore.ctx == NULL) { 1893 perror("Unable to allocate RAM"); 1894 free(uopt); 1895 return (-1); 1896 } 1897 } else if (!strcmp("eui64", xopts)) { 1898 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 1899 } else if (optidx == 0) { 1900 snprintf(bident, sizeof(bident), "%d:%d", 1901 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1902 sc->nvstore.ctx = blockif_open(xopts, bident); 1903 if (sc->nvstore.ctx == NULL) { 1904 perror("Could not open backing file"); 1905 free(uopt); 1906 return (-1); 1907 } 1908 sc->nvstore.type = NVME_STOR_BLOCKIF; 1909 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 1910 } else { 1911 EPRINTLN("Invalid option %s", xopts); 1912 free(uopt); 1913 return (-1); 1914 } 1915 1916 optidx++; 1917 } 1918 free(uopt); 1919 1920 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 1921 EPRINTLN("backing store not specified"); 1922 return (-1); 1923 } 1924 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 1925 sc->nvstore.sectsz = sectsz; 1926 else if (sc->nvstore.type != NVME_STOR_RAM) 1927 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 1928 for (sc->nvstore.sectsz_bits = 9; 1929 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 1930 sc->nvstore.sectsz_bits++); 1931 1932 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 1933 sc->max_queues = NVME_QUEUES; 1934 1935 if (sc->max_qentries <= 0) { 1936 EPRINTLN("Invalid qsz option"); 1937 return (-1); 1938 } 1939 if (sc->ioslots <= 0) { 1940 EPRINTLN("Invalid ioslots option"); 1941 return (-1); 1942 } 1943 1944 return (0); 1945 } 1946 1947 static int 1948 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 1949 { 1950 struct pci_nvme_softc *sc; 1951 uint32_t pci_membar_sz; 1952 int error; 1953 1954 error = 0; 1955 1956 sc = calloc(1, sizeof(struct pci_nvme_softc)); 1957 pi->pi_arg = sc; 1958 sc->nsc_pi = pi; 1959 1960 error = pci_nvme_parse_opts(sc, opts); 1961 if (error < 0) 1962 goto done; 1963 else 1964 error = 0; 1965 1966 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 1967 for (int i = 0; i < sc->ioslots; i++) { 1968 if (i < (sc->ioslots-1)) 1969 sc->ioreqs[i].next = &sc->ioreqs[i+1]; 1970 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); 1971 pthread_cond_init(&sc->ioreqs[i].cv, NULL); 1972 } 1973 sc->ioreqs_free = sc->ioreqs; 1974 sc->intr_coales_aggr_thresh = 1; 1975 1976 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 1977 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 1978 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 1979 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 1980 pci_set_cfgdata8(pi, PCIR_PROGIF, 1981 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 1982 1983 /* 1984 * Allocate size of NVMe registers + doorbell space for all queues. 1985 * 1986 * The specification requires a minimum memory I/O window size of 16K. 1987 * The Windows driver will refuse to start a device with a smaller 1988 * window. 
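	 * The size computed below is the fixed register file plus one
	 * submission and one completion doorbell (2 * sizeof(uint32_t)) for
	 * each of the admin and I/O queue pairs, rounded up to
	 * NVME_MMIO_SPACE_MIN.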
1989 */ 1990 pci_membar_sz = sizeof(struct nvme_registers) + 1991 2 * sizeof(uint32_t) * (sc->max_queues + 1); 1992 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 1993 1994 DPRINTF(("nvme membar size: %u", pci_membar_sz)); 1995 1996 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 1997 if (error) { 1998 WPRINTF(("%s pci alloc mem bar failed", __func__)); 1999 goto done; 2000 } 2001 2002 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2003 if (error) { 2004 WPRINTF(("%s pci add msixcap failed", __func__)); 2005 goto done; 2006 } 2007 2008 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2009 if (error) { 2010 WPRINTF(("%s pci add Express capability failed", __func__)); 2011 goto done; 2012 } 2013 2014 pthread_mutex_init(&sc->mtx, NULL); 2015 sem_init(&sc->iosemlock, 0, sc->ioslots); 2016 2017 pci_nvme_reset(sc); 2018 pci_nvme_init_ctrldata(sc); 2019 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64); 2020 pci_nvme_init_logpages(sc); 2021 2022 pci_lintr_request(pi); 2023 2024 done: 2025 return (error); 2026 } 2027 2028 2029 struct pci_devemu pci_de_nvme = { 2030 .pe_emu = "nvme", 2031 .pe_init = pci_nvme_init, 2032 .pe_barwrite = pci_nvme_write, 2033 .pe_barread = pci_nvme_read 2034 }; 2035 PCI_EMUL_SET(pci_de_nvme); 2036
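
/*
 * Example invocations (illustrative only; the PCI slot, backing device path,
 * and sizes are placeholders):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001 ...
 *
 * or, backed by a 512 MiB RAM disk:
 *
 *   bhyve ... -s 4,nvme,ram=512,ser=RAMDISK01 ...
 */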