/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(params) if (nvme_debug) printf params
#define	WPRINTF(params) printf params

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MAX_BLOCKIOVS	512

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a
zero-based value */ 111 #define ZERO_BASED(one) ((one) - 1) 112 113 /* Encode number of SQ's and CQ's for Set/Get Features */ 114 #define NVME_FEATURE_NUM_QUEUES(sc) \ 115 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 116 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 117 118 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 119 120 enum nvme_controller_register_offsets { 121 NVME_CR_CAP_LOW = 0x00, 122 NVME_CR_CAP_HI = 0x04, 123 NVME_CR_VS = 0x08, 124 NVME_CR_INTMS = 0x0c, 125 NVME_CR_INTMC = 0x10, 126 NVME_CR_CC = 0x14, 127 NVME_CR_CSTS = 0x1c, 128 NVME_CR_NSSR = 0x20, 129 NVME_CR_AQA = 0x24, 130 NVME_CR_ASQ_LOW = 0x28, 131 NVME_CR_ASQ_HI = 0x2c, 132 NVME_CR_ACQ_LOW = 0x30, 133 NVME_CR_ACQ_HI = 0x34, 134 }; 135 136 enum nvme_cmd_cdw11 { 137 NVME_CMD_CDW11_PC = 0x0001, 138 NVME_CMD_CDW11_IEN = 0x0002, 139 NVME_CMD_CDW11_IV = 0xFFFF0000, 140 }; 141 142 #define NVME_CQ_INTEN 0x01 143 #define NVME_CQ_INTCOAL 0x02 144 145 struct nvme_completion_queue { 146 struct nvme_completion *qbase; 147 uint32_t size; 148 uint16_t tail; /* nvme progress */ 149 uint16_t head; /* guest progress */ 150 uint16_t intr_vec; 151 uint32_t intr_en; 152 pthread_mutex_t mtx; 153 }; 154 155 struct nvme_submission_queue { 156 struct nvme_command *qbase; 157 uint32_t size; 158 uint16_t head; /* nvme progress */ 159 uint16_t tail; /* guest progress */ 160 uint16_t cqid; /* completion queue id */ 161 int busy; /* queue is being processed */ 162 int qpriority; 163 }; 164 165 enum nvme_storage_type { 166 NVME_STOR_BLOCKIF = 0, 167 NVME_STOR_RAM = 1, 168 }; 169 170 struct pci_nvme_blockstore { 171 enum nvme_storage_type type; 172 void *ctx; 173 uint64_t size; 174 uint32_t sectsz; 175 uint32_t sectsz_bits; 176 uint64_t eui64; 177 }; 178 179 struct pci_nvme_ioreq { 180 struct pci_nvme_softc *sc; 181 struct pci_nvme_ioreq *next; 182 struct nvme_submission_queue *nvme_sq; 183 uint16_t sqid; 184 185 /* command information */ 186 uint16_t opc; 187 uint16_t cid; 188 uint32_t nsid; 189 190 uint64_t prev_gpaddr; 191 size_t prev_size; 192 193 /* 194 * lock if all iovs consumed (big IO); 195 * complete transaction before continuing 196 */ 197 pthread_mutex_t mtx; 198 pthread_cond_t cv; 199 200 struct blockif_req io_req; 201 202 /* pad to fit up to 512 page descriptors from guest IO request */ 203 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; 204 }; 205 206 struct pci_nvme_softc { 207 struct pci_devinst *nsc_pi; 208 209 pthread_mutex_t mtx; 210 211 struct nvme_registers regs; 212 213 struct nvme_namespace_data nsdata; 214 struct nvme_controller_data ctrldata; 215 struct nvme_error_information_entry err_log; 216 struct nvme_health_information_page health_log; 217 struct nvme_firmware_page fw_log; 218 219 struct pci_nvme_blockstore nvstore; 220 221 uint16_t max_qentries; /* max entries per queue */ 222 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 223 uint32_t num_cqueues; 224 uint32_t num_squeues; 225 226 struct pci_nvme_ioreq *ioreqs; 227 struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ 228 uint32_t pending_ios; 229 uint32_t ioslots; 230 sem_t iosemlock; 231 232 /* 233 * Memory mapped Submission and Completion queues 234 * Each array includes both Admin and IO queues 235 */ 236 struct nvme_completion_queue *compl_queues; 237 struct nvme_submission_queue *submit_queues; 238 239 /* controller features */ 240 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ 241 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ 242 uint32_t async_ev_config; /* 0x0B: async 
event config */ 243 }; 244 245 246 static void pci_nvme_io_partial(struct blockif_req *br, int err); 247 248 /* Controller Configuration utils */ 249 #define NVME_CC_GET_EN(cc) \ 250 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 251 #define NVME_CC_GET_CSS(cc) \ 252 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 253 #define NVME_CC_GET_SHN(cc) \ 254 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 255 #define NVME_CC_GET_IOSQES(cc) \ 256 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 257 #define NVME_CC_GET_IOCQES(cc) \ 258 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 259 260 #define NVME_CC_WRITE_MASK \ 261 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 262 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 263 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 264 265 #define NVME_CC_NEN_WRITE_MASK \ 266 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 267 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 268 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 269 270 /* Controller Status utils */ 271 #define NVME_CSTS_GET_RDY(sts) \ 272 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 273 274 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 275 276 /* Completion Queue status word utils */ 277 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 278 #define NVME_STATUS_MASK \ 279 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 280 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 281 282 static __inline void 283 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 284 { 285 size_t len; 286 287 len = strnlen(src, dst_size); 288 memset(dst, pad, dst_size); 289 memcpy(dst, src, len); 290 } 291 292 static __inline void 293 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 294 { 295 296 *status &= ~NVME_STATUS_MASK; 297 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 298 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 299 } 300 301 static __inline void 302 pci_nvme_status_genc(uint16_t *status, uint16_t code) 303 { 304 305 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 306 } 307 308 static __inline void 309 pci_nvme_toggle_phase(uint16_t *status, int prev) 310 { 311 312 if (prev) 313 *status &= ~NVME_STATUS_P; 314 else 315 *status |= NVME_STATUS_P; 316 } 317 318 static void 319 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 320 { 321 struct nvme_controller_data *cd = &sc->ctrldata; 322 323 cd->vid = 0xFB5D; 324 cd->ssvid = 0x0000; 325 326 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 327 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 328 329 /* Num of submission commands that we can handle at a time (2^rab) */ 330 cd->rab = 4; 331 332 /* FreeBSD OUI */ 333 cd->ieee[0] = 0x58; 334 cd->ieee[1] = 0x9c; 335 cd->ieee[2] = 0xfc; 336 337 cd->mic = 0; 338 339 cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 340 341 cd->ver = 0x00010300; 342 343 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 344 cd->acl = 2; 345 cd->aerl = 4; 346 347 cd->lpa = 0; /* TODO: support some simple things like SMART */ 348 cd->elpe = 0; /* max error log page entries */ 349 cd->npss = 1; /* number of power states support */ 350 351 /* Warning Composite Temperature Threshold */ 352 cd->wctemp = 0x0157; 353 354 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 355 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 356 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 357 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 358 cd->nn = 1; /* number of namespaces 
*/ 359 360 cd->fna = 0x03; 361 362 cd->power_state[0].mp = 10; 363 } 364 365 /* 366 * Calculate the CRC-16 of the given buffer 367 * See copyright attribution at top of file 368 */ 369 static uint16_t 370 crc16(uint16_t crc, const void *buffer, unsigned int len) 371 { 372 const unsigned char *cp = buffer; 373 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 374 static uint16_t const crc16_table[256] = { 375 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 376 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 377 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 378 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 379 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 380 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 381 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 382 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 383 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 384 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 385 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 386 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 387 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 388 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 389 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 390 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 391 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 392 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 393 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 394 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 395 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 396 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 397 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 398 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 399 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 400 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 401 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 402 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 403 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 404 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 405 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 406 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 407 }; 408 409 while (len--) 410 crc = (((crc >> 8) & 0xffU) ^ 411 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 412 return crc; 413 } 414 415 static void 416 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 417 struct nvme_namespace_data *nd, uint32_t nsid, 418 uint64_t eui64) 419 { 420 421 nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; 422 nd->ncap = nd->nsze; 423 nd->nuse = nd->nsze; 424 425 /* Get LBA and backstore information from backing store */ 426 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ 427 nd->flbas = 0; 428 429 /* Create an EUI-64 if user did not provide one */ 430 if (eui64 == 0) { 431 char *data = NULL; 432 433 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, 434 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 435 436 if (data != NULL) { 437 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 438 free(data); 439 } 440 eui64 = (eui64 << 16) | (nsid & 0xffff); 441 } 442 be64enc(nd->eui64, eui64); 443 444 /* LBA data-sz = 2^lbads */ 445 nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 446 } 447 448 static void 449 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 450 { 451 452 memset(&sc->err_log, 0, sizeof(sc->err_log)); 453 memset(&sc->health_log, 0, sizeof(sc->health_log)); 454 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 455 } 456 457 static void 458 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 459 { 460 DPRINTF(("%s\r\n", __func__)); 461 462 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 463 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 464 (60 << NVME_CAP_LO_REG_TO_SHIFT); 465 466 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 467 468 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 469 470 sc->regs.cc = 0; 471 sc->regs.csts = 0; 472 473 sc->num_cqueues = sc->num_squeues = sc->max_queues; 474 if (sc->submit_queues != NULL) { 475 for (int i = 0; i < sc->num_squeues + 1; i++) { 476 /* 477 * The Admin Submission Queue is at index 0. 478 * It must not be changed at reset otherwise the 479 * emulation will be out of sync with the guest. 480 */ 481 if (i != 0) { 482 sc->submit_queues[i].qbase = NULL; 483 sc->submit_queues[i].size = 0; 484 sc->submit_queues[i].cqid = 0; 485 } 486 sc->submit_queues[i].tail = 0; 487 sc->submit_queues[i].head = 0; 488 sc->submit_queues[i].busy = 0; 489 } 490 } else 491 sc->submit_queues = calloc(sc->num_squeues + 1, 492 sizeof(struct nvme_submission_queue)); 493 494 if (sc->compl_queues != NULL) { 495 for (int i = 0; i < sc->num_cqueues + 1; i++) { 496 /* See Admin Submission Queue note above */ 497 if (i != 0) { 498 sc->compl_queues[i].qbase = NULL; 499 sc->compl_queues[i].size = 0; 500 } 501 502 sc->compl_queues[i].tail = 0; 503 sc->compl_queues[i].head = 0; 504 } 505 } else { 506 sc->compl_queues = calloc(sc->num_cqueues + 1, 507 sizeof(struct nvme_completion_queue)); 508 509 for (int i = 0; i < sc->num_cqueues + 1; i++) 510 pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); 511 } 512 } 513 514 static void 515 pci_nvme_reset(struct pci_nvme_softc *sc) 516 { 517 pthread_mutex_lock(&sc->mtx); 518 pci_nvme_reset_locked(sc); 519 pthread_mutex_unlock(&sc->mtx); 520 } 521 522 static void 523 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 524 { 525 uint16_t acqs, asqs; 526 527 DPRINTF(("%s\r\n", __func__)); 528 529 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 530 sc->submit_queues[0].size = asqs; 531 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 532 sizeof(struct nvme_command) * asqs); 533 534 DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", 535 __func__, sc->regs.asq, sc->submit_queues[0].qbase)); 536 537 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 538 NVME_AQA_REG_ACQS_MASK) + 1; 539 sc->compl_queues[0].size = acqs; 540 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 541 sizeof(struct nvme_completion) * acqs); 542 DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", 543 __func__, sc->regs.acq, sc->compl_queues[0].qbase)); 544 } 545 546 static int 547 nvme_prp_memcpy(struct vmctx *ctx, uint64_t 
prp1, uint64_t prp2, uint8_t *src, 548 size_t len) 549 { 550 uint8_t *dst; 551 size_t bytes; 552 553 if (len > (8 * 1024)) { 554 return (-1); 555 } 556 557 /* Copy from the start of prp1 to the end of the physical page */ 558 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 559 bytes = MIN(bytes, len); 560 561 dst = vm_map_gpa(ctx, prp1, bytes); 562 if (dst == NULL) { 563 return (-1); 564 } 565 566 memcpy(dst, src, bytes); 567 568 src += bytes; 569 570 len -= bytes; 571 if (len == 0) { 572 return (0); 573 } 574 575 len = MIN(len, PAGE_SIZE); 576 577 dst = vm_map_gpa(ctx, prp2, len); 578 if (dst == NULL) { 579 return (-1); 580 } 581 582 memcpy(dst, src, len); 583 584 return (0); 585 } 586 587 static int 588 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 589 struct nvme_completion* compl) 590 { 591 uint16_t qid = command->cdw10 & 0xffff; 592 593 DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); 594 if (qid == 0 || qid > sc->num_squeues) { 595 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", 596 __func__, qid, sc->num_squeues)); 597 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 598 NVME_SC_INVALID_QUEUE_IDENTIFIER); 599 return (1); 600 } 601 602 sc->submit_queues[qid].qbase = NULL; 603 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 604 return (1); 605 } 606 607 static int 608 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 609 struct nvme_completion* compl) 610 { 611 if (command->cdw11 & NVME_CMD_CDW11_PC) { 612 uint16_t qid = command->cdw10 & 0xffff; 613 struct nvme_submission_queue *nsq; 614 615 if ((qid == 0) || (qid > sc->num_squeues)) { 616 WPRINTF(("%s queue index %u > num_squeues %u\r\n", 617 __func__, qid, sc->num_squeues)); 618 pci_nvme_status_tc(&compl->status, 619 NVME_SCT_COMMAND_SPECIFIC, 620 NVME_SC_INVALID_QUEUE_IDENTIFIER); 621 return (1); 622 } 623 624 nsq = &sc->submit_queues[qid]; 625 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 626 627 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 628 sizeof(struct nvme_command) * (size_t)nsq->size); 629 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 630 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 631 632 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, 633 qid, nsq->size, nsq->qbase, nsq->cqid)); 634 635 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 636 637 DPRINTF(("%s completed creating IOSQ qid %u\r\n", 638 __func__, qid)); 639 } else { 640 /* 641 * Guest sent non-cont submission queue request. 642 * This setting is unsupported by this emulation. 
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		    "create i/o submission queue\r\n", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
		        __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
			        __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		ncq = &sc->compl_queues[qid];
		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		/* The queue holds completion entries, not commands */
		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
		    command->prp1,
		    sizeof(struct nvme_completion) * (size_t)ncq->size);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		/*
		 * Non-contig completion queue unsupported.
702 */ 703 WPRINTF(("%s unsupported non-contig (list-based) " 704 "create i/o completion queue\r\n", 705 __func__)); 706 707 /* 0x12 = Invalid Use of Controller Memory Buffer */ 708 pci_nvme_status_genc(&compl->status, 0x12); 709 } 710 711 return (1); 712 } 713 714 static int 715 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 716 struct nvme_completion* compl) 717 { 718 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 719 uint8_t logpage = command->cdw10 & 0xFF; 720 721 DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); 722 723 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 724 725 switch (logpage) { 726 case NVME_LOG_ERROR: 727 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 728 command->prp2, (uint8_t *)&sc->err_log, logsize); 729 break; 730 case NVME_LOG_HEALTH_INFORMATION: 731 /* TODO: present some smart info */ 732 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 733 command->prp2, (uint8_t *)&sc->health_log, logsize); 734 break; 735 case NVME_LOG_FIRMWARE_SLOT: 736 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 737 command->prp2, (uint8_t *)&sc->fw_log, logsize); 738 break; 739 default: 740 WPRINTF(("%s get log page %x command not supported\r\n", 741 __func__, logpage)); 742 743 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 744 NVME_SC_INVALID_LOG_PAGE); 745 } 746 747 return (1); 748 } 749 750 static int 751 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 752 struct nvme_completion* compl) 753 { 754 void *dest; 755 756 DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, 757 command->cdw10 & 0xFF, command->nsid)); 758 759 switch (command->cdw10 & 0xFF) { 760 case 0x00: /* return Identify Namespace data structure */ 761 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 762 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); 763 break; 764 case 0x01: /* return Identify Controller data structure */ 765 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 766 command->prp2, (uint8_t *)&sc->ctrldata, 767 sizeof(sc->ctrldata)); 768 break; 769 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 770 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 771 sizeof(uint32_t) * 1024); 772 ((uint32_t *)dest)[0] = 1; 773 ((uint32_t *)dest)[1] = 0; 774 break; 775 case 0x11: 776 pci_nvme_status_genc(&compl->status, 777 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 778 return (1); 779 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 780 case 0x10: 781 case 0x12: 782 case 0x13: 783 case 0x14: 784 case 0x15: 785 default: 786 DPRINTF(("%s unsupported identify command requested 0x%x\r\n", 787 __func__, command->cdw10 & 0xFF)); 788 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 789 return (1); 790 } 791 792 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 793 return (1); 794 } 795 796 static int 797 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 798 struct nvme_completion* compl) 799 { 800 uint16_t nqr; /* Number of Queues Requested */ 801 802 nqr = command->cdw11 & 0xFFFF; 803 if (nqr == 0xffff) { 804 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); 805 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 806 return (-1); 807 } 808 809 sc->num_squeues = ONE_BASED(nqr); 810 if (sc->num_squeues > sc->max_queues) { 811 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, 812 sc->max_queues)); 813 sc->num_squeues = sc->max_queues; 814 } 815 816 nqr = (command->cdw11 >> 16) & 
0xFFFF; 817 if (nqr == 0xffff) { 818 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); 819 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 820 return (-1); 821 } 822 823 sc->num_cqueues = ONE_BASED(nqr); 824 if (sc->num_cqueues > sc->max_queues) { 825 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, 826 sc->max_queues)); 827 sc->num_cqueues = sc->max_queues; 828 } 829 830 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 831 832 return (0); 833 } 834 835 static int 836 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 837 struct nvme_completion* compl) 838 { 839 int feature = command->cdw10 & 0xFF; 840 uint32_t iv; 841 842 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 843 compl->cdw0 = 0; 844 845 switch (feature) { 846 case NVME_FEAT_ARBITRATION: 847 DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); 848 break; 849 case NVME_FEAT_POWER_MANAGEMENT: 850 DPRINTF((" power management 0x%x\r\n", command->cdw11)); 851 break; 852 case NVME_FEAT_LBA_RANGE_TYPE: 853 DPRINTF((" lba range 0x%x\r\n", command->cdw11)); 854 break; 855 case NVME_FEAT_TEMPERATURE_THRESHOLD: 856 DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); 857 break; 858 case NVME_FEAT_ERROR_RECOVERY: 859 DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); 860 break; 861 case NVME_FEAT_VOLATILE_WRITE_CACHE: 862 DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); 863 break; 864 case NVME_FEAT_NUMBER_OF_QUEUES: 865 nvme_set_feature_queues(sc, command, compl); 866 break; 867 case NVME_FEAT_INTERRUPT_COALESCING: 868 DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); 869 870 /* in uS */ 871 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 872 873 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 874 break; 875 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 876 iv = command->cdw11 & 0xFFFF; 877 878 DPRINTF((" interrupt vector configuration 0x%x\r\n", 879 command->cdw11)); 880 881 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { 882 if (sc->compl_queues[i].intr_vec == iv) { 883 if (command->cdw11 & (1 << 16)) 884 sc->compl_queues[i].intr_en |= 885 NVME_CQ_INTCOAL; 886 else 887 sc->compl_queues[i].intr_en &= 888 ~NVME_CQ_INTCOAL; 889 } 890 } 891 break; 892 case NVME_FEAT_WRITE_ATOMICITY: 893 DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); 894 break; 895 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 896 DPRINTF((" async event configuration 0x%x\r\n", 897 command->cdw11)); 898 sc->async_ev_config = command->cdw11; 899 break; 900 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 901 DPRINTF((" software progress marker 0x%x\r\n", 902 command->cdw11)); 903 break; 904 case 0x0C: 905 DPRINTF((" autonomous power state transition 0x%x\r\n", 906 command->cdw11)); 907 break; 908 default: 909 WPRINTF(("%s invalid feature\r\n", __func__)); 910 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 911 return (1); 912 } 913 914 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 915 return (1); 916 } 917 918 static int 919 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 920 struct nvme_completion* compl) 921 { 922 int feature = command->cdw10 & 0xFF; 923 924 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 925 926 compl->cdw0 = 0; 927 928 switch (feature) { 929 case NVME_FEAT_ARBITRATION: 930 DPRINTF((" arbitration\r\n")); 931 break; 932 case NVME_FEAT_POWER_MANAGEMENT: 933 DPRINTF((" power management\r\n")); 934 break; 935 case NVME_FEAT_LBA_RANGE_TYPE: 936 DPRINTF((" lba range\r\n")); 937 break; 938 
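	/*
	 * Temperature Threshold: CDW11 THSEL (bits 21:20) selects which
	 * threshold is reported: 00b = over temperature, 01b = under
	 * temperature; any other value is an invalid field.
	 */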
case NVME_FEAT_TEMPERATURE_THRESHOLD: 939 DPRINTF((" temperature threshold\r\n")); 940 switch ((command->cdw11 >> 20) & 0x3) { 941 case 0: 942 /* Over temp threshold */ 943 compl->cdw0 = 0xFFFF; 944 break; 945 case 1: 946 /* Under temp threshold */ 947 compl->cdw0 = 0; 948 break; 949 default: 950 WPRINTF((" invalid threshold type select\r\n")); 951 pci_nvme_status_genc(&compl->status, 952 NVME_SC_INVALID_FIELD); 953 return (1); 954 } 955 break; 956 case NVME_FEAT_ERROR_RECOVERY: 957 DPRINTF((" error recovery\r\n")); 958 break; 959 case NVME_FEAT_VOLATILE_WRITE_CACHE: 960 DPRINTF((" volatile write cache\r\n")); 961 break; 962 case NVME_FEAT_NUMBER_OF_QUEUES: 963 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 964 965 DPRINTF((" number of queues (submit %u, completion %u)\r\n", 966 compl->cdw0 & 0xFFFF, 967 (compl->cdw0 >> 16) & 0xFFFF)); 968 969 break; 970 case NVME_FEAT_INTERRUPT_COALESCING: 971 DPRINTF((" interrupt coalescing\r\n")); 972 break; 973 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 974 DPRINTF((" interrupt vector configuration\r\n")); 975 break; 976 case NVME_FEAT_WRITE_ATOMICITY: 977 DPRINTF((" write atomicity\r\n")); 978 break; 979 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 980 DPRINTF((" async event configuration\r\n")); 981 sc->async_ev_config = command->cdw11; 982 break; 983 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 984 DPRINTF((" software progress marker\r\n")); 985 break; 986 case 0x0C: 987 DPRINTF((" autonomous power state transition\r\n")); 988 break; 989 default: 990 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); 991 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 992 return (1); 993 } 994 995 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 996 return (1); 997 } 998 999 static int 1000 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1001 struct nvme_completion* compl) 1002 { 1003 DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, 1004 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); 1005 1006 /* TODO: search for the command ID and abort it */ 1007 1008 compl->cdw0 = 1; 1009 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1010 return (1); 1011 } 1012 1013 static int 1014 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1015 struct nvme_command* command, struct nvme_completion* compl) 1016 { 1017 DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); 1018 1019 /* 1020 * TODO: raise events when they happen based on the Set Features cmd. 1021 * These events happen async, so only set completion successful if 1022 * there is an event reflective of the request to get event. 
1023 */ 1024 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1025 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1026 return (0); 1027 } 1028 1029 static void 1030 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1031 { 1032 struct nvme_completion compl; 1033 struct nvme_command *cmd; 1034 struct nvme_submission_queue *sq; 1035 struct nvme_completion_queue *cq; 1036 int do_intr = 0; 1037 uint16_t sqhead; 1038 1039 DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); 1040 1041 sq = &sc->submit_queues[0]; 1042 1043 sqhead = atomic_load_acq_short(&sq->head); 1044 1045 if (atomic_testandset_int(&sq->busy, 1)) { 1046 DPRINTF(("%s SQ busy, head %u, tail %u\r\n", 1047 __func__, sqhead, sq->tail)); 1048 return; 1049 } 1050 1051 DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); 1052 1053 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1054 cmd = &(sq->qbase)[sqhead]; 1055 compl.cdw0 = 0; 1056 compl.status = 0; 1057 1058 switch (cmd->opc) { 1059 case NVME_OPC_DELETE_IO_SQ: 1060 DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); 1061 do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); 1062 break; 1063 case NVME_OPC_CREATE_IO_SQ: 1064 DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); 1065 do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); 1066 break; 1067 case NVME_OPC_DELETE_IO_CQ: 1068 DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); 1069 do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); 1070 break; 1071 case NVME_OPC_CREATE_IO_CQ: 1072 DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); 1073 do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); 1074 break; 1075 case NVME_OPC_GET_LOG_PAGE: 1076 DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); 1077 do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); 1078 break; 1079 case NVME_OPC_IDENTIFY: 1080 DPRINTF(("%s command IDENTIFY\r\n", __func__)); 1081 do_intr |= nvme_opc_identify(sc, cmd, &compl); 1082 break; 1083 case NVME_OPC_ABORT: 1084 DPRINTF(("%s command ABORT\r\n", __func__)); 1085 do_intr |= nvme_opc_abort(sc, cmd, &compl); 1086 break; 1087 case NVME_OPC_SET_FEATURES: 1088 DPRINTF(("%s command SET_FEATURES\r\n", __func__)); 1089 do_intr |= nvme_opc_set_features(sc, cmd, &compl); 1090 break; 1091 case NVME_OPC_GET_FEATURES: 1092 DPRINTF(("%s command GET_FEATURES\r\n", __func__)); 1093 do_intr |= nvme_opc_get_features(sc, cmd, &compl); 1094 break; 1095 case NVME_OPC_ASYNC_EVENT_REQUEST: 1096 DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); 1097 /* XXX dont care, unhandled for now 1098 do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); 1099 */ 1100 compl.status = NVME_NO_STATUS; 1101 break; 1102 default: 1103 WPRINTF(("0x%x command is not implemented\r\n", 1104 cmd->opc)); 1105 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1106 do_intr |= 1; 1107 } 1108 1109 if (NVME_COMPLETION_VALID(compl)) { 1110 struct nvme_completion *cp; 1111 int phase; 1112 1113 cq = &sc->compl_queues[0]; 1114 1115 cp = &(cq->qbase)[cq->tail]; 1116 cp->cdw0 = compl.cdw0; 1117 cp->sqid = 0; 1118 cp->sqhd = sqhead; 1119 cp->cid = cmd->cid; 1120 1121 phase = NVME_STATUS_GET_P(cp->status); 1122 cp->status = compl.status; 1123 pci_nvme_toggle_phase(&cp->status, phase); 1124 1125 cq->tail = (cq->tail + 1) % cq->size; 1126 } 1127 sqhead = (sqhead + 1) % sq->size; 1128 } 1129 1130 DPRINTF(("setting sqhead %u\r\n", sqhead)); 1131 atomic_store_short(&sq->head, sqhead); 1132 atomic_store_int(&sq->busy, 0); 1133 1134 if (do_intr) 1135 pci_generate_msix(sc->nsc_pi, 0); 1136 1137 } 1138 1139 static int 1140 
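/*
 * Gather the guest buffer behind one PRP entry into the request's iov list.
 * Physically contiguous entries are merged into the previous iov; once
 * NVME_MAX_BLOCKIOVS entries accumulate, a partial blockif request is issued
 * and waited on before the list is restarted.  RAM-backed namespaces are
 * copied directly instead of building an iov list.
 */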
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 1141 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1142 { 1143 int iovidx; 1144 1145 if (req != NULL) { 1146 /* concatenate contig block-iovs to minimize number of iovs */ 1147 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1148 iovidx = req->io_req.br_iovcnt - 1; 1149 1150 req->io_req.br_iov[iovidx].iov_base = 1151 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1152 req->prev_gpaddr, size); 1153 1154 req->prev_size += size; 1155 req->io_req.br_resid += size; 1156 1157 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1158 } else { 1159 pthread_mutex_lock(&req->mtx); 1160 1161 iovidx = req->io_req.br_iovcnt; 1162 if (iovidx == NVME_MAX_BLOCKIOVS) { 1163 int err = 0; 1164 1165 DPRINTF(("large I/O, doing partial req\r\n")); 1166 1167 iovidx = 0; 1168 req->io_req.br_iovcnt = 0; 1169 1170 req->io_req.br_callback = pci_nvme_io_partial; 1171 1172 if (!do_write) 1173 err = blockif_read(sc->nvstore.ctx, 1174 &req->io_req); 1175 else 1176 err = blockif_write(sc->nvstore.ctx, 1177 &req->io_req); 1178 1179 /* wait until req completes before cont */ 1180 if (err == 0) 1181 pthread_cond_wait(&req->cv, &req->mtx); 1182 } 1183 if (iovidx == 0) { 1184 req->io_req.br_offset = lba; 1185 req->io_req.br_resid = 0; 1186 req->io_req.br_param = req; 1187 } 1188 1189 req->io_req.br_iov[iovidx].iov_base = 1190 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1191 gpaddr, size); 1192 1193 req->io_req.br_iov[iovidx].iov_len = size; 1194 1195 req->prev_gpaddr = gpaddr; 1196 req->prev_size = size; 1197 req->io_req.br_resid += size; 1198 1199 req->io_req.br_iovcnt++; 1200 1201 pthread_mutex_unlock(&req->mtx); 1202 } 1203 } else { 1204 /* RAM buffer: read/write directly */ 1205 void *p = sc->nvstore.ctx; 1206 void *gptr; 1207 1208 if ((lba + size) > sc->nvstore.size) { 1209 WPRINTF(("%s write would overflow RAM\r\n", __func__)); 1210 return (-1); 1211 } 1212 1213 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1214 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1215 if (do_write) 1216 memcpy(p, gptr, size); 1217 else 1218 memcpy(gptr, p, size); 1219 } 1220 return (0); 1221 } 1222 1223 static void 1224 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1225 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1226 uint32_t cdw0, uint16_t status, int ignore_busy) 1227 { 1228 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1229 struct nvme_completion *compl; 1230 int do_intr = 0; 1231 int phase; 1232 1233 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", 1234 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1235 NVME_STATUS_GET_SC(status))); 1236 1237 pthread_mutex_lock(&cq->mtx); 1238 1239 assert(cq->qbase != NULL); 1240 1241 compl = &cq->qbase[cq->tail]; 1242 1243 compl->sqhd = atomic_load_acq_short(&sq->head); 1244 compl->sqid = sqid; 1245 compl->cid = cid; 1246 1247 // toggle phase 1248 phase = NVME_STATUS_GET_P(compl->status); 1249 compl->status = status; 1250 pci_nvme_toggle_phase(&compl->status, phase); 1251 1252 cq->tail = (cq->tail + 1) % cq->size; 1253 1254 if (cq->intr_en & NVME_CQ_INTEN) 1255 do_intr = 1; 1256 1257 pthread_mutex_unlock(&cq->mtx); 1258 1259 if (ignore_busy || !atomic_load_acq_int(&sq->busy)) 1260 if (do_intr) 1261 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1262 } 1263 1264 static void 1265 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1266 { 1267 req->sc = NULL; 1268 req->nvme_sq = NULL; 1269 req->sqid = 0; 1270 1271 
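	/* Return the slot to the free list and release one iosemlock slot */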
	pthread_mutex_lock(&sc->mtx);

	req->next = sc->ioreqs_free;
	sc->ioreqs_free = req;
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = sc->ioreqs_free;
	assert(req != NULL);

	sc->ioreqs_free = req->next;

	req->next = NULL;
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
	pci_nvme_release_ioreq(req->sc, req);
}

static void
pci_nvme_io_partial(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;

	DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err)));

	pthread_cond_signal(&req->cv);
}


static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;
	int err;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s sqid %u busy\r\n", __func__, idx));
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n",
	         idx, sqhead, sq->tail, sq->qbase));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req = NULL;
		uint64_t lba;
		uint64_t nblocks, bytes, size, cpsz;

		/* TODO: support scatter gather list handling */

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

		if (cmd->opc == NVME_OPC_FLUSH) {
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		} else if (cmd->opc == 0x08) {
			/* TODO: write zeroes */
			WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n",
			        __func__, lba, cmd->cdw12 & 0xFFFF));
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status, 1);

			continue;
		}

		nblocks = (cmd->cdw12 & 0xFFFF) + 1;

		bytes = nblocks * sc->nvstore.sectsz;

		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req = 
pci_nvme_get_ioreq(sc); 1402 req->nvme_sq = sq; 1403 req->sqid = idx; 1404 } 1405 1406 /* 1407 * If data starts mid-page and flows into the next page, then 1408 * increase page count 1409 */ 1410 1411 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " 1412 "(%lu-bytes)\r\n", 1413 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, 1414 cmd->opc == NVME_OPC_WRITE ? 1415 "WRITE" : "READ", 1416 lba, nblocks, bytes)); 1417 1418 cmd->prp1 &= ~(0x03UL); 1419 cmd->prp2 &= ~(0x03UL); 1420 1421 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); 1422 1423 size = bytes; 1424 lba *= sc->nvstore.sectsz; 1425 1426 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); 1427 1428 if (cpsz > bytes) 1429 cpsz = bytes; 1430 1431 if (req != NULL) { 1432 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | 1433 cmd->cdw10; 1434 req->opc = cmd->opc; 1435 req->cid = cmd->cid; 1436 req->nsid = cmd->nsid; 1437 } 1438 1439 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, 1440 cmd->opc == NVME_OPC_WRITE, lba); 1441 lba += cpsz; 1442 size -= cpsz; 1443 1444 if (size == 0) 1445 goto iodone; 1446 1447 if (size <= PAGE_SIZE) { 1448 /* prp2 is second (and final) page in transfer */ 1449 1450 err = pci_nvme_append_iov_req(sc, req, cmd->prp2, 1451 size, 1452 cmd->opc == NVME_OPC_WRITE, 1453 lba); 1454 } else { 1455 uint64_t *prp_list; 1456 int i; 1457 1458 /* prp2 is pointer to a physical region page list */ 1459 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, 1460 cmd->prp2, PAGE_SIZE); 1461 1462 i = 0; 1463 while (size != 0) { 1464 cpsz = MIN(size, PAGE_SIZE); 1465 1466 /* 1467 * Move to linked physical region page list 1468 * in last item. 1469 */ 1470 if (i == (NVME_PRP2_ITEMS-1) && 1471 size > PAGE_SIZE) { 1472 assert((prp_list[i] & (PAGE_SIZE-1)) == 0); 1473 prp_list = paddr_guest2host( 1474 sc->nsc_pi->pi_vmctx, 1475 prp_list[i], PAGE_SIZE); 1476 i = 0; 1477 } 1478 if (prp_list[i] == 0) { 1479 WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); 1480 err = 1; 1481 break; 1482 } 1483 1484 err = pci_nvme_append_iov_req(sc, req, 1485 prp_list[i], cpsz, 1486 cmd->opc == NVME_OPC_WRITE, lba); 1487 if (err) 1488 break; 1489 1490 lba += cpsz; 1491 size -= cpsz; 1492 i++; 1493 } 1494 } 1495 1496 iodone: 1497 if (sc->nvstore.type == NVME_STOR_RAM) { 1498 uint16_t code, status; 1499 1500 code = err ? NVME_SC_LBA_OUT_OF_RANGE : 1501 NVME_SC_SUCCESS; 1502 pci_nvme_status_genc(&status, code); 1503 1504 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1505 status, 1); 1506 1507 continue; 1508 } 1509 1510 1511 if (err) 1512 goto do_error; 1513 1514 req->io_req.br_callback = pci_nvme_io_done; 1515 1516 err = 0; 1517 switch (cmd->opc) { 1518 case NVME_OPC_READ: 1519 err = blockif_read(sc->nvstore.ctx, &req->io_req); 1520 break; 1521 case NVME_OPC_WRITE: 1522 err = blockif_write(sc->nvstore.ctx, &req->io_req); 1523 break; 1524 default: 1525 WPRINTF(("%s unhandled io command 0x%x\r\n", 1526 __func__, cmd->opc)); 1527 err = 1; 1528 } 1529 1530 do_error: 1531 if (err) { 1532 uint16_t status; 1533 1534 pci_nvme_status_genc(&status, 1535 NVME_SC_DATA_TRANSFER_ERROR); 1536 1537 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1538 status, 1); 1539 pci_nvme_release_ioreq(sc, req); 1540 } 1541 } 1542 1543 atomic_store_short(&sq->head, sqhead); 1544 atomic_store_int(&sq->busy, 0); 1545 } 1546 1547 static void 1548 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 1549 uint64_t idx, int is_sq, uint64_t value) 1550 { 1551 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", 1552 idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); 1553 1554 if (is_sq) { 1555 atomic_store_short(&sc->submit_queues[idx].tail, 1556 (uint16_t)value); 1557 1558 if (idx == 0) { 1559 pci_nvme_handle_admin_cmd(sc, value); 1560 } else { 1561 /* submission queue; handle new entries in SQ */ 1562 if (idx > sc->num_squeues) { 1563 WPRINTF(("%s SQ index %lu overflow from " 1564 "guest (max %u)\r\n", 1565 __func__, idx, sc->num_squeues)); 1566 return; 1567 } 1568 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1569 } 1570 } else { 1571 if (idx > sc->num_cqueues) { 1572 WPRINTF(("%s queue index %lu overflow from " 1573 "guest (max %u)\r\n", 1574 __func__, idx, sc->num_cqueues)); 1575 return; 1576 } 1577 1578 sc->compl_queues[idx].head = (uint16_t)value; 1579 } 1580 } 1581 1582 static void 1583 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1584 { 1585 const char *s = iswrite ? "WRITE" : "READ"; 1586 1587 switch (offset) { 1588 case NVME_CR_CAP_LOW: 1589 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); 1590 break; 1591 case NVME_CR_CAP_HI: 1592 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); 1593 break; 1594 case NVME_CR_VS: 1595 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); 1596 break; 1597 case NVME_CR_INTMS: 1598 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); 1599 break; 1600 case NVME_CR_INTMC: 1601 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); 1602 break; 1603 case NVME_CR_CC: 1604 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); 1605 break; 1606 case NVME_CR_CSTS: 1607 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); 1608 break; 1609 case NVME_CR_NSSR: 1610 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); 1611 break; 1612 case NVME_CR_AQA: 1613 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); 1614 break; 1615 case NVME_CR_ASQ_LOW: 1616 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); 1617 break; 1618 case NVME_CR_ASQ_HI: 1619 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); 1620 break; 1621 case NVME_CR_ACQ_LOW: 1622 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); 1623 break; 1624 case NVME_CR_ACQ_HI: 1625 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); 1626 break; 1627 default: 1628 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); 1629 } 1630 1631 } 1632 1633 static void 1634 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 1635 uint64_t offset, int size, uint64_t value) 1636 { 1637 uint32_t ccreg; 1638 1639 if (offset >= NVME_DOORBELL_OFFSET) { 1640 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 1641 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 1642 int is_sq = (belloffset % 8) < 4; 1643 1644 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 1645 WPRINTF(("guest attempted an overflow write offset " 1646 "0x%lx, val 0x%lx in %s", 1647 offset, value, __func__)); 1648 return; 1649 } 1650 1651 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 1652 return; 1653 } 1654 1655 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", 1656 offset, size, value)); 1657 1658 if (size != 4) { 1659 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " 1660 "val 0x%lx) to bar0 in %s", 1661 size, offset, value, __func__)); 1662 /* TODO: shutdown device */ 1663 return; 1664 } 1665 1666 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 1667 1668 pthread_mutex_lock(&sc->mtx); 1669 1670 switch (offset) { 1671 case NVME_CR_CAP_LOW: 1672 case NVME_CR_CAP_HI: 1673 /* readonly */ 1674 break; 1675 case NVME_CR_VS: 1676 /* readonly */ 1677 break; 1678 case NVME_CR_INTMS: 1679 /* MSI-X, so ignore */ 1680 break; 1681 case NVME_CR_INTMC: 1682 /* MSI-X, so ignore */ 1683 break; 1684 
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		         "iocqes %u\r\n",
		        __func__,
			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
			 NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n",
		         __func__, offset, value, size));
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
		int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		         " value 0x%lx\r\n", baridx, offset, size, value));

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n",
		         __func__, baridx, value));
	}
}

static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset));
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF((" 
nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", 1808 offset, size, (uint32_t)value)); 1809 1810 return (value); 1811 } 1812 1813 1814 1815 static uint64_t 1816 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 1817 uint64_t offset, int size) 1818 { 1819 struct pci_nvme_softc* sc = pi->pi_arg; 1820 1821 if (baridx == pci_msix_table_bar(pi) || 1822 baridx == pci_msix_pba_bar(pi)) { 1823 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", 1824 baridx, offset, size)); 1825 1826 return pci_emul_msix_tread(pi, offset, size); 1827 } 1828 1829 switch (baridx) { 1830 case 0: 1831 return pci_nvme_read_bar_0(sc, offset, size); 1832 1833 default: 1834 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); 1835 } 1836 1837 return (0); 1838 } 1839 1840 1841 static int 1842 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 1843 { 1844 char bident[sizeof("XX:X:X")]; 1845 char *uopt, *xopts, *config; 1846 uint32_t sectsz; 1847 int optidx; 1848 1849 sc->max_queues = NVME_QUEUES; 1850 sc->max_qentries = NVME_MAX_QENTRIES; 1851 sc->ioslots = NVME_IOSLOTS; 1852 sc->num_squeues = sc->max_queues; 1853 sc->num_cqueues = sc->max_queues; 1854 sectsz = 0; 1855 1856 uopt = strdup(opts); 1857 optidx = 0; 1858 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 1859 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1860 for (xopts = strtok(uopt, ","); 1861 xopts != NULL; 1862 xopts = strtok(NULL, ",")) { 1863 1864 if ((config = strchr(xopts, '=')) != NULL) 1865 *config++ = '\0'; 1866 1867 if (!strcmp("maxq", xopts)) { 1868 sc->max_queues = atoi(config); 1869 } else if (!strcmp("qsz", xopts)) { 1870 sc->max_qentries = atoi(config); 1871 } else if (!strcmp("ioslots", xopts)) { 1872 sc->ioslots = atoi(config); 1873 } else if (!strcmp("sectsz", xopts)) { 1874 sectsz = atoi(config); 1875 } else if (!strcmp("ser", xopts)) { 1876 /* 1877 * This field indicates the Product Serial Number in 1878 * 7-bit ASCII, unused bytes should be space characters. 1879 * Ref: NVMe v1.3c. 
1880 */ 1881 cpywithpad((char *)sc->ctrldata.sn, 1882 sizeof(sc->ctrldata.sn), config, ' '); 1883 } else if (!strcmp("ram", xopts)) { 1884 uint64_t sz = strtoull(&xopts[4], NULL, 10); 1885 1886 sc->nvstore.type = NVME_STOR_RAM; 1887 sc->nvstore.size = sz * 1024 * 1024; 1888 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1889 sc->nvstore.sectsz = 4096; 1890 sc->nvstore.sectsz_bits = 12; 1891 if (sc->nvstore.ctx == NULL) { 1892 perror("Unable to allocate RAM"); 1893 free(uopt); 1894 return (-1); 1895 } 1896 } else if (!strcmp("eui64", xopts)) { 1897 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 1898 } else if (optidx == 0) { 1899 snprintf(bident, sizeof(bident), "%d:%d", 1900 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1901 sc->nvstore.ctx = blockif_open(xopts, bident); 1902 if (sc->nvstore.ctx == NULL) { 1903 perror("Could not open backing file"); 1904 free(uopt); 1905 return (-1); 1906 } 1907 sc->nvstore.type = NVME_STOR_BLOCKIF; 1908 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 1909 } else { 1910 fprintf(stderr, "Invalid option %s\n", xopts); 1911 free(uopt); 1912 return (-1); 1913 } 1914 1915 optidx++; 1916 } 1917 free(uopt); 1918 1919 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 1920 fprintf(stderr, "backing store not specified\n"); 1921 return (-1); 1922 } 1923 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 1924 sc->nvstore.sectsz = sectsz; 1925 else if (sc->nvstore.type != NVME_STOR_RAM) 1926 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 1927 for (sc->nvstore.sectsz_bits = 9; 1928 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 1929 sc->nvstore.sectsz_bits++); 1930 1931 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 1932 sc->max_queues = NVME_QUEUES; 1933 1934 if (sc->max_qentries <= 0) { 1935 fprintf(stderr, "Invalid qsz option\n"); 1936 return (-1); 1937 } 1938 if (sc->ioslots <= 0) { 1939 fprintf(stderr, "Invalid ioslots option\n"); 1940 return (-1); 1941 } 1942 1943 return (0); 1944 } 1945 1946 static int 1947 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 1948 { 1949 struct pci_nvme_softc *sc; 1950 uint32_t pci_membar_sz; 1951 int error; 1952 1953 error = 0; 1954 1955 sc = calloc(1, sizeof(struct pci_nvme_softc)); 1956 pi->pi_arg = sc; 1957 sc->nsc_pi = pi; 1958 1959 error = pci_nvme_parse_opts(sc, opts); 1960 if (error < 0) 1961 goto done; 1962 else 1963 error = 0; 1964 1965 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 1966 for (int i = 0; i < sc->ioslots; i++) { 1967 if (i < (sc->ioslots-1)) 1968 sc->ioreqs[i].next = &sc->ioreqs[i+1]; 1969 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); 1970 pthread_cond_init(&sc->ioreqs[i].cv, NULL); 1971 } 1972 sc->ioreqs_free = sc->ioreqs; 1973 sc->intr_coales_aggr_thresh = 1; 1974 1975 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 1976 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 1977 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 1978 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 1979 pci_set_cfgdata8(pi, PCIR_PROGIF, 1980 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 1981 1982 /* 1983 * Allocate size of NVMe registers + doorbell space for all queues. 1984 * 1985 * The specification requires a minimum memory I/O window size of 16K. 1986 * The Windows driver will refuse to start a device with a smaller 1987 * window. 
1988 */ 1989 pci_membar_sz = sizeof(struct nvme_registers) + 1990 2 * sizeof(uint32_t) * (sc->max_queues + 1); 1991 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 1992 1993 DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); 1994 1995 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 1996 if (error) { 1997 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); 1998 goto done; 1999 } 2000 2001 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2002 if (error) { 2003 WPRINTF(("%s pci add msixcap failed\r\n", __func__)); 2004 goto done; 2005 } 2006 2007 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2008 if (error) { 2009 WPRINTF(("%s pci add Express capability failed\r\n", __func__)); 2010 goto done; 2011 } 2012 2013 pthread_mutex_init(&sc->mtx, NULL); 2014 sem_init(&sc->iosemlock, 0, sc->ioslots); 2015 2016 pci_nvme_reset(sc); 2017 pci_nvme_init_ctrldata(sc); 2018 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64); 2019 pci_nvme_init_logpages(sc); 2020 2021 pci_lintr_request(pi); 2022 2023 done: 2024 return (error); 2025 } 2026 2027 2028 struct pci_devemu pci_de_nvme = { 2029 .pe_emu = "nvme", 2030 .pe_init = pci_nvme_init, 2031 .pe_barwrite = pci_nvme_write, 2032 .pe_barread = pci_nvme_read 2033 }; 2034 PCI_EMUL_SET(pci_de_nvme); 2035
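
/*
 * Example invocations for the options documented at the top of this file
 * (illustrative only; the device path and sizes are placeholders for
 * whatever backing store is actually configured):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/pool/nvmedisk,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001 ...
 *   bhyve ... -s 4,nvme,ram=1024 ...
 */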