1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * 7 * Function crc16 Copyright (c) 2017, Fedor Uporov 8 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * bhyve PCIe-NVMe device emulation. 
34 * 35 * options: 36 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=# 37 * 38 * accepted devpath: 39 * /dev/blockdev 40 * /path/to/image 41 * ram=size_in_MiB 42 * 43 * maxq = max number of queues 44 * qsz = max elements in each queue 45 * ioslots = max number of concurrent io requests 46 * sectsz = sector size (defaults to blockif sector size) 47 * ser = serial number (20-chars max) 48 * eui64 = IEEE Extended Unique Identifier (8 byte value) 49 * 50 */ 51 52 /* TODO: 53 - create async event for smart and log 54 - intr coalesce 55 */ 56 57 #include <sys/cdefs.h> 58 __FBSDID("$FreeBSD$"); 59 60 #include <sys/types.h> 61 #include <net/ieee_oui.h> 62 #ifndef __FreeBSD__ 63 #include <endian.h> 64 #endif 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <semaphore.h> 69 #include <stdbool.h> 70 #include <stddef.h> 71 #include <stdint.h> 72 #include <stdio.h> 73 #include <stdlib.h> 74 #include <string.h> 75 76 #include <machine/atomic.h> 77 #include <machine/vmm.h> 78 #include <vmmapi.h> 79 80 #include <dev/nvme/nvme.h> 81 82 #include "bhyverun.h" 83 #include "block_if.h" 84 #include "pci_emul.h" 85 86 87 static int nvme_debug = 0; 88 #define DPRINTF(params) if (nvme_debug) printf params 89 #define WPRINTF(params) printf params 90 91 /* defaults; can be overridden */ 92 #define NVME_MSIX_BAR 4 93 94 #define NVME_IOSLOTS 8 95 96 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 97 #define NVME_MMIO_SPACE_MIN (1 << 14) 98 99 #define NVME_QUEUES 16 100 #define NVME_MAX_QENTRIES 2048 101 102 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 103 #define NVME_MAX_BLOCKIOVS 512 104 105 /* This is a synthetic status code to indicate there is no status */ 106 #define NVME_NO_STATUS 0xffff 107 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 108 109 /* helpers */ 110 111 /* Convert a zero-based value into a one-based value */ 112 #define ONE_BASED(zero) ((zero) + 1) 113 /* Convert a one-based value into a zero-based value 
*/ 114 #define ZERO_BASED(one) ((one) - 1) 115 116 /* Encode number of SQ's and CQ's for Set/Get Features */ 117 #define NVME_FEATURE_NUM_QUEUES(sc) \ 118 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 119 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 120 121 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 122 123 enum nvme_controller_register_offsets { 124 NVME_CR_CAP_LOW = 0x00, 125 NVME_CR_CAP_HI = 0x04, 126 NVME_CR_VS = 0x08, 127 NVME_CR_INTMS = 0x0c, 128 NVME_CR_INTMC = 0x10, 129 NVME_CR_CC = 0x14, 130 NVME_CR_CSTS = 0x1c, 131 NVME_CR_NSSR = 0x20, 132 NVME_CR_AQA = 0x24, 133 NVME_CR_ASQ_LOW = 0x28, 134 NVME_CR_ASQ_HI = 0x2c, 135 NVME_CR_ACQ_LOW = 0x30, 136 NVME_CR_ACQ_HI = 0x34, 137 }; 138 139 enum nvme_cmd_cdw11 { 140 NVME_CMD_CDW11_PC = 0x0001, 141 NVME_CMD_CDW11_IEN = 0x0002, 142 NVME_CMD_CDW11_IV = 0xFFFF0000, 143 }; 144 145 #define NVME_CQ_INTEN 0x01 146 #define NVME_CQ_INTCOAL 0x02 147 148 struct nvme_completion_queue { 149 struct nvme_completion *qbase; 150 uint32_t size; 151 uint16_t tail; /* nvme progress */ 152 uint16_t head; /* guest progress */ 153 uint16_t intr_vec; 154 uint32_t intr_en; 155 pthread_mutex_t mtx; 156 }; 157 158 struct nvme_submission_queue { 159 struct nvme_command *qbase; 160 uint32_t size; 161 uint16_t head; /* nvme progress */ 162 uint16_t tail; /* guest progress */ 163 uint16_t cqid; /* completion queue id */ 164 int busy; /* queue is being processed */ 165 int qpriority; 166 }; 167 168 enum nvme_storage_type { 169 NVME_STOR_BLOCKIF = 0, 170 NVME_STOR_RAM = 1, 171 }; 172 173 struct pci_nvme_blockstore { 174 enum nvme_storage_type type; 175 void *ctx; 176 uint64_t size; 177 uint32_t sectsz; 178 uint32_t sectsz_bits; 179 uint64_t eui64; 180 }; 181 182 struct pci_nvme_ioreq { 183 struct pci_nvme_softc *sc; 184 struct pci_nvme_ioreq *next; 185 struct nvme_submission_queue *nvme_sq; 186 uint16_t sqid; 187 188 /* command information */ 189 uint16_t opc; 190 uint16_t cid; 191 uint32_t nsid; 192 193 uint64_t 
prev_gpaddr; 194 size_t prev_size; 195 196 /* 197 * lock if all iovs consumed (big IO); 198 * complete transaction before continuing 199 */ 200 pthread_mutex_t mtx; 201 pthread_cond_t cv; 202 203 struct blockif_req io_req; 204 205 /* pad to fit up to 512 page descriptors from guest IO request */ 206 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; 207 }; 208 209 struct pci_nvme_softc { 210 struct pci_devinst *nsc_pi; 211 212 pthread_mutex_t mtx; 213 214 struct nvme_registers regs; 215 216 struct nvme_namespace_data nsdata; 217 struct nvme_controller_data ctrldata; 218 struct nvme_error_information_entry err_log; 219 struct nvme_health_information_page health_log; 220 struct nvme_firmware_page fw_log; 221 222 struct pci_nvme_blockstore nvstore; 223 224 uint16_t max_qentries; /* max entries per queue */ 225 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 226 uint32_t num_cqueues; 227 uint32_t num_squeues; 228 229 struct pci_nvme_ioreq *ioreqs; 230 struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ 231 uint32_t pending_ios; 232 uint32_t ioslots; 233 sem_t iosemlock; 234 235 /* 236 * Memory mapped Submission and Completion queues 237 * Each array includes both Admin and IO queues 238 */ 239 struct nvme_completion_queue *compl_queues; 240 struct nvme_submission_queue *submit_queues; 241 242 /* controller features */ 243 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ 244 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ 245 uint32_t async_ev_config; /* 0x0B: async event config */ 246 }; 247 248 249 static void pci_nvme_io_partial(struct blockif_req *br, int err); 250 251 /* Controller Configuration utils */ 252 #define NVME_CC_GET_EN(cc) \ 253 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 254 #define NVME_CC_GET_CSS(cc) \ 255 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 256 #define NVME_CC_GET_SHN(cc) \ 257 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 258 #define NVME_CC_GET_IOSQES(cc) \ 
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/* CC bits the guest may change while the controller is enabled */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/* CC bits the guest may only change while the controller is not enabled */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

/*
 * Copy src into dst, filling the remainder of dst with 'pad'.
 * Identify strings are space padded rather than NUL terminated, so the
 * result is deliberately NOT a C string.  src longer than dst_size is
 * silently truncated (strnlen bounds the copy).
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

/* Set the Status Code Type and Status Code fields of a completion status */
static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

/* Set a status of type Generic Command Status */
static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/* Set the Phase bit to the opposite of its previous value 'prev' */
static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}

/* Populate the Identify Controller data structure reported to the guest */
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab   = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;	/* NVMe 1.3 */

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;	/* abort command limit, 0's based */
	cd->aerl = 4;	/* async event request limit, 0's based */

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states support */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	/* SQ entries are 2^6 = 64 bytes, CQ entries 2^4 = 16 bytes */
	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->fna = 0x03;

	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

/* Populate the Identify Namespace data structure for namespace 'nsid' */
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    uint64_t eui64)
{

	/* namespace size/capacity/utilization, in logical blocks */
	nd->nsze = sc->nvstore.size / sc->nvstore.sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	/* Get LBA and backstore information from backing store */
	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (eui64 == 0) {
		char *data = NULL;

		/* derive a stable value from the VM name and PCI address */
		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

/* Clear all log pages; no persistent log content is maintained */
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

/*
 * Reset the controller register state and all I/O queues.
 * Caller must hold sc->mtx (see pci_nvme_reset()).
 */
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF(("%s\r\n", __func__));

	/* MQES (0's based), contiguous queues required, 30 sec timeout */
	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

476 sc->num_cqueues = sc->num_squeues = sc->max_queues; 477 if (sc->submit_queues != NULL) { 478 for (int i = 0; i < sc->num_squeues + 1; i++) { 479 /* 480 * The Admin Submission Queue is at index 0. 481 * It must not be changed at reset otherwise the 482 * emulation will be out of sync with the guest. 483 */ 484 if (i != 0) { 485 sc->submit_queues[i].qbase = NULL; 486 sc->submit_queues[i].size = 0; 487 sc->submit_queues[i].cqid = 0; 488 } 489 sc->submit_queues[i].tail = 0; 490 sc->submit_queues[i].head = 0; 491 sc->submit_queues[i].busy = 0; 492 } 493 } else 494 sc->submit_queues = calloc(sc->num_squeues + 1, 495 sizeof(struct nvme_submission_queue)); 496 497 if (sc->compl_queues != NULL) { 498 for (int i = 0; i < sc->num_cqueues + 1; i++) { 499 /* See Admin Submission Queue note above */ 500 if (i != 0) { 501 sc->compl_queues[i].qbase = NULL; 502 sc->compl_queues[i].size = 0; 503 } 504 505 sc->compl_queues[i].tail = 0; 506 sc->compl_queues[i].head = 0; 507 } 508 } else { 509 sc->compl_queues = calloc(sc->num_cqueues + 1, 510 sizeof(struct nvme_completion_queue)); 511 512 for (int i = 0; i < sc->num_cqueues + 1; i++) 513 pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); 514 } 515 } 516 517 static void 518 pci_nvme_reset(struct pci_nvme_softc *sc) 519 { 520 pthread_mutex_lock(&sc->mtx); 521 pci_nvme_reset_locked(sc); 522 pthread_mutex_unlock(&sc->mtx); 523 } 524 525 static void 526 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 527 { 528 uint16_t acqs, asqs; 529 530 DPRINTF(("%s\r\n", __func__)); 531 532 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 533 sc->submit_queues[0].size = asqs; 534 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 535 sizeof(struct nvme_command) * asqs); 536 537 DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", 538 __func__, sc->regs.asq, sc->submit_queues[0].qbase)); 539 540 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 541 NVME_AQA_REG_ACQS_MASK) + 1; 542 sc->compl_queues[0].size = 
acqs; 543 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 544 sizeof(struct nvme_completion) * acqs); 545 DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", 546 __func__, sc->regs.acq, sc->compl_queues[0].qbase)); 547 } 548 549 static int 550 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src, 551 size_t len) 552 { 553 uint8_t *dst; 554 size_t bytes; 555 556 if (len > (8 * 1024)) { 557 return (-1); 558 } 559 560 /* Copy from the start of prp1 to the end of the physical page */ 561 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 562 bytes = MIN(bytes, len); 563 564 dst = vm_map_gpa(ctx, prp1, bytes); 565 if (dst == NULL) { 566 return (-1); 567 } 568 569 memcpy(dst, src, bytes); 570 571 src += bytes; 572 573 len -= bytes; 574 if (len == 0) { 575 return (0); 576 } 577 578 len = MIN(len, PAGE_SIZE); 579 580 dst = vm_map_gpa(ctx, prp2, len); 581 if (dst == NULL) { 582 return (-1); 583 } 584 585 memcpy(dst, src, len); 586 587 return (0); 588 } 589 590 static int 591 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 592 struct nvme_completion* compl) 593 { 594 uint16_t qid = command->cdw10 & 0xffff; 595 596 DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); 597 if (qid == 0 || qid > sc->num_squeues) { 598 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", 599 __func__, qid, sc->num_squeues)); 600 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 601 NVME_SC_INVALID_QUEUE_IDENTIFIER); 602 return (1); 603 } 604 605 sc->submit_queues[qid].qbase = NULL; 606 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 607 return (1); 608 } 609 610 static int 611 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 612 struct nvme_completion* compl) 613 { 614 if (command->cdw11 & NVME_CMD_CDW11_PC) { 615 uint16_t qid = command->cdw10 & 0xffff; 616 struct nvme_submission_queue *nsq; 617 618 if ((qid == 0) || (qid > sc->num_squeues)) { 619 WPRINTF(("%s queue index %u > 
num_squeues %u\r\n", 620 __func__, qid, sc->num_squeues)); 621 pci_nvme_status_tc(&compl->status, 622 NVME_SCT_COMMAND_SPECIFIC, 623 NVME_SC_INVALID_QUEUE_IDENTIFIER); 624 return (1); 625 } 626 627 nsq = &sc->submit_queues[qid]; 628 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 629 630 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 631 sizeof(struct nvme_command) * (size_t)nsq->size); 632 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 633 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 634 635 DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, 636 qid, nsq->size, nsq->qbase, nsq->cqid)); 637 638 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 639 640 DPRINTF(("%s completed creating IOSQ qid %u\r\n", 641 __func__, qid)); 642 } else { 643 /* 644 * Guest sent non-cont submission queue request. 645 * This setting is unsupported by this emulation. 646 */ 647 WPRINTF(("%s unsupported non-contig (list-based) " 648 "create i/o submission queue\r\n", __func__)); 649 650 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 651 } 652 return (1); 653 } 654 655 static int 656 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 657 struct nvme_completion* compl) 658 { 659 uint16_t qid = command->cdw10 & 0xffff; 660 661 DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid)); 662 if (qid == 0 || qid > sc->num_cqueues) { 663 WPRINTF(("%s queue index %u / num_cqueues %u\r\n", 664 __func__, qid, sc->num_cqueues)); 665 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 666 NVME_SC_INVALID_QUEUE_IDENTIFIER); 667 return (1); 668 } 669 670 sc->compl_queues[qid].qbase = NULL; 671 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 672 return (1); 673 } 674 675 static int 676 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 677 struct nvme_completion* compl) 678 { 679 if (command->cdw11 & NVME_CMD_CDW11_PC) { 680 uint16_t qid = command->cdw10 & 0xffff; 681 struct 
nvme_completion_queue *ncq; 682 683 if ((qid == 0) || (qid > sc->num_cqueues)) { 684 WPRINTF(("%s queue index %u > num_cqueues %u\r\n", 685 __func__, qid, sc->num_cqueues)); 686 pci_nvme_status_tc(&compl->status, 687 NVME_SCT_COMMAND_SPECIFIC, 688 NVME_SC_INVALID_QUEUE_IDENTIFIER); 689 return (1); 690 } 691 692 ncq = &sc->compl_queues[qid]; 693 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 694 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 695 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 696 697 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 698 command->prp1, 699 sizeof(struct nvme_command) * (size_t)ncq->size); 700 701 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 702 } else { 703 /* 704 * Non-contig completion queue unsupported. 705 */ 706 WPRINTF(("%s unsupported non-contig (list-based) " 707 "create i/o completion queue\r\n", 708 __func__)); 709 710 /* 0x12 = Invalid Use of Controller Memory Buffer */ 711 pci_nvme_status_genc(&compl->status, 0x12); 712 } 713 714 return (1); 715 } 716 717 static int 718 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 719 struct nvme_completion* compl) 720 { 721 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 722 uint8_t logpage = command->cdw10 & 0xFF; 723 724 DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); 725 726 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 727 728 switch (logpage) { 729 case NVME_LOG_ERROR: 730 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 731 command->prp2, (uint8_t *)&sc->err_log, logsize); 732 break; 733 case NVME_LOG_HEALTH_INFORMATION: 734 /* TODO: present some smart info */ 735 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 736 command->prp2, (uint8_t *)&sc->health_log, logsize); 737 break; 738 case NVME_LOG_FIRMWARE_SLOT: 739 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 740 command->prp2, (uint8_t *)&sc->fw_log, logsize); 741 break; 742 default: 743 WPRINTF(("%s get log 
page %x command not supported\r\n", 744 __func__, logpage)); 745 746 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 747 NVME_SC_INVALID_LOG_PAGE); 748 } 749 750 return (1); 751 } 752 753 static int 754 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 755 struct nvme_completion* compl) 756 { 757 void *dest; 758 759 DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, 760 command->cdw10 & 0xFF, command->nsid)); 761 762 switch (command->cdw10 & 0xFF) { 763 case 0x00: /* return Identify Namespace data structure */ 764 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 765 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); 766 break; 767 case 0x01: /* return Identify Controller data structure */ 768 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 769 command->prp2, (uint8_t *)&sc->ctrldata, 770 sizeof(sc->ctrldata)); 771 break; 772 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 773 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 774 sizeof(uint32_t) * 1024); 775 ((uint32_t *)dest)[0] = 1; 776 ((uint32_t *)dest)[1] = 0; 777 break; 778 case 0x11: 779 pci_nvme_status_genc(&compl->status, 780 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 781 return (1); 782 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 783 case 0x10: 784 case 0x12: 785 case 0x13: 786 case 0x14: 787 case 0x15: 788 default: 789 DPRINTF(("%s unsupported identify command requested 0x%x\r\n", 790 __func__, command->cdw10 & 0xFF)); 791 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 792 return (1); 793 } 794 795 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 796 return (1); 797 } 798 799 static int 800 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 801 struct nvme_completion* compl) 802 { 803 uint16_t nqr; /* Number of Queues Requested */ 804 805 nqr = command->cdw11 & 0xFFFF; 806 if (nqr == 0xffff) { 807 WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); 808 
pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 809 return (-1); 810 } 811 812 sc->num_squeues = ONE_BASED(nqr); 813 if (sc->num_squeues > sc->max_queues) { 814 DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, 815 sc->max_queues)); 816 sc->num_squeues = sc->max_queues; 817 } 818 819 nqr = (command->cdw11 >> 16) & 0xFFFF; 820 if (nqr == 0xffff) { 821 WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); 822 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 823 return (-1); 824 } 825 826 sc->num_cqueues = ONE_BASED(nqr); 827 if (sc->num_cqueues > sc->max_queues) { 828 DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, 829 sc->max_queues)); 830 sc->num_cqueues = sc->max_queues; 831 } 832 833 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 834 835 return (0); 836 } 837 838 static int 839 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 840 struct nvme_completion* compl) 841 { 842 int feature = command->cdw10 & 0xFF; 843 uint32_t iv; 844 845 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 846 compl->cdw0 = 0; 847 848 switch (feature) { 849 case NVME_FEAT_ARBITRATION: 850 DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); 851 break; 852 case NVME_FEAT_POWER_MANAGEMENT: 853 DPRINTF((" power management 0x%x\r\n", command->cdw11)); 854 break; 855 case NVME_FEAT_LBA_RANGE_TYPE: 856 DPRINTF((" lba range 0x%x\r\n", command->cdw11)); 857 break; 858 case NVME_FEAT_TEMPERATURE_THRESHOLD: 859 DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); 860 break; 861 case NVME_FEAT_ERROR_RECOVERY: 862 DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); 863 break; 864 case NVME_FEAT_VOLATILE_WRITE_CACHE: 865 DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); 866 break; 867 case NVME_FEAT_NUMBER_OF_QUEUES: 868 nvme_set_feature_queues(sc, command, compl); 869 break; 870 case NVME_FEAT_INTERRUPT_COALESCING: 871 DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); 872 873 /* in uS 
*/ 874 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 875 876 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 877 break; 878 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 879 iv = command->cdw11 & 0xFFFF; 880 881 DPRINTF((" interrupt vector configuration 0x%x\r\n", 882 command->cdw11)); 883 884 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { 885 if (sc->compl_queues[i].intr_vec == iv) { 886 if (command->cdw11 & (1 << 16)) 887 sc->compl_queues[i].intr_en |= 888 NVME_CQ_INTCOAL; 889 else 890 sc->compl_queues[i].intr_en &= 891 ~NVME_CQ_INTCOAL; 892 } 893 } 894 break; 895 case NVME_FEAT_WRITE_ATOMICITY: 896 DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); 897 break; 898 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 899 DPRINTF((" async event configuration 0x%x\r\n", 900 command->cdw11)); 901 sc->async_ev_config = command->cdw11; 902 break; 903 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 904 DPRINTF((" software progress marker 0x%x\r\n", 905 command->cdw11)); 906 break; 907 case 0x0C: 908 DPRINTF((" autonomous power state transition 0x%x\r\n", 909 command->cdw11)); 910 break; 911 default: 912 WPRINTF(("%s invalid feature\r\n", __func__)); 913 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 914 return (1); 915 } 916 917 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 918 return (1); 919 } 920 921 static int 922 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 923 struct nvme_completion* compl) 924 { 925 int feature = command->cdw10 & 0xFF; 926 927 DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); 928 929 compl->cdw0 = 0; 930 931 switch (feature) { 932 case NVME_FEAT_ARBITRATION: 933 DPRINTF((" arbitration\r\n")); 934 break; 935 case NVME_FEAT_POWER_MANAGEMENT: 936 DPRINTF((" power management\r\n")); 937 break; 938 case NVME_FEAT_LBA_RANGE_TYPE: 939 DPRINTF((" lba range\r\n")); 940 break; 941 case NVME_FEAT_TEMPERATURE_THRESHOLD: 942 DPRINTF((" temperature threshold\r\n")); 943 switch 
((command->cdw11 >> 20) & 0x3) { 944 case 0: 945 /* Over temp threshold */ 946 compl->cdw0 = 0xFFFF; 947 break; 948 case 1: 949 /* Under temp threshold */ 950 compl->cdw0 = 0; 951 break; 952 default: 953 WPRINTF((" invalid threshold type select\r\n")); 954 pci_nvme_status_genc(&compl->status, 955 NVME_SC_INVALID_FIELD); 956 return (1); 957 } 958 break; 959 case NVME_FEAT_ERROR_RECOVERY: 960 DPRINTF((" error recovery\r\n")); 961 break; 962 case NVME_FEAT_VOLATILE_WRITE_CACHE: 963 DPRINTF((" volatile write cache\r\n")); 964 break; 965 case NVME_FEAT_NUMBER_OF_QUEUES: 966 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 967 968 DPRINTF((" number of queues (submit %u, completion %u)\r\n", 969 compl->cdw0 & 0xFFFF, 970 (compl->cdw0 >> 16) & 0xFFFF)); 971 972 break; 973 case NVME_FEAT_INTERRUPT_COALESCING: 974 DPRINTF((" interrupt coalescing\r\n")); 975 break; 976 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 977 DPRINTF((" interrupt vector configuration\r\n")); 978 break; 979 case NVME_FEAT_WRITE_ATOMICITY: 980 DPRINTF((" write atomicity\r\n")); 981 break; 982 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 983 DPRINTF((" async event configuration\r\n")); 984 sc->async_ev_config = command->cdw11; 985 break; 986 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 987 DPRINTF((" software progress marker\r\n")); 988 break; 989 case 0x0C: 990 DPRINTF((" autonomous power state transition\r\n")); 991 break; 992 default: 993 WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); 994 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 995 return (1); 996 } 997 998 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 999 return (1); 1000 } 1001 1002 static int 1003 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1004 struct nvme_completion* compl) 1005 { 1006 DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, 1007 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); 1008 1009 /* TODO: search for the command ID and abort it */ 1010 1011 
	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

#ifdef __FreeBSD__
/*
 * Asynchronous Event Request (admin).  No events are generated yet; the
 * request is failed with the request-limit-exceeded status.  Returns 0
 * (no interrupt) because the completion is not posted.
 */
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11));

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (0);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif /* __FreeBSD__ */

/*
 * Process commands on the Admin Submission Queue (queue 0) after a
 * doorbell write.  'value' is the new SQ tail written by the guest.
 * Each handler returns non-zero when a completion was posted and an
 * interrupt should be raised.
 */
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	int do_intr = 0;
	uint16_t sqhead;

	DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value));

	sq = &sc->submit_queues[0];

	sqhead = atomic_load_acq_short(&sq->head);

	/* only one thread may drain the admin queue at a time */
	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s SQ busy, head %u, tail %u\r\n",
		    __func__, sqhead, sq->tail));
		return;
	}

	DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail));

	/* consume entries until head catches up with the guest's tail */
	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__));
			do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__));
			do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__));
			do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__));
			do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__));
			do_intr |= nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF(("%s command IDENTIFY\r\n", __func__));
			do_intr |= nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF(("%s command ABORT\r\n", __func__));
			do_intr |= nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF(("%s command SET_FEATURES\r\n", __func__));
			do_intr |= nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF(("%s command GET_FEATURES\r\n", __func__));
			do_intr |= nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__));
			/* XXX dont care, unhandled for now
			do_intr |= nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			/* NVME_NO_STATUS: do not post a completion entry */
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF(("0x%x command is not implemented\r\n",
			    cmd->opc));
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
			do_intr |= 1;
		}

		if (NVME_COMPLETION_VALID(compl)) {
			struct nvme_completion *cp;
			int phase;

			cq = &sc->compl_queues[0];

			cp = &(cq->qbase)[cq->tail];
			cp->cdw0 = compl.cdw0;
			cp->sqid = 0;
			cp->sqhd = sqhead;
			cp->cid = cmd->cid;

			/*
			 * Toggle the phase bit relative to the entry's
			 * previous value so the guest can detect new
			 * completions.
			 */
			phase = NVME_STATUS_GET_P(cp->status);
			cp->status = compl.status;
			pci_nvme_toggle_phase(&cp->status, phase);

			cq->tail = (cq->tail + 1) % cq->size;
		}
		sqhead = (sqhead + 1) % sq->size;
	}

	DPRINTF(("setting sqhead %u\r\n", sqhead));
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);

	/* admin completions always use MSI-X vector 0 */
	if (do_intr)
		pci_generate_msix(sc->nsc_pi, 0);

}

/*
 * Append one guest page to the blockif request being assembled in
 * 'req'.  Contiguous guest pages are merged into a single iov.  When
 * the iov list fills (NVME_MAX_BLOCKIOVS), the partial request is
 * submitted and this thread blocks until it completes before reusing
 * the iov array.
 */
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req != NULL) {
		/* concatenate contig block-iovs to minimize number of iovs */
		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
			iovidx = req->io_req.br_iovcnt - 1;

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
					     req->prev_gpaddr, size);

			req->prev_size += size;
			req->io_req.br_resid += size;

			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
		} else {
			pthread_mutex_lock(&req->mtx);

			iovidx = req->io_req.br_iovcnt;
			if (iovidx == NVME_MAX_BLOCKIOVS) {
				int err = 0;

				DPRINTF(("large I/O, doing partial req\r\n"));

				iovidx = 0;
				req->io_req.br_iovcnt = 0;

				req->io_req.br_callback = pci_nvme_io_partial;

				if (!do_write)
					err = blockif_read(sc->nvstore.ctx,
					    &req->io_req);
				else
					err = blockif_write(sc->nvstore.ctx,
					    &req->io_req);

				/* wait until req completes before cont */
				if (err == 0)
					pthread_cond_wait(&req->cv, &req->mtx);
			}
			if (iovidx == 0) {
				req->io_req.br_offset = lba;
				req->io_req.br_resid = 0;
				req->io_req.br_param = req;
			}

			req->io_req.br_iov[iovidx].iov_base =
			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
					     gpaddr, size);

			req->io_req.br_iov[iovidx].iov_len = size;

			req->prev_gpaddr = gpaddr;
			req->prev_size = size;
			req->io_req.br_resid += size;

			req->io_req.br_iovcnt++;

			pthread_mutex_unlock(&req->mtx);
		}
	} else {
		/* RAM buffer:
read/write directly */ 1212 void *p = sc->nvstore.ctx; 1213 void *gptr; 1214 1215 if ((lba + size) > sc->nvstore.size) { 1216 WPRINTF(("%s write would overflow RAM\r\n", __func__)); 1217 return (-1); 1218 } 1219 1220 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1221 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1222 if (do_write) 1223 memcpy(p, gptr, size); 1224 else 1225 memcpy(gptr, p, size); 1226 } 1227 return (0); 1228 } 1229 1230 static void 1231 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1232 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1233 uint32_t cdw0, uint16_t status, int ignore_busy) 1234 { 1235 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1236 struct nvme_completion *compl; 1237 int do_intr = 0; 1238 int phase; 1239 1240 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", 1241 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1242 NVME_STATUS_GET_SC(status))); 1243 1244 pthread_mutex_lock(&cq->mtx); 1245 1246 assert(cq->qbase != NULL); 1247 1248 compl = &cq->qbase[cq->tail]; 1249 1250 compl->sqhd = atomic_load_acq_short(&sq->head); 1251 compl->sqid = sqid; 1252 compl->cid = cid; 1253 1254 // toggle phase 1255 phase = NVME_STATUS_GET_P(compl->status); 1256 compl->status = status; 1257 pci_nvme_toggle_phase(&compl->status, phase); 1258 1259 cq->tail = (cq->tail + 1) % cq->size; 1260 1261 if (cq->intr_en & NVME_CQ_INTEN) 1262 do_intr = 1; 1263 1264 pthread_mutex_unlock(&cq->mtx); 1265 1266 if (ignore_busy || !atomic_load_acq_int(&sq->busy)) 1267 if (do_intr) 1268 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1269 } 1270 1271 static void 1272 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1273 { 1274 req->sc = NULL; 1275 req->nvme_sq = NULL; 1276 req->sqid = 0; 1277 1278 pthread_mutex_lock(&sc->mtx); 1279 1280 req->next = sc->ioreqs_free; 1281 sc->ioreqs_free = req; 1282 sc->pending_ios--; 1283 1284 /* when no more IO pending, can set to ready if device 
reset/enabled */ 1285 if (sc->pending_ios == 0 && 1286 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1287 sc->regs.csts |= NVME_CSTS_RDY; 1288 1289 pthread_mutex_unlock(&sc->mtx); 1290 1291 sem_post(&sc->iosemlock); 1292 } 1293 1294 static struct pci_nvme_ioreq * 1295 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1296 { 1297 struct pci_nvme_ioreq *req = NULL;; 1298 1299 sem_wait(&sc->iosemlock); 1300 pthread_mutex_lock(&sc->mtx); 1301 1302 req = sc->ioreqs_free; 1303 assert(req != NULL); 1304 1305 sc->ioreqs_free = req->next; 1306 1307 req->next = NULL; 1308 req->sc = sc; 1309 1310 sc->pending_ios++; 1311 1312 pthread_mutex_unlock(&sc->mtx); 1313 1314 req->io_req.br_iovcnt = 0; 1315 req->io_req.br_offset = 0; 1316 req->io_req.br_resid = 0; 1317 req->io_req.br_param = req; 1318 req->prev_gpaddr = 0; 1319 req->prev_size = 0; 1320 1321 return req; 1322 } 1323 1324 static void 1325 pci_nvme_io_done(struct blockif_req *br, int err) 1326 { 1327 struct pci_nvme_ioreq *req = br->br_param; 1328 struct nvme_submission_queue *sq = req->nvme_sq; 1329 uint16_t code, status = 0; 1330 1331 DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); 1332 1333 /* TODO return correct error */ 1334 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1335 pci_nvme_status_genc(&status, code); 1336 1337 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); 1338 pci_nvme_release_ioreq(req->sc, req); 1339 } 1340 1341 static void 1342 pci_nvme_io_partial(struct blockif_req *br, int err) 1343 { 1344 struct pci_nvme_ioreq *req = br->br_param; 1345 1346 DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); 1347 1348 pthread_cond_signal(&req->cv); 1349 } 1350 1351 1352 static void 1353 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 1354 { 1355 struct nvme_submission_queue *sq; 1356 uint16_t status = 0; 1357 uint16_t sqhead; 1358 int err; 1359 1360 /* handle all submissions up to sq->tail index */ 1361 sq = &sc->submit_queues[idx]; 1362 1363 if (atomic_testandset_int(&sq->busy, 1)) { 1364 DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); 1365 return; 1366 } 1367 1368 sqhead = atomic_load_acq_short(&sq->head); 1369 1370 DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", 1371 idx, sqhead, sq->tail, sq->qbase)); 1372 1373 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1374 struct nvme_command *cmd; 1375 struct pci_nvme_ioreq *req = NULL; 1376 uint64_t lba; 1377 uint64_t nblocks, bytes, size, cpsz; 1378 1379 /* TODO: support scatter gather list handling */ 1380 1381 cmd = &sq->qbase[sqhead]; 1382 sqhead = (sqhead + 1) % sq->size; 1383 1384 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 1385 1386 if (cmd->opc == NVME_OPC_FLUSH) { 1387 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1388 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1389 status, 1); 1390 1391 continue; 1392 } else if (cmd->opc == 0x08) { 1393 /* TODO: write zeroes */ 1394 WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", 1395 __func__, lba, cmd->cdw12 & 0xFFFF)); 1396 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1397 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1398 status, 1); 1399 1400 continue; 1401 } 1402 1403 nblocks = (cmd->cdw12 
& 0xFFFF) + 1; 1404 1405 bytes = nblocks * sc->nvstore.sectsz; 1406 1407 if (sc->nvstore.type == NVME_STOR_BLOCKIF) { 1408 req = pci_nvme_get_ioreq(sc); 1409 req->nvme_sq = sq; 1410 req->sqid = idx; 1411 } 1412 1413 /* 1414 * If data starts mid-page and flows into the next page, then 1415 * increase page count 1416 */ 1417 1418 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " 1419 "(%lu-bytes)\r\n", 1420 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, 1421 cmd->opc == NVME_OPC_WRITE ? 1422 "WRITE" : "READ", 1423 lba, nblocks, bytes)); 1424 1425 cmd->prp1 &= ~(0x03UL); 1426 cmd->prp2 &= ~(0x03UL); 1427 1428 DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); 1429 1430 size = bytes; 1431 lba *= sc->nvstore.sectsz; 1432 1433 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); 1434 1435 if (cpsz > bytes) 1436 cpsz = bytes; 1437 1438 if (req != NULL) { 1439 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | 1440 cmd->cdw10; 1441 req->opc = cmd->opc; 1442 req->cid = cmd->cid; 1443 req->nsid = cmd->nsid; 1444 } 1445 1446 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, 1447 cmd->opc == NVME_OPC_WRITE, lba); 1448 lba += cpsz; 1449 size -= cpsz; 1450 1451 if (size == 0) 1452 goto iodone; 1453 1454 if (size <= PAGE_SIZE) { 1455 /* prp2 is second (and final) page in transfer */ 1456 1457 err = pci_nvme_append_iov_req(sc, req, cmd->prp2, 1458 size, 1459 cmd->opc == NVME_OPC_WRITE, 1460 lba); 1461 } else { 1462 uint64_t *prp_list; 1463 int i; 1464 1465 /* prp2 is pointer to a physical region page list */ 1466 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, 1467 cmd->prp2, PAGE_SIZE); 1468 1469 i = 0; 1470 while (size != 0) { 1471 cpsz = MIN(size, PAGE_SIZE); 1472 1473 /* 1474 * Move to linked physical region page list 1475 * in last item. 
1476 */ 1477 if (i == (NVME_PRP2_ITEMS-1) && 1478 size > PAGE_SIZE) { 1479 assert((prp_list[i] & (PAGE_SIZE-1)) == 0); 1480 prp_list = paddr_guest2host( 1481 sc->nsc_pi->pi_vmctx, 1482 prp_list[i], PAGE_SIZE); 1483 i = 0; 1484 } 1485 if (prp_list[i] == 0) { 1486 WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); 1487 err = 1; 1488 break; 1489 } 1490 1491 err = pci_nvme_append_iov_req(sc, req, 1492 prp_list[i], cpsz, 1493 cmd->opc == NVME_OPC_WRITE, lba); 1494 if (err) 1495 break; 1496 1497 lba += cpsz; 1498 size -= cpsz; 1499 i++; 1500 } 1501 } 1502 1503 iodone: 1504 if (sc->nvstore.type == NVME_STOR_RAM) { 1505 uint16_t code, status = 0; 1506 1507 code = err ? NVME_SC_LBA_OUT_OF_RANGE : 1508 NVME_SC_SUCCESS; 1509 pci_nvme_status_genc(&status, code); 1510 1511 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1512 status, 1); 1513 1514 continue; 1515 } 1516 1517 1518 if (err) 1519 goto do_error; 1520 1521 req->io_req.br_callback = pci_nvme_io_done; 1522 1523 err = 0; 1524 switch (cmd->opc) { 1525 case NVME_OPC_READ: 1526 err = blockif_read(sc->nvstore.ctx, &req->io_req); 1527 break; 1528 case NVME_OPC_WRITE: 1529 err = blockif_write(sc->nvstore.ctx, &req->io_req); 1530 break; 1531 default: 1532 WPRINTF(("%s unhandled io command 0x%x\r\n", 1533 __func__, cmd->opc)); 1534 err = 1; 1535 } 1536 1537 do_error: 1538 if (err) { 1539 uint16_t status = 0; 1540 1541 pci_nvme_status_genc(&status, 1542 NVME_SC_DATA_TRANSFER_ERROR); 1543 1544 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1545 status, 1); 1546 pci_nvme_release_ioreq(sc, req); 1547 } 1548 } 1549 1550 atomic_store_short(&sq->head, sqhead); 1551 atomic_store_int(&sq->busy, 0); 1552 } 1553 1554 static void 1555 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 1556 uint64_t idx, int is_sq, uint64_t value) 1557 { 1558 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", 1559 idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF)); 1560 1561 if (is_sq) { 1562 atomic_store_short(&sc->submit_queues[idx].tail, 1563 (uint16_t)value); 1564 1565 if (idx == 0) { 1566 pci_nvme_handle_admin_cmd(sc, value); 1567 } else { 1568 /* submission queue; handle new entries in SQ */ 1569 if (idx > sc->num_squeues) { 1570 WPRINTF(("%s SQ index %lu overflow from " 1571 "guest (max %u)\r\n", 1572 __func__, idx, sc->num_squeues)); 1573 return; 1574 } 1575 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1576 } 1577 } else { 1578 if (idx > sc->num_cqueues) { 1579 WPRINTF(("%s queue index %lu overflow from " 1580 "guest (max %u)\r\n", 1581 __func__, idx, sc->num_cqueues)); 1582 return; 1583 } 1584 1585 sc->compl_queues[idx].head = (uint16_t)value; 1586 } 1587 } 1588 1589 static void 1590 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1591 { 1592 const char *s = iswrite ? "WRITE" : "READ"; 1593 1594 switch (offset) { 1595 case NVME_CR_CAP_LOW: 1596 DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); 1597 break; 1598 case NVME_CR_CAP_HI: 1599 DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); 1600 break; 1601 case NVME_CR_VS: 1602 DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); 1603 break; 1604 case NVME_CR_INTMS: 1605 DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); 1606 break; 1607 case NVME_CR_INTMC: 1608 DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); 1609 break; 1610 case NVME_CR_CC: 1611 DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); 1612 break; 1613 case NVME_CR_CSTS: 1614 DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); 1615 break; 1616 case NVME_CR_NSSR: 1617 DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); 1618 break; 1619 case NVME_CR_AQA: 1620 DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); 1621 break; 1622 case NVME_CR_ASQ_LOW: 1623 DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); 1624 break; 1625 case NVME_CR_ASQ_HI: 1626 DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); 1627 break; 1628 case NVME_CR_ACQ_LOW: 1629 DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); 1630 break; 1631 
case NVME_CR_ACQ_HI: 1632 DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); 1633 break; 1634 default: 1635 DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); 1636 } 1637 1638 } 1639 1640 static void 1641 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 1642 uint64_t offset, int size, uint64_t value) 1643 { 1644 uint32_t ccreg; 1645 1646 if (offset >= NVME_DOORBELL_OFFSET) { 1647 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 1648 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 1649 int is_sq = (belloffset % 8) < 4; 1650 1651 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 1652 WPRINTF(("guest attempted an overflow write offset " 1653 "0x%lx, val 0x%lx in %s", 1654 offset, value, __func__)); 1655 return; 1656 } 1657 1658 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 1659 return; 1660 } 1661 1662 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", 1663 offset, size, value)); 1664 1665 if (size != 4) { 1666 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " 1667 "val 0x%lx) to bar0 in %s", 1668 size, offset, value, __func__)); 1669 /* TODO: shutdown device */ 1670 return; 1671 } 1672 1673 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 1674 1675 pthread_mutex_lock(&sc->mtx); 1676 1677 switch (offset) { 1678 case NVME_CR_CAP_LOW: 1679 case NVME_CR_CAP_HI: 1680 /* readonly */ 1681 break; 1682 case NVME_CR_VS: 1683 /* readonly */ 1684 break; 1685 case NVME_CR_INTMS: 1686 /* MSI-X, so ignore */ 1687 break; 1688 case NVME_CR_INTMC: 1689 /* MSI-X, so ignore */ 1690 break; 1691 case NVME_CR_CC: 1692 ccreg = (uint32_t)value; 1693 1694 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 1695 "iocqes %u\r\n", 1696 __func__, 1697 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 1698 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 1699 NVME_CC_GET_IOCQES(ccreg))); 1700 1701 if (NVME_CC_GET_SHN(ccreg)) { 1702 /* perform shutdown - flush out data to backend */ 1703 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 1704 
NVME_CSTS_REG_SHST_SHIFT); 1705 sc->regs.csts |= NVME_SHST_COMPLETE << 1706 NVME_CSTS_REG_SHST_SHIFT; 1707 } 1708 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 1709 if (NVME_CC_GET_EN(ccreg) == 0) 1710 /* transition 1-> causes controller reset */ 1711 pci_nvme_reset_locked(sc); 1712 else 1713 pci_nvme_init_controller(ctx, sc); 1714 } 1715 1716 /* Insert the iocqes, iosqes and en bits from the write */ 1717 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 1718 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 1719 if (NVME_CC_GET_EN(ccreg) == 0) { 1720 /* Insert the ams, mps and css bit fields */ 1721 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 1722 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 1723 sc->regs.csts &= ~NVME_CSTS_RDY; 1724 } else if (sc->pending_ios == 0) { 1725 sc->regs.csts |= NVME_CSTS_RDY; 1726 } 1727 break; 1728 case NVME_CR_CSTS: 1729 break; 1730 case NVME_CR_NSSR: 1731 /* ignore writes; don't support subsystem reset */ 1732 break; 1733 case NVME_CR_AQA: 1734 sc->regs.aqa = (uint32_t)value; 1735 break; 1736 case NVME_CR_ASQ_LOW: 1737 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 1738 (0xFFFFF000 & value); 1739 break; 1740 case NVME_CR_ASQ_HI: 1741 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 1742 (value << 32); 1743 break; 1744 case NVME_CR_ACQ_LOW: 1745 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 1746 (0xFFFFF000 & value); 1747 break; 1748 case NVME_CR_ACQ_HI: 1749 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 1750 (value << 32); 1751 break; 1752 default: 1753 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", 1754 __func__, offset, value, size)); 1755 } 1756 pthread_mutex_unlock(&sc->mtx); 1757 } 1758 1759 static void 1760 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 1761 int baridx, uint64_t offset, int size, uint64_t value) 1762 { 1763 struct pci_nvme_softc* sc = pi->pi_arg; 1764 1765 if (baridx == pci_msix_table_bar(pi) || 1766 baridx == pci_msix_pba_bar(pi)) { 1767 
DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " 1768 " value 0x%lx\r\n", baridx, offset, size, value)); 1769 1770 pci_emul_msix_twrite(pi, offset, size, value); 1771 return; 1772 } 1773 1774 switch (baridx) { 1775 case 0: 1776 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 1777 break; 1778 1779 default: 1780 DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", 1781 __func__, baridx, value)); 1782 } 1783 } 1784 1785 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 1786 uint64_t offset, int size) 1787 { 1788 uint64_t value; 1789 1790 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 1791 1792 if (offset < NVME_DOORBELL_OFFSET) { 1793 void *p = &(sc->regs); 1794 pthread_mutex_lock(&sc->mtx); 1795 memcpy(&value, (void *)((uintptr_t)p + offset), size); 1796 pthread_mutex_unlock(&sc->mtx); 1797 } else { 1798 value = 0; 1799 WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); 1800 } 1801 1802 switch (size) { 1803 case 1: 1804 value &= 0xFF; 1805 break; 1806 case 2: 1807 value &= 0xFFFF; 1808 break; 1809 case 4: 1810 value &= 0xFFFFFFFF; 1811 break; 1812 } 1813 1814 DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", 1815 offset, size, (uint32_t)value)); 1816 1817 return (value); 1818 } 1819 1820 1821 1822 static uint64_t 1823 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 1824 uint64_t offset, int size) 1825 { 1826 struct pci_nvme_softc* sc = pi->pi_arg; 1827 1828 if (baridx == pci_msix_table_bar(pi) || 1829 baridx == pci_msix_pba_bar(pi)) { 1830 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", 1831 baridx, offset, size)); 1832 1833 return pci_emul_msix_tread(pi, offset, size); 1834 } 1835 1836 switch (baridx) { 1837 case 0: 1838 return pci_nvme_read_bar_0(sc, offset, size); 1839 1840 default: 1841 DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); 1842 } 1843 1844 return (0); 1845 } 1846 1847 1848 static int 1849 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 1850 { 
1851 char bident[sizeof("XX:X:X")]; 1852 char *uopt, *xopts, *config; 1853 uint32_t sectsz; 1854 int optidx; 1855 1856 sc->max_queues = NVME_QUEUES; 1857 sc->max_qentries = NVME_MAX_QENTRIES; 1858 sc->ioslots = NVME_IOSLOTS; 1859 sc->num_squeues = sc->max_queues; 1860 sc->num_cqueues = sc->max_queues; 1861 sectsz = 0; 1862 1863 uopt = strdup(opts); 1864 optidx = 0; 1865 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 1866 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1867 for (xopts = strtok(uopt, ","); 1868 xopts != NULL; 1869 xopts = strtok(NULL, ",")) { 1870 1871 if ((config = strchr(xopts, '=')) != NULL) 1872 *config++ = '\0'; 1873 1874 if (!strcmp("maxq", xopts)) { 1875 sc->max_queues = atoi(config); 1876 } else if (!strcmp("qsz", xopts)) { 1877 sc->max_qentries = atoi(config); 1878 } else if (!strcmp("ioslots", xopts)) { 1879 sc->ioslots = atoi(config); 1880 } else if (!strcmp("sectsz", xopts)) { 1881 sectsz = atoi(config); 1882 } else if (!strcmp("ser", xopts)) { 1883 /* 1884 * This field indicates the Product Serial Number in 1885 * 7-bit ASCII, unused bytes should be space characters. 1886 * Ref: NVMe v1.3c. 
1887 */ 1888 cpywithpad((char *)sc->ctrldata.sn, 1889 sizeof(sc->ctrldata.sn), config, ' '); 1890 } else if (!strcmp("ram", xopts)) { 1891 uint64_t sz = strtoull(&xopts[4], NULL, 10); 1892 1893 sc->nvstore.type = NVME_STOR_RAM; 1894 sc->nvstore.size = sz * 1024 * 1024; 1895 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1896 sc->nvstore.sectsz = 4096; 1897 sc->nvstore.sectsz_bits = 12; 1898 if (sc->nvstore.ctx == NULL) { 1899 perror("Unable to allocate RAM"); 1900 free(uopt); 1901 return (-1); 1902 } 1903 } else if (!strcmp("eui64", xopts)) { 1904 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 1905 } else if (optidx == 0) { 1906 snprintf(bident, sizeof(bident), "%d:%d", 1907 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 1908 sc->nvstore.ctx = blockif_open(xopts, bident); 1909 if (sc->nvstore.ctx == NULL) { 1910 perror("Could not open backing file"); 1911 free(uopt); 1912 return (-1); 1913 } 1914 sc->nvstore.type = NVME_STOR_BLOCKIF; 1915 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 1916 } else { 1917 fprintf(stderr, "Invalid option %s\n", xopts); 1918 free(uopt); 1919 return (-1); 1920 } 1921 1922 optidx++; 1923 } 1924 free(uopt); 1925 1926 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 1927 fprintf(stderr, "backing store not specified\n"); 1928 return (-1); 1929 } 1930 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 1931 sc->nvstore.sectsz = sectsz; 1932 else if (sc->nvstore.type != NVME_STOR_RAM) 1933 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 1934 for (sc->nvstore.sectsz_bits = 9; 1935 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 1936 sc->nvstore.sectsz_bits++); 1937 1938 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 1939 sc->max_queues = NVME_QUEUES; 1940 1941 if (sc->max_qentries <= 0) { 1942 fprintf(stderr, "Invalid qsz option\n"); 1943 return (-1); 1944 } 1945 if (sc->ioslots <= 0) { 1946 fprintf(stderr, "Invalid ioslots option\n"); 1947 return (-1); 1948 } 1949 1950 return (0); 1951 } 1952 1953 static 
int 1954 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 1955 { 1956 struct pci_nvme_softc *sc; 1957 uint32_t pci_membar_sz; 1958 int error; 1959 1960 error = 0; 1961 1962 sc = calloc(1, sizeof(struct pci_nvme_softc)); 1963 pi->pi_arg = sc; 1964 sc->nsc_pi = pi; 1965 1966 error = pci_nvme_parse_opts(sc, opts); 1967 if (error < 0) 1968 goto done; 1969 else 1970 error = 0; 1971 1972 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 1973 for (int i = 0; i < sc->ioslots; i++) { 1974 if (i < (sc->ioslots-1)) 1975 sc->ioreqs[i].next = &sc->ioreqs[i+1]; 1976 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); 1977 pthread_cond_init(&sc->ioreqs[i].cv, NULL); 1978 } 1979 sc->ioreqs_free = sc->ioreqs; 1980 sc->intr_coales_aggr_thresh = 1; 1981 1982 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 1983 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 1984 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 1985 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 1986 pci_set_cfgdata8(pi, PCIR_PROGIF, 1987 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 1988 1989 /* 1990 * Allocate size of NVMe registers + doorbell space for all queues. 1991 * 1992 * The specification requires a minimum memory I/O window size of 16K. 1993 * The Windows driver will refuse to start a device with a smaller 1994 * window. 
1995 */ 1996 pci_membar_sz = sizeof(struct nvme_registers) + 1997 2 * sizeof(uint32_t) * (sc->max_queues + 1); 1998 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 1999 2000 DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); 2001 2002 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2003 if (error) { 2004 WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); 2005 goto done; 2006 } 2007 2008 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2009 if (error) { 2010 WPRINTF(("%s pci add msixcap failed\r\n", __func__)); 2011 goto done; 2012 } 2013 2014 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2015 if (error) { 2016 WPRINTF(("%s pci add Express capability failed\r\n", __func__)); 2017 goto done; 2018 } 2019 2020 pthread_mutex_init(&sc->mtx, NULL); 2021 sem_init(&sc->iosemlock, 0, sc->ioslots); 2022 2023 pci_nvme_reset(sc); 2024 pci_nvme_init_ctrldata(sc); 2025 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, sc->nvstore.eui64); 2026 pci_nvme_init_logpages(sc); 2027 2028 pci_lintr_request(pi); 2029 2030 done: 2031 return (error); 2032 } 2033 2034 2035 struct pci_devemu pci_de_nvme = { 2036 .pe_emu = "nvme", 2037 .pe_init = pci_nvme_init, 2038 .pe_barwrite = pci_nvme_write, 2039 .pe_barread = pci_nvme_read 2040 }; 2041 PCI_EMUL_SET(pci_de_nvme); 2042