1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * 7 * Function crc16 Copyright (c) 2017, Fedor Uporov 8 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * bhyve PCIe-NVMe device emulation. 34 * 35 * options: 36 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=# 37 * 38 * accepted devpath: 39 * /dev/blockdev 40 * /path/to/image 41 * ram=size_in_MiB 42 * 43 * maxq = max number of queues 44 * qsz = max elements in each queue 45 * ioslots = max number of concurrent io requests 46 * sectsz = sector size (defaults to blockif sector size) 47 * ser = serial number (20-chars max) 48 * eui64 = IEEE Extended Unique Identifier (8 byte value) 49 * 50 */ 51 52 /* TODO: 53 - create async event for smart and log 54 - intr coalesce 55 */ 56 57 #include <sys/cdefs.h> 58 __FBSDID("$FreeBSD$"); 59 60 #include <sys/types.h> 61 #include <net/ieee_oui.h> 62 63 #include <assert.h> 64 #include <pthread.h> 65 #include <semaphore.h> 66 #include <stdbool.h> 67 #include <stddef.h> 68 #include <stdint.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <string.h> 72 73 #include <machine/atomic.h> 74 #include <machine/vmm.h> 75 #include <vmmapi.h> 76 77 #include <dev/nvme/nvme.h> 78 79 #include "bhyverun.h" 80 #include "block_if.h" 81 #include "debug.h" 82 #include "pci_emul.h" 83 84 85 static int nvme_debug = 0; 86 #define DPRINTF(params) if (nvme_debug) PRINTLN params 87 #define WPRINTF(params) PRINTLN params 88 89 /* defaults; can be overridden */ 90 #define NVME_MSIX_BAR 4 91 92 #define NVME_IOSLOTS 8 93 94 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 95 #define NVME_MMIO_SPACE_MIN (1 << 14) 96 97 #define NVME_QUEUES 16 98 #define NVME_MAX_QENTRIES 2048 99 100 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 101 #define NVME_MAX_BLOCKIOVS 512 102 103 /* This is a synthetic status code to indicate there is no status */ 104 #define NVME_NO_STATUS 0xffff 105 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 106 107 /* helpers */ 108 109 /* Convert a zero-based value into a one-based value */ 110 #define ONE_BASED(zero) ((zero) + 1) 111 /* Convert a 
one-based value into a zero-based value */ 112 #define ZERO_BASED(one) ((one) - 1) 113 114 /* Encode number of SQ's and CQ's for Set/Get Features */ 115 #define NVME_FEATURE_NUM_QUEUES(sc) \ 116 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 117 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 118 119 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 120 121 enum nvme_controller_register_offsets { 122 NVME_CR_CAP_LOW = 0x00, 123 NVME_CR_CAP_HI = 0x04, 124 NVME_CR_VS = 0x08, 125 NVME_CR_INTMS = 0x0c, 126 NVME_CR_INTMC = 0x10, 127 NVME_CR_CC = 0x14, 128 NVME_CR_CSTS = 0x1c, 129 NVME_CR_NSSR = 0x20, 130 NVME_CR_AQA = 0x24, 131 NVME_CR_ASQ_LOW = 0x28, 132 NVME_CR_ASQ_HI = 0x2c, 133 NVME_CR_ACQ_LOW = 0x30, 134 NVME_CR_ACQ_HI = 0x34, 135 }; 136 137 enum nvme_cmd_cdw11 { 138 NVME_CMD_CDW11_PC = 0x0001, 139 NVME_CMD_CDW11_IEN = 0x0002, 140 NVME_CMD_CDW11_IV = 0xFFFF0000, 141 }; 142 143 enum nvme_copy_dir { 144 NVME_COPY_TO_PRP, 145 NVME_COPY_FROM_PRP, 146 }; 147 148 #define NVME_CQ_INTEN 0x01 149 #define NVME_CQ_INTCOAL 0x02 150 151 struct nvme_completion_queue { 152 struct nvme_completion *qbase; 153 uint32_t size; 154 uint16_t tail; /* nvme progress */ 155 uint16_t head; /* guest progress */ 156 uint16_t intr_vec; 157 uint32_t intr_en; 158 pthread_mutex_t mtx; 159 }; 160 161 struct nvme_submission_queue { 162 struct nvme_command *qbase; 163 uint32_t size; 164 uint16_t head; /* nvme progress */ 165 uint16_t tail; /* guest progress */ 166 uint16_t cqid; /* completion queue id */ 167 int busy; /* queue is being processed */ 168 int qpriority; 169 }; 170 171 enum nvme_storage_type { 172 NVME_STOR_BLOCKIF = 0, 173 NVME_STOR_RAM = 1, 174 }; 175 176 struct pci_nvme_blockstore { 177 enum nvme_storage_type type; 178 void *ctx; 179 uint64_t size; 180 uint32_t sectsz; 181 uint32_t sectsz_bits; 182 uint64_t eui64; 183 uint32_t deallocate:1; 184 }; 185 186 struct pci_nvme_ioreq { 187 struct pci_nvme_softc *sc; 188 STAILQ_ENTRY(pci_nvme_ioreq) link; 189 struct nvme_submission_queue *nvme_sq; 190 uint16_t sqid; 191 192 /* command information */ 193 uint16_t opc; 194 uint16_t cid; 195 uint32_t nsid; 196 197 uint64_t prev_gpaddr; 198 size_t prev_size; 199 200 /* 201 * lock if all iovs consumed (big IO); 202 * complete transaction before continuing 203 */ 204 pthread_mutex_t mtx; 205 pthread_cond_t cv; 206 207 struct blockif_req io_req; 208 209 /* pad to fit up to 512 page descriptors from guest IO request */ 210 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; 211 }; 212 213 enum nvme_dsm_type { 214 /* Dataset Management bit in ONCS reflects backing storage capability */ 215 NVME_DATASET_MANAGEMENT_AUTO, 216 /* Unconditionally set Dataset Management bit in ONCS */ 217 NVME_DATASET_MANAGEMENT_ENABLE, 218 /* Unconditionally clear Dataset Management bit in ONCS */ 219 NVME_DATASET_MANAGEMENT_DISABLE, 220 }; 221 222 struct pci_nvme_softc { 223 struct pci_devinst *nsc_pi; 224 225 pthread_mutex_t mtx; 226 227 struct nvme_registers regs; 228 229 struct nvme_namespace_data nsdata; 230 struct nvme_controller_data ctrldata; 231 struct nvme_error_information_entry err_log; 232 struct nvme_health_information_page health_log; 233 struct nvme_firmware_page fw_log; 234 235 struct pci_nvme_blockstore nvstore; 236 237 uint16_t max_qentries; /* max entries per queue */ 238 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 239 uint32_t num_cqueues; 240 uint32_t num_squeues; 241 242 struct pci_nvme_ioreq *ioreqs; 243 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 244 
uint32_t pending_ios; 245 uint32_t ioslots; 246 sem_t iosemlock; 247 248 /* 249 * Memory mapped Submission and Completion queues 250 * Each array includes both Admin and IO queues 251 */ 252 struct nvme_completion_queue *compl_queues; 253 struct nvme_submission_queue *submit_queues; 254 255 /* controller features */ 256 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ 257 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ 258 uint32_t async_ev_config; /* 0x0B: async event config */ 259 260 enum nvme_dsm_type dataset_management; 261 }; 262 263 264 static void pci_nvme_io_partial(struct blockif_req *br, int err); 265 266 /* Controller Configuration utils */ 267 #define NVME_CC_GET_EN(cc) \ 268 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 269 #define NVME_CC_GET_CSS(cc) \ 270 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 271 #define NVME_CC_GET_SHN(cc) \ 272 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 273 #define NVME_CC_GET_IOSQES(cc) \ 274 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 275 #define NVME_CC_GET_IOCQES(cc) \ 276 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 277 278 #define NVME_CC_WRITE_MASK \ 279 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 280 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 281 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 282 283 #define NVME_CC_NEN_WRITE_MASK \ 284 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 285 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 286 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 287 288 /* Controller Status utils */ 289 #define NVME_CSTS_GET_RDY(sts) \ 290 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 291 292 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 293 294 /* Completion Queue status word utils */ 295 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 296 #define NVME_STATUS_MASK \ 297 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 298 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 299 300 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 301 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 302 303 static __inline void 304 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 305 { 306 size_t len; 307 308 len = strnlen(src, dst_size); 309 memset(dst, pad, dst_size); 310 memcpy(dst, src, len); 311 } 312 313 static __inline void 314 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 315 { 316 317 *status &= ~NVME_STATUS_MASK; 318 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 319 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 320 } 321 322 static __inline void 323 pci_nvme_status_genc(uint16_t *status, uint16_t code) 324 { 325 326 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 327 } 328 329 static __inline void 330 pci_nvme_toggle_phase(uint16_t *status, int prev) 331 { 332 333 if (prev) 334 *status &= ~NVME_STATUS_P; 335 else 336 *status |= NVME_STATUS_P; 337 } 338 339 static void 340 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 341 { 342 struct nvme_controller_data *cd = &sc->ctrldata; 343 344 cd->vid = 0xFB5D; 345 cd->ssvid = 0x0000; 346 347 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 348 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 349 350 /* Num of submission commands that we can handle at a time (2^rab) */ 351 cd->rab = 4; 352 353 /* FreeBSD OUI */ 354 cd->ieee[0] = 0x58; 355 cd->ieee[1] = 0x9c; 356 cd->ieee[2] = 0xfc; 357 358 cd->mic = 0; 359 360 cd->mdts = 9; /* max data transfer size 
(2^mdts * CAP.MPSMIN) */ 361 362 cd->ver = 0x00010300; 363 364 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 365 cd->acl = 2; 366 cd->aerl = 4; 367 368 cd->lpa = 0; /* TODO: support some simple things like SMART */ 369 cd->elpe = 0; /* max error log page entries */ 370 cd->npss = 1; /* number of power states support */ 371 372 /* Warning Composite Temperature Threshold */ 373 cd->wctemp = 0x0157; 374 375 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 376 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 377 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 378 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 379 cd->nn = 1; /* number of namespaces */ 380 381 cd->oncs = 0; 382 switch (sc->dataset_management) { 383 case NVME_DATASET_MANAGEMENT_AUTO: 384 if (sc->nvstore.deallocate) 385 cd->oncs |= NVME_ONCS_DSM; 386 break; 387 case NVME_DATASET_MANAGEMENT_ENABLE: 388 cd->oncs |= NVME_ONCS_DSM; 389 break; 390 default: 391 break; 392 } 393 394 cd->fna = 0x03; 395 396 cd->power_state[0].mp = 10; 397 } 398 399 /* 400 * Calculate the CRC-16 of the given buffer 401 * See copyright attribution at top of file 402 */ 403 static uint16_t 404 crc16(uint16_t crc, const void *buffer, unsigned int len) 405 { 406 const unsigned char *cp = buffer; 407 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 408 static uint16_t const crc16_table[256] = { 409 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 410 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 411 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 412 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 413 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 414 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 415 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 416 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 417 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 418 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 419 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 420 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 421 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 422 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 423 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 424 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 425 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 426 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 427 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 428 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 429 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 430 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 431 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 432 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 433 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 434 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 435 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 436 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 437 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 438 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 439 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 440 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 441 }; 442 443 
while (len--) 444 crc = (((crc >> 8) & 0xffU) ^ 445 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 446 return crc; 447 } 448 449 static void 450 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 451 struct nvme_namespace_data *nd, uint32_t nsid, 452 struct pci_nvme_blockstore *nvstore) 453 { 454 455 /* Get capacity and block size information from backing store */ 456 nd->nsze = nvstore->size / nvstore->sectsz; 457 nd->ncap = nd->nsze; 458 nd->nuse = nd->nsze; 459 460 if (nvstore->type == NVME_STOR_BLOCKIF) 461 nvstore->deallocate = blockif_candelete(nvstore->ctx); 462 463 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 464 nd->flbas = 0; 465 466 /* Create an EUI-64 if user did not provide one */ 467 if (nvstore->eui64 == 0) { 468 char *data = NULL; 469 uint64_t eui64 = nvstore->eui64; 470 471 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, 472 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 473 474 if (data != NULL) { 475 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 476 free(data); 477 } 478 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 479 } 480 be64enc(nd->eui64, nvstore->eui64); 481 482 /* LBA data-sz = 2^lbads */ 483 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 484 } 485 486 static void 487 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 488 { 489 490 memset(&sc->err_log, 0, sizeof(sc->err_log)); 491 memset(&sc->health_log, 0, sizeof(sc->health_log)); 492 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 493 } 494 495 static void 496 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 497 { 498 DPRINTF(("%s", __func__)); 499 500 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 501 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 502 (60 << NVME_CAP_LO_REG_TO_SHIFT); 503 504 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 505 506 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 507 508 sc->regs.cc = 0; 509 sc->regs.csts = 0; 510 511 sc->num_cqueues = sc->num_squeues = sc->max_queues; 512 if (sc->submit_queues != NULL) { 513 for (int i = 0; i < sc->num_squeues + 1; i++) { 514 /* 515 * The Admin Submission Queue is at index 0. 516 * It must not be changed at reset otherwise the 517 * emulation will be out of sync with the guest. 
518 */ 519 if (i != 0) { 520 sc->submit_queues[i].qbase = NULL; 521 sc->submit_queues[i].size = 0; 522 sc->submit_queues[i].cqid = 0; 523 } 524 sc->submit_queues[i].tail = 0; 525 sc->submit_queues[i].head = 0; 526 sc->submit_queues[i].busy = 0; 527 } 528 } else 529 sc->submit_queues = calloc(sc->num_squeues + 1, 530 sizeof(struct nvme_submission_queue)); 531 532 if (sc->compl_queues != NULL) { 533 for (int i = 0; i < sc->num_cqueues + 1; i++) { 534 /* See Admin Submission Queue note above */ 535 if (i != 0) { 536 sc->compl_queues[i].qbase = NULL; 537 sc->compl_queues[i].size = 0; 538 } 539 540 sc->compl_queues[i].tail = 0; 541 sc->compl_queues[i].head = 0; 542 } 543 } else { 544 sc->compl_queues = calloc(sc->num_cqueues + 1, 545 sizeof(struct nvme_completion_queue)); 546 547 for (int i = 0; i < sc->num_cqueues + 1; i++) 548 pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); 549 } 550 } 551 552 static void 553 pci_nvme_reset(struct pci_nvme_softc *sc) 554 { 555 pthread_mutex_lock(&sc->mtx); 556 pci_nvme_reset_locked(sc); 557 pthread_mutex_unlock(&sc->mtx); 558 } 559 560 static void 561 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 562 { 563 uint16_t acqs, asqs; 564 565 DPRINTF(("%s", __func__)); 566 567 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 568 sc->submit_queues[0].size = asqs; 569 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 570 sizeof(struct nvme_command) * asqs); 571 572 DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p", 573 __func__, sc->regs.asq, sc->submit_queues[0].qbase)); 574 575 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 576 NVME_AQA_REG_ACQS_MASK) + 1; 577 sc->compl_queues[0].size = acqs; 578 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 579 sizeof(struct nvme_completion) * acqs); 580 DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p", 581 __func__, sc->regs.acq, sc->compl_queues[0].qbase)); 582 } 583 584 static int 585 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 586 size_t len, enum nvme_copy_dir dir) 587 { 588 uint8_t *p; 589 size_t bytes; 590 591 if (len > (8 * 1024)) { 592 return (-1); 593 } 594 595 /* Copy from the start of prp1 to the end of the physical page */ 596 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 597 bytes = MIN(bytes, len); 598 599 p = vm_map_gpa(ctx, prp1, bytes); 600 if (p == NULL) { 601 return (-1); 602 } 603 604 if (dir == NVME_COPY_TO_PRP) 605 memcpy(p, b, bytes); 606 else 607 memcpy(b, p, bytes); 608 609 b += bytes; 610 611 len -= bytes; 612 if (len == 0) { 613 return (0); 614 } 615 616 len = MIN(len, PAGE_SIZE); 617 618 p = vm_map_gpa(ctx, prp2, len); 619 if (p == NULL) { 620 return (-1); 621 } 622 623 if (dir == NVME_COPY_TO_PRP) 624 memcpy(p, b, len); 625 else 626 memcpy(b, p, len); 627 628 return (0); 629 } 630 631 static int 632 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 633 struct nvme_completion* compl) 634 { 635 uint16_t qid = command->cdw10 & 0xffff; 636 637 DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid)); 638 if (qid == 0 || qid > sc->num_squeues) { 639 WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u", 640 __func__, qid, sc->num_squeues)); 641 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 642 NVME_SC_INVALID_QUEUE_IDENTIFIER); 643 return (1); 644 } 645 646 sc->submit_queues[qid].qbase = NULL; 647 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 648 return (1); 649 } 650 651 static int 652 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 653 
struct nvme_completion* compl) 654 { 655 if (command->cdw11 & NVME_CMD_CDW11_PC) { 656 uint16_t qid = command->cdw10 & 0xffff; 657 struct nvme_submission_queue *nsq; 658 659 if ((qid == 0) || (qid > sc->num_squeues)) { 660 WPRINTF(("%s queue index %u > num_squeues %u", 661 __func__, qid, sc->num_squeues)); 662 pci_nvme_status_tc(&compl->status, 663 NVME_SCT_COMMAND_SPECIFIC, 664 NVME_SC_INVALID_QUEUE_IDENTIFIER); 665 return (1); 666 } 667 668 nsq = &sc->submit_queues[qid]; 669 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 670 671 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 672 sizeof(struct nvme_command) * (size_t)nsq->size); 673 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 674 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 675 676 DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__, 677 qid, nsq->size, nsq->qbase, nsq->cqid)); 678 679 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 680 681 DPRINTF(("%s completed creating IOSQ qid %u", 682 __func__, qid)); 683 } else { 684 /* 685 * Guest sent non-cont submission queue request. 686 * This setting is unsupported by this emulation. 687 */ 688 WPRINTF(("%s unsupported non-contig (list-based) " 689 "create i/o submission queue", __func__)); 690 691 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 692 } 693 return (1); 694 } 695 696 static int 697 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 698 struct nvme_completion* compl) 699 { 700 uint16_t qid = command->cdw10 & 0xffff; 701 702 DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid)); 703 if (qid == 0 || qid > sc->num_cqueues) { 704 WPRINTF(("%s queue index %u / num_cqueues %u", 705 __func__, qid, sc->num_cqueues)); 706 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 707 NVME_SC_INVALID_QUEUE_IDENTIFIER); 708 return (1); 709 } 710 711 sc->compl_queues[qid].qbase = NULL; 712 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 713 return (1); 714 } 715 716 static int 717 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 718 struct nvme_completion* compl) 719 { 720 if (command->cdw11 & NVME_CMD_CDW11_PC) { 721 uint16_t qid = command->cdw10 & 0xffff; 722 struct nvme_completion_queue *ncq; 723 724 if ((qid == 0) || (qid > sc->num_cqueues)) { 725 WPRINTF(("%s queue index %u > num_cqueues %u", 726 __func__, qid, sc->num_cqueues)); 727 pci_nvme_status_tc(&compl->status, 728 NVME_SCT_COMMAND_SPECIFIC, 729 NVME_SC_INVALID_QUEUE_IDENTIFIER); 730 return (1); 731 } 732 733 ncq = &sc->compl_queues[qid]; 734 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 735 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 736 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 737 738 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 739 command->prp1, 740 sizeof(struct nvme_command) * (size_t)ncq->size); 741 742 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 743 } else { 744 /* 745 * Non-contig completion queue unsupported. 
746 */ 747 WPRINTF(("%s unsupported non-contig (list-based) " 748 "create i/o completion queue", 749 __func__)); 750 751 /* 0x12 = Invalid Use of Controller Memory Buffer */ 752 pci_nvme_status_genc(&compl->status, 0x12); 753 } 754 755 return (1); 756 } 757 758 static int 759 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 760 struct nvme_completion* compl) 761 { 762 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 763 uint8_t logpage = command->cdw10 & 0xFF; 764 765 DPRINTF(("%s log page %u len %u", __func__, logpage, logsize)); 766 767 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 768 769 switch (logpage) { 770 case NVME_LOG_ERROR: 771 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 772 command->prp2, (uint8_t *)&sc->err_log, logsize, 773 NVME_COPY_TO_PRP); 774 break; 775 case NVME_LOG_HEALTH_INFORMATION: 776 /* TODO: present some smart info */ 777 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 778 command->prp2, (uint8_t *)&sc->health_log, logsize, 779 NVME_COPY_TO_PRP); 780 break; 781 case NVME_LOG_FIRMWARE_SLOT: 782 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 783 command->prp2, (uint8_t *)&sc->fw_log, logsize, 784 NVME_COPY_TO_PRP); 785 break; 786 default: 787 WPRINTF(("%s get log page %x command not supported", 788 __func__, logpage)); 789 790 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 791 NVME_SC_INVALID_LOG_PAGE); 792 } 793 794 return (1); 795 } 796 797 static int 798 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 799 struct nvme_completion* compl) 800 { 801 void *dest; 802 803 DPRINTF(("%s identify 0x%x nsid 0x%x", __func__, 804 command->cdw10 & 0xFF, command->nsid)); 805 806 switch (command->cdw10 & 0xFF) { 807 case 0x00: /* return Identify Namespace data structure */ 808 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 809 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 810 NVME_COPY_TO_PRP); 811 break; 812 case 0x01: /* return Identify Controller data structure */ 813 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 814 command->prp2, (uint8_t *)&sc->ctrldata, 815 sizeof(sc->ctrldata), 816 NVME_COPY_TO_PRP); 817 break; 818 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 819 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 820 sizeof(uint32_t) * 1024); 821 ((uint32_t *)dest)[0] = 1; 822 ((uint32_t *)dest)[1] = 0; 823 break; 824 case 0x11: 825 pci_nvme_status_genc(&compl->status, 826 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 827 return (1); 828 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 829 case 0x10: 830 case 0x12: 831 case 0x13: 832 case 0x14: 833 case 0x15: 834 default: 835 DPRINTF(("%s unsupported identify command requested 0x%x", 836 __func__, command->cdw10 & 0xFF)); 837 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 838 return (1); 839 } 840 841 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 842 return (1); 843 } 844 845 static int 846 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 847 struct nvme_completion* compl) 848 { 849 uint16_t nqr; /* Number of Queues Requested */ 850 851 nqr = command->cdw11 & 0xFFFF; 852 if (nqr == 0xffff) { 853 WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr)); 854 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 855 return (-1); 856 } 857 858 sc->num_squeues = ONE_BASED(nqr); 859 if (sc->num_squeues > sc->max_queues) { 860 DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues, 861 sc->max_queues)); 
862 sc->num_squeues = sc->max_queues; 863 } 864 865 nqr = (command->cdw11 >> 16) & 0xFFFF; 866 if (nqr == 0xffff) { 867 WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr)); 868 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 869 return (-1); 870 } 871 872 sc->num_cqueues = ONE_BASED(nqr); 873 if (sc->num_cqueues > sc->max_queues) { 874 DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues, 875 sc->max_queues)); 876 sc->num_cqueues = sc->max_queues; 877 } 878 879 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 880 881 return (0); 882 } 883 884 static int 885 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 886 struct nvme_completion* compl) 887 { 888 int feature = command->cdw10 & 0xFF; 889 uint32_t iv; 890 891 DPRINTF(("%s feature 0x%x", __func__, feature)); 892 compl->cdw0 = 0; 893 894 switch (feature) { 895 case NVME_FEAT_ARBITRATION: 896 DPRINTF((" arbitration 0x%x", command->cdw11)); 897 break; 898 case NVME_FEAT_POWER_MANAGEMENT: 899 DPRINTF((" power management 0x%x", command->cdw11)); 900 break; 901 case NVME_FEAT_LBA_RANGE_TYPE: 902 DPRINTF((" lba range 0x%x", command->cdw11)); 903 break; 904 case NVME_FEAT_TEMPERATURE_THRESHOLD: 905 DPRINTF((" temperature threshold 0x%x", command->cdw11)); 906 break; 907 case NVME_FEAT_ERROR_RECOVERY: 908 DPRINTF((" error recovery 0x%x", command->cdw11)); 909 break; 910 case NVME_FEAT_VOLATILE_WRITE_CACHE: 911 DPRINTF((" volatile write cache 0x%x", command->cdw11)); 912 break; 913 case NVME_FEAT_NUMBER_OF_QUEUES: 914 nvme_set_feature_queues(sc, command, compl); 915 break; 916 case NVME_FEAT_INTERRUPT_COALESCING: 917 DPRINTF((" interrupt coalescing 0x%x", command->cdw11)); 918 919 /* in uS */ 920 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 921 922 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 923 break; 924 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 925 iv = command->cdw11 & 0xFFFF; 926 927 DPRINTF((" interrupt vector configuration 0x%x", 928 command->cdw11)); 929 930 for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { 931 if (sc->compl_queues[i].intr_vec == iv) { 932 if (command->cdw11 & (1 << 16)) 933 sc->compl_queues[i].intr_en |= 934 NVME_CQ_INTCOAL; 935 else 936 sc->compl_queues[i].intr_en &= 937 ~NVME_CQ_INTCOAL; 938 } 939 } 940 break; 941 case NVME_FEAT_WRITE_ATOMICITY: 942 DPRINTF((" write atomicity 0x%x", command->cdw11)); 943 break; 944 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 945 DPRINTF((" async event configuration 0x%x", 946 command->cdw11)); 947 sc->async_ev_config = command->cdw11; 948 break; 949 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 950 DPRINTF((" software progress marker 0x%x", 951 command->cdw11)); 952 break; 953 case 0x0C: 954 DPRINTF((" autonomous power state transition 0x%x", 955 command->cdw11)); 956 break; 957 default: 958 WPRINTF(("%s invalid feature", __func__)); 959 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 960 return (1); 961 } 962 963 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 964 return (1); 965 } 966 967 static int 968 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 969 struct nvme_completion* compl) 970 { 971 int feature = command->cdw10 & 0xFF; 972 973 DPRINTF(("%s feature 0x%x", __func__, feature)); 974 975 compl->cdw0 = 0; 976 977 switch (feature) { 978 case NVME_FEAT_ARBITRATION: 979 DPRINTF((" arbitration")); 980 break; 981 case NVME_FEAT_POWER_MANAGEMENT: 982 DPRINTF((" power management")); 983 break; 984 case NVME_FEAT_LBA_RANGE_TYPE: 985 DPRINTF((" lba range")); 986 
break; 987 case NVME_FEAT_TEMPERATURE_THRESHOLD: 988 DPRINTF((" temperature threshold")); 989 switch ((command->cdw11 >> 20) & 0x3) { 990 case 0: 991 /* Over temp threshold */ 992 compl->cdw0 = 0xFFFF; 993 break; 994 case 1: 995 /* Under temp threshold */ 996 compl->cdw0 = 0; 997 break; 998 default: 999 WPRINTF((" invalid threshold type select")); 1000 pci_nvme_status_genc(&compl->status, 1001 NVME_SC_INVALID_FIELD); 1002 return (1); 1003 } 1004 break; 1005 case NVME_FEAT_ERROR_RECOVERY: 1006 DPRINTF((" error recovery")); 1007 break; 1008 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1009 DPRINTF((" volatile write cache")); 1010 break; 1011 case NVME_FEAT_NUMBER_OF_QUEUES: 1012 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1013 1014 DPRINTF((" number of queues (submit %u, completion %u)", 1015 compl->cdw0 & 0xFFFF, 1016 (compl->cdw0 >> 16) & 0xFFFF)); 1017 1018 break; 1019 case NVME_FEAT_INTERRUPT_COALESCING: 1020 DPRINTF((" interrupt coalescing")); 1021 break; 1022 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1023 DPRINTF((" interrupt vector configuration")); 1024 break; 1025 case NVME_FEAT_WRITE_ATOMICITY: 1026 DPRINTF((" write atomicity")); 1027 break; 1028 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1029 DPRINTF((" async event configuration")); 1030 sc->async_ev_config = command->cdw11; 1031 break; 1032 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1033 DPRINTF((" software progress marker")); 1034 break; 1035 case 0x0C: 1036 DPRINTF((" autonomous power state transition")); 1037 break; 1038 default: 1039 WPRINTF(("%s invalid feature 0x%x", __func__, feature)); 1040 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1041 return (1); 1042 } 1043 1044 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1045 return (1); 1046 } 1047 1048 static int 1049 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1050 struct nvme_completion* compl) 1051 { 1052 DPRINTF(("%s submission queue %u, command ID 0x%x", __func__, 1053 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); 1054 1055 /* TODO: search for the command ID and abort it */ 1056 1057 compl->cdw0 = 1; 1058 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1059 return (1); 1060 } 1061 1062 static int 1063 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1064 struct nvme_command* command, struct nvme_completion* compl) 1065 { 1066 DPRINTF(("%s async event request 0x%x", __func__, command->cdw11)); 1067 1068 /* 1069 * TODO: raise events when they happen based on the Set Features cmd. 1070 * These events happen async, so only set completion successful if 1071 * there is an event reflective of the request to get event. 
1072 */ 1073 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1074 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1075 return (0); 1076 } 1077 1078 static void 1079 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1080 { 1081 struct nvme_completion compl; 1082 struct nvme_command *cmd; 1083 struct nvme_submission_queue *sq; 1084 struct nvme_completion_queue *cq; 1085 uint16_t sqhead; 1086 1087 DPRINTF(("%s index %u", __func__, (uint32_t)value)); 1088 1089 sq = &sc->submit_queues[0]; 1090 cq = &sc->compl_queues[0]; 1091 1092 sqhead = atomic_load_acq_short(&sq->head); 1093 1094 if (atomic_testandset_int(&sq->busy, 1)) { 1095 DPRINTF(("%s SQ busy, head %u, tail %u", 1096 __func__, sqhead, sq->tail)); 1097 return; 1098 } 1099 1100 DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail)); 1101 1102 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1103 cmd = &(sq->qbase)[sqhead]; 1104 compl.cdw0 = 0; 1105 compl.status = 0; 1106 1107 switch (cmd->opc) { 1108 case NVME_OPC_DELETE_IO_SQ: 1109 DPRINTF(("%s command DELETE_IO_SQ", __func__)); 1110 nvme_opc_delete_io_sq(sc, cmd, &compl); 1111 break; 1112 case NVME_OPC_CREATE_IO_SQ: 1113 DPRINTF(("%s command CREATE_IO_SQ", __func__)); 1114 nvme_opc_create_io_sq(sc, cmd, &compl); 1115 break; 1116 case NVME_OPC_DELETE_IO_CQ: 1117 DPRINTF(("%s command DELETE_IO_CQ", __func__)); 1118 nvme_opc_delete_io_cq(sc, cmd, &compl); 1119 break; 1120 case NVME_OPC_CREATE_IO_CQ: 1121 DPRINTF(("%s command CREATE_IO_CQ", __func__)); 1122 nvme_opc_create_io_cq(sc, cmd, &compl); 1123 break; 1124 case NVME_OPC_GET_LOG_PAGE: 1125 DPRINTF(("%s command GET_LOG_PAGE", __func__)); 1126 nvme_opc_get_log_page(sc, cmd, &compl); 1127 break; 1128 case NVME_OPC_IDENTIFY: 1129 DPRINTF(("%s command IDENTIFY", __func__)); 1130 nvme_opc_identify(sc, cmd, &compl); 1131 break; 1132 case NVME_OPC_ABORT: 1133 DPRINTF(("%s command ABORT", __func__)); 1134 nvme_opc_abort(sc, cmd, &compl); 1135 break; 1136 case NVME_OPC_SET_FEATURES: 1137 DPRINTF(("%s command SET_FEATURES", __func__)); 1138 nvme_opc_set_features(sc, cmd, &compl); 1139 break; 1140 case NVME_OPC_GET_FEATURES: 1141 DPRINTF(("%s command GET_FEATURES", __func__)); 1142 nvme_opc_get_features(sc, cmd, &compl); 1143 break; 1144 case NVME_OPC_ASYNC_EVENT_REQUEST: 1145 DPRINTF(("%s command ASYNC_EVENT_REQ", __func__)); 1146 /* XXX dont care, unhandled for now 1147 nvme_opc_async_event_req(sc, cmd, &compl); 1148 */ 1149 compl.status = NVME_NO_STATUS; 1150 break; 1151 default: 1152 WPRINTF(("0x%x command is not implemented", 1153 cmd->opc)); 1154 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1155 } 1156 sqhead = (sqhead + 1) % sq->size; 1157 1158 if (NVME_COMPLETION_VALID(compl)) { 1159 struct nvme_completion *cp; 1160 int phase; 1161 1162 cp = &(cq->qbase)[cq->tail]; 1163 cp->cdw0 = compl.cdw0; 1164 cp->sqid = 0; 1165 cp->sqhd = sqhead; 1166 cp->cid = cmd->cid; 1167 1168 phase = NVME_STATUS_GET_P(cp->status); 1169 cp->status = compl.status; 1170 pci_nvme_toggle_phase(&cp->status, phase); 1171 1172 cq->tail = (cq->tail + 1) % cq->size; 1173 } 1174 } 1175 1176 DPRINTF(("setting sqhead %u", sqhead)); 1177 atomic_store_short(&sq->head, sqhead); 1178 atomic_store_int(&sq->busy, 0); 1179 1180 if (cq->head != cq->tail) 1181 pci_generate_msix(sc->nsc_pi, 0); 1182 1183 } 1184 1185 static int 1186 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 1187 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1188 { 1189 int iovidx; 1190 1191 if (req != NULL) { 1192 /* 
concatenate contig block-iovs to minimize number of iovs */ 1193 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1194 iovidx = req->io_req.br_iovcnt - 1; 1195 1196 req->io_req.br_iov[iovidx].iov_base = 1197 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1198 req->prev_gpaddr, size); 1199 1200 req->prev_size += size; 1201 req->io_req.br_resid += size; 1202 1203 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1204 } else { 1205 pthread_mutex_lock(&req->mtx); 1206 1207 iovidx = req->io_req.br_iovcnt; 1208 if (iovidx == NVME_MAX_BLOCKIOVS) { 1209 int err = 0; 1210 1211 DPRINTF(("large I/O, doing partial req")); 1212 1213 iovidx = 0; 1214 req->io_req.br_iovcnt = 0; 1215 1216 req->io_req.br_callback = pci_nvme_io_partial; 1217 1218 if (!do_write) 1219 err = blockif_read(sc->nvstore.ctx, 1220 &req->io_req); 1221 else 1222 err = blockif_write(sc->nvstore.ctx, 1223 &req->io_req); 1224 1225 /* wait until req completes before cont */ 1226 if (err == 0) 1227 pthread_cond_wait(&req->cv, &req->mtx); 1228 } 1229 if (iovidx == 0) { 1230 req->io_req.br_offset = lba; 1231 req->io_req.br_resid = 0; 1232 req->io_req.br_param = req; 1233 } 1234 1235 req->io_req.br_iov[iovidx].iov_base = 1236 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1237 gpaddr, size); 1238 1239 req->io_req.br_iov[iovidx].iov_len = size; 1240 1241 req->prev_gpaddr = gpaddr; 1242 req->prev_size = size; 1243 req->io_req.br_resid += size; 1244 1245 req->io_req.br_iovcnt++; 1246 1247 pthread_mutex_unlock(&req->mtx); 1248 } 1249 } else { 1250 /* RAM buffer: read/write directly */ 1251 void *p = sc->nvstore.ctx; 1252 void *gptr; 1253 1254 if ((lba + size) > sc->nvstore.size) { 1255 WPRINTF(("%s write would overflow RAM", __func__)); 1256 return (-1); 1257 } 1258 1259 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1260 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1261 if (do_write) 1262 memcpy(p, gptr, size); 1263 else 1264 memcpy(gptr, p, size); 1265 } 1266 return (0); 1267 } 1268 1269 static void 1270 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1271 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1272 uint32_t cdw0, uint16_t status, int ignore_busy) 1273 { 1274 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1275 struct nvme_completion *compl; 1276 int phase; 1277 1278 DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 1279 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1280 NVME_STATUS_GET_SC(status))); 1281 1282 pthread_mutex_lock(&cq->mtx); 1283 1284 assert(cq->qbase != NULL); 1285 1286 compl = &cq->qbase[cq->tail]; 1287 1288 compl->cdw0 = cdw0; 1289 compl->sqid = sqid; 1290 compl->sqhd = atomic_load_acq_short(&sq->head); 1291 compl->cid = cid; 1292 1293 // toggle phase 1294 phase = NVME_STATUS_GET_P(compl->status); 1295 compl->status = status; 1296 pci_nvme_toggle_phase(&compl->status, phase); 1297 1298 cq->tail = (cq->tail + 1) % cq->size; 1299 1300 pthread_mutex_unlock(&cq->mtx); 1301 1302 if (cq->head != cq->tail) { 1303 if (cq->intr_en & NVME_CQ_INTEN) { 1304 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1305 } else { 1306 DPRINTF(("%s: CQ%u interrupt disabled\n", 1307 __func__, sq->cqid)); 1308 } 1309 } 1310 } 1311 1312 static void 1313 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1314 { 1315 req->sc = NULL; 1316 req->nvme_sq = NULL; 1317 req->sqid = 0; 1318 1319 pthread_mutex_lock(&sc->mtx); 1320 1321 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 1322 sc->pending_ios--; 1323 1324 /* when no more IO pending, can set to ready if 
device reset/enabled */ 1325 if (sc->pending_ios == 0 && 1326 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1327 sc->regs.csts |= NVME_CSTS_RDY; 1328 1329 pthread_mutex_unlock(&sc->mtx); 1330 1331 sem_post(&sc->iosemlock); 1332 } 1333 1334 static struct pci_nvme_ioreq * 1335 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1336 { 1337 struct pci_nvme_ioreq *req = NULL;; 1338 1339 sem_wait(&sc->iosemlock); 1340 pthread_mutex_lock(&sc->mtx); 1341 1342 req = STAILQ_FIRST(&sc->ioreqs_free); 1343 assert(req != NULL); 1344 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 1345 1346 req->sc = sc; 1347 1348 sc->pending_ios++; 1349 1350 pthread_mutex_unlock(&sc->mtx); 1351 1352 req->io_req.br_iovcnt = 0; 1353 req->io_req.br_offset = 0; 1354 req->io_req.br_resid = 0; 1355 req->io_req.br_param = req; 1356 req->prev_gpaddr = 0; 1357 req->prev_size = 0; 1358 1359 return req; 1360 } 1361 1362 static void 1363 pci_nvme_io_done(struct blockif_req *br, int err) 1364 { 1365 struct pci_nvme_ioreq *req = br->br_param; 1366 struct nvme_submission_queue *sq = req->nvme_sq; 1367 uint16_t code, status; 1368 1369 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1370 1371 /* TODO return correct error */ 1372 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1373 pci_nvme_status_genc(&status, code); 1374 1375 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); 1376 pci_nvme_release_ioreq(req->sc, req); 1377 } 1378 1379 static void 1380 pci_nvme_io_partial(struct blockif_req *br, int err) 1381 { 1382 struct pci_nvme_ioreq *req = br->br_param; 1383 1384 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1385 1386 pthread_cond_signal(&req->cv); 1387 } 1388 1389 static void 1390 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 1391 { 1392 struct pci_nvme_ioreq *req = br->br_param; 1393 struct pci_nvme_softc *sc = req->sc; 1394 bool done = true; 1395 uint16_t status; 1396 1397 if (err) { 1398 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 1399 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 1400 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1401 } else { 1402 struct iovec *iov = req->io_req.br_iov; 1403 1404 req->prev_gpaddr++; 1405 iov += req->prev_gpaddr; 1406 1407 /* The iov_* values already include the sector size */ 1408 req->io_req.br_offset = (off_t)iov->iov_base; 1409 req->io_req.br_resid = iov->iov_len; 1410 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 1411 pci_nvme_status_genc(&status, 1412 NVME_SC_INTERNAL_DEVICE_ERROR); 1413 } else 1414 done = false; 1415 } 1416 1417 if (done) { 1418 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 1419 req->cid, 0, status, 0); 1420 pci_nvme_release_ioreq(sc, req); 1421 } 1422 } 1423 1424 static int 1425 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 1426 struct nvme_command *cmd, 1427 struct pci_nvme_blockstore *nvstore, 1428 struct pci_nvme_ioreq *req, 1429 uint16_t *status) 1430 { 1431 int err = -1; 1432 1433 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 1434 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 1435 goto out; 1436 } 1437 1438 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 1439 struct nvme_dsm_range *range; 1440 uint32_t nr, r; 1441 int sectsz = sc->nvstore.sectsz; 1442 1443 /* 1444 * DSM calls are advisory only, and compliant controllers 1445 * may choose to take no actions (i.e. return Success). 
1446 */ 1447 if (!nvstore->deallocate) { 1448 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1449 goto out; 1450 } 1451 1452 if (req == NULL) { 1453 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1454 goto out; 1455 } 1456 1457 /* copy locally because a range entry could straddle PRPs */ 1458 range = calloc(1, NVME_MAX_DSM_TRIM); 1459 if (range == NULL) { 1460 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1461 goto out; 1462 } 1463 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 1464 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 1465 1466 req->opc = cmd->opc; 1467 req->cid = cmd->cid; 1468 req->nsid = cmd->nsid; 1469 /* 1470 * If the request is for more than a single range, store 1471 * the ranges in the br_iov. Optimize for the common case 1472 * of a single range. 1473 * 1474 * Note that NVMe Number of Ranges is a zero based value 1475 */ 1476 nr = cmd->cdw10 & 0xff; 1477 1478 req->io_req.br_iovcnt = 0; 1479 req->io_req.br_offset = range[0].starting_lba * sectsz; 1480 req->io_req.br_resid = range[0].length * sectsz; 1481 1482 if (nr == 0) { 1483 req->io_req.br_callback = pci_nvme_io_done; 1484 } else { 1485 struct iovec *iov = req->io_req.br_iov; 1486 1487 for (r = 0; r <= nr; r++) { 1488 iov[r].iov_base = (void *)(range[r].starting_lba * sectsz); 1489 iov[r].iov_len = range[r].length * sectsz; 1490 } 1491 req->io_req.br_callback = pci_nvme_dealloc_sm; 1492 1493 /* 1494 * Use prev_gpaddr to track the current entry and 1495 * prev_size to track the number of entries 1496 */ 1497 req->prev_gpaddr = 0; 1498 req->prev_size = r; 1499 } 1500 1501 err = blockif_delete(nvstore->ctx, &req->io_req); 1502 if (err) 1503 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1504 1505 free(range); 1506 } 1507 out: 1508 return (err); 1509 } 1510 1511 static void 1512 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 1513 { 1514 struct nvme_submission_queue *sq; 1515 uint16_t status; 1516 uint16_t sqhead; 1517 int err; 1518 1519 /* handle all submissions up to sq->tail index */ 1520 sq = &sc->submit_queues[idx]; 1521 1522 if (atomic_testandset_int(&sq->busy, 1)) { 1523 DPRINTF(("%s sqid %u busy", __func__, idx)); 1524 return; 1525 } 1526 1527 sqhead = atomic_load_acq_short(&sq->head); 1528 1529 DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p", 1530 idx, sqhead, sq->tail, sq->qbase)); 1531 1532 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1533 struct nvme_command *cmd; 1534 struct pci_nvme_ioreq *req = NULL; 1535 uint64_t lba; 1536 uint64_t nblocks, bytes, size, cpsz; 1537 1538 /* TODO: support scatter gather list handling */ 1539 1540 cmd = &sq->qbase[sqhead]; 1541 sqhead = (sqhead + 1) % sq->size; 1542 1543 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 1544 1545 if (cmd->opc == NVME_OPC_FLUSH) { 1546 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1547 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1548 status, 1); 1549 1550 continue; 1551 } else if (cmd->opc == 0x08) { 1552 /* TODO: write zeroes */ 1553 WPRINTF(("%s write zeroes lba 0x%lx blocks %u", 1554 __func__, lba, cmd->cdw12 & 0xFFFF)); 1555 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1556 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1557 status, 1); 1558 1559 continue; 1560 } 1561 1562 if (sc->nvstore.type == NVME_STOR_BLOCKIF) { 1563 req = pci_nvme_get_ioreq(sc); 1564 req->nvme_sq = sq; 1565 req->sqid = idx; 1566 } 1567 1568 if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) { 1569 if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, 1570 
&status)) { 1571 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 1572 0, status, 1); 1573 if (req) 1574 pci_nvme_release_ioreq(sc, req); 1575 } 1576 continue; 1577 } 1578 1579 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 1580 1581 bytes = nblocks * sc->nvstore.sectsz; 1582 1583 /* 1584 * If data starts mid-page and flows into the next page, then 1585 * increase page count 1586 */ 1587 1588 DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " 1589 "(%lu-bytes)", 1590 sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, 1591 cmd->opc == NVME_OPC_WRITE ? 1592 "WRITE" : "READ", 1593 lba, nblocks, bytes)); 1594 1595 cmd->prp1 &= ~(0x03UL); 1596 cmd->prp2 &= ~(0x03UL); 1597 1598 DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2)); 1599 1600 size = bytes; 1601 lba *= sc->nvstore.sectsz; 1602 1603 cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); 1604 1605 if (cpsz > bytes) 1606 cpsz = bytes; 1607 1608 if (req != NULL) { 1609 req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | 1610 cmd->cdw10; 1611 req->opc = cmd->opc; 1612 req->cid = cmd->cid; 1613 req->nsid = cmd->nsid; 1614 } 1615 1616 err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, 1617 cmd->opc == NVME_OPC_WRITE, lba); 1618 lba += cpsz; 1619 size -= cpsz; 1620 1621 if (size == 0) 1622 goto iodone; 1623 1624 if (size <= PAGE_SIZE) { 1625 /* prp2 is second (and final) page in transfer */ 1626 1627 err = pci_nvme_append_iov_req(sc, req, cmd->prp2, 1628 size, 1629 cmd->opc == NVME_OPC_WRITE, 1630 lba); 1631 } else { 1632 uint64_t *prp_list; 1633 int i; 1634 1635 /* prp2 is pointer to a physical region page list */ 1636 prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, 1637 cmd->prp2, PAGE_SIZE); 1638 1639 i = 0; 1640 while (size != 0) { 1641 cpsz = MIN(size, PAGE_SIZE); 1642 1643 /* 1644 * Move to linked physical region page list 1645 * in last item. 1646 */ 1647 if (i == (NVME_PRP2_ITEMS-1) && 1648 size > PAGE_SIZE) { 1649 assert((prp_list[i] & (PAGE_SIZE-1)) == 0); 1650 prp_list = paddr_guest2host( 1651 sc->nsc_pi->pi_vmctx, 1652 prp_list[i], PAGE_SIZE); 1653 i = 0; 1654 } 1655 if (prp_list[i] == 0) { 1656 WPRINTF(("PRP2[%d] = 0 !!!", i)); 1657 err = 1; 1658 break; 1659 } 1660 1661 err = pci_nvme_append_iov_req(sc, req, 1662 prp_list[i], cpsz, 1663 cmd->opc == NVME_OPC_WRITE, lba); 1664 if (err) 1665 break; 1666 1667 lba += cpsz; 1668 size -= cpsz; 1669 i++; 1670 } 1671 } 1672 1673 iodone: 1674 if (sc->nvstore.type == NVME_STOR_RAM) { 1675 uint16_t code, status; 1676 1677 code = err ? 
NVME_SC_LBA_OUT_OF_RANGE : 1678 NVME_SC_SUCCESS; 1679 pci_nvme_status_genc(&status, code); 1680 1681 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1682 status, 1); 1683 1684 continue; 1685 } 1686 1687 1688 if (err) 1689 goto do_error; 1690 1691 req->io_req.br_callback = pci_nvme_io_done; 1692 1693 err = 0; 1694 switch (cmd->opc) { 1695 case NVME_OPC_READ: 1696 err = blockif_read(sc->nvstore.ctx, &req->io_req); 1697 break; 1698 case NVME_OPC_WRITE: 1699 err = blockif_write(sc->nvstore.ctx, &req->io_req); 1700 break; 1701 default: 1702 WPRINTF(("%s unhandled io command 0x%x", 1703 __func__, cmd->opc)); 1704 err = 1; 1705 } 1706 1707 do_error: 1708 if (err) { 1709 uint16_t status; 1710 1711 pci_nvme_status_genc(&status, 1712 NVME_SC_DATA_TRANSFER_ERROR); 1713 1714 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 1715 status, 1); 1716 pci_nvme_release_ioreq(sc, req); 1717 } 1718 } 1719 1720 atomic_store_short(&sq->head, sqhead); 1721 atomic_store_int(&sq->busy, 0); 1722 } 1723 1724 static void 1725 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 1726 uint64_t idx, int is_sq, uint64_t value) 1727 { 1728 DPRINTF(("nvme doorbell %lu, %s, val 0x%lx", 1729 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); 1730 1731 if (is_sq) { 1732 atomic_store_short(&sc->submit_queues[idx].tail, 1733 (uint16_t)value); 1734 1735 if (idx == 0) { 1736 pci_nvme_handle_admin_cmd(sc, value); 1737 } else { 1738 /* submission queue; handle new entries in SQ */ 1739 if (idx > sc->num_squeues) { 1740 WPRINTF(("%s SQ index %lu overflow from " 1741 "guest (max %u)", 1742 __func__, idx, sc->num_squeues)); 1743 return; 1744 } 1745 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1746 } 1747 } else { 1748 if (idx > sc->num_cqueues) { 1749 WPRINTF(("%s queue index %lu overflow from " 1750 "guest (max %u)", 1751 __func__, idx, sc->num_cqueues)); 1752 return; 1753 } 1754 1755 sc->compl_queues[idx].head = (uint16_t)value; 1756 } 1757 } 1758 1759 static void 1760 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1761 { 1762 const char *s = iswrite ? 
"WRITE" : "READ"; 1763 1764 switch (offset) { 1765 case NVME_CR_CAP_LOW: 1766 DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s)); 1767 break; 1768 case NVME_CR_CAP_HI: 1769 DPRINTF(("%s %s NVME_CR_CAP_HI", func, s)); 1770 break; 1771 case NVME_CR_VS: 1772 DPRINTF(("%s %s NVME_CR_VS", func, s)); 1773 break; 1774 case NVME_CR_INTMS: 1775 DPRINTF(("%s %s NVME_CR_INTMS", func, s)); 1776 break; 1777 case NVME_CR_INTMC: 1778 DPRINTF(("%s %s NVME_CR_INTMC", func, s)); 1779 break; 1780 case NVME_CR_CC: 1781 DPRINTF(("%s %s NVME_CR_CC", func, s)); 1782 break; 1783 case NVME_CR_CSTS: 1784 DPRINTF(("%s %s NVME_CR_CSTS", func, s)); 1785 break; 1786 case NVME_CR_NSSR: 1787 DPRINTF(("%s %s NVME_CR_NSSR", func, s)); 1788 break; 1789 case NVME_CR_AQA: 1790 DPRINTF(("%s %s NVME_CR_AQA", func, s)); 1791 break; 1792 case NVME_CR_ASQ_LOW: 1793 DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s)); 1794 break; 1795 case NVME_CR_ASQ_HI: 1796 DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s)); 1797 break; 1798 case NVME_CR_ACQ_LOW: 1799 DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s)); 1800 break; 1801 case NVME_CR_ACQ_HI: 1802 DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s)); 1803 break; 1804 default: 1805 DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset)); 1806 } 1807 1808 } 1809 1810 static void 1811 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 1812 uint64_t offset, int size, uint64_t value) 1813 { 1814 uint32_t ccreg; 1815 1816 if (offset >= NVME_DOORBELL_OFFSET) { 1817 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 1818 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 1819 int is_sq = (belloffset % 8) < 4; 1820 1821 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 1822 WPRINTF(("guest attempted an overflow write offset " 1823 "0x%lx, val 0x%lx in %s", 1824 offset, value, __func__)); 1825 return; 1826 } 1827 1828 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 1829 return; 1830 } 1831 1832 DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx", 1833 offset, size, value)); 1834 1835 if (size != 4) { 1836 WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " 1837 "val 0x%lx) to bar0 in %s", 1838 size, offset, value, __func__)); 1839 /* TODO: shutdown device */ 1840 return; 1841 } 1842 1843 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 1844 1845 pthread_mutex_lock(&sc->mtx); 1846 1847 switch (offset) { 1848 case NVME_CR_CAP_LOW: 1849 case NVME_CR_CAP_HI: 1850 /* readonly */ 1851 break; 1852 case NVME_CR_VS: 1853 /* readonly */ 1854 break; 1855 case NVME_CR_INTMS: 1856 /* MSI-X, so ignore */ 1857 break; 1858 case NVME_CR_INTMC: 1859 /* MSI-X, so ignore */ 1860 break; 1861 case NVME_CR_CC: 1862 ccreg = (uint32_t)value; 1863 1864 DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 1865 "iocqes %u", 1866 __func__, 1867 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 1868 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 1869 NVME_CC_GET_IOCQES(ccreg))); 1870 1871 if (NVME_CC_GET_SHN(ccreg)) { 1872 /* perform shutdown - flush out data to backend */ 1873 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 1874 NVME_CSTS_REG_SHST_SHIFT); 1875 sc->regs.csts |= NVME_SHST_COMPLETE << 1876 NVME_CSTS_REG_SHST_SHIFT; 1877 } 1878 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 1879 if (NVME_CC_GET_EN(ccreg) == 0) 1880 /* transition 1-> causes controller reset */ 1881 pci_nvme_reset_locked(sc); 1882 else 1883 pci_nvme_init_controller(ctx, sc); 1884 } 1885 1886 /* Insert the iocqes, iosqes and en bits from the write */ 1887 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 1888 sc->regs.cc |= ccreg & 
NVME_CC_WRITE_MASK; 1889 if (NVME_CC_GET_EN(ccreg) == 0) { 1890 /* Insert the ams, mps and css bit fields */ 1891 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 1892 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 1893 sc->regs.csts &= ~NVME_CSTS_RDY; 1894 } else if (sc->pending_ios == 0) { 1895 sc->regs.csts |= NVME_CSTS_RDY; 1896 } 1897 break; 1898 case NVME_CR_CSTS: 1899 break; 1900 case NVME_CR_NSSR: 1901 /* ignore writes; don't support subsystem reset */ 1902 break; 1903 case NVME_CR_AQA: 1904 sc->regs.aqa = (uint32_t)value; 1905 break; 1906 case NVME_CR_ASQ_LOW: 1907 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 1908 (0xFFFFF000 & value); 1909 break; 1910 case NVME_CR_ASQ_HI: 1911 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 1912 (value << 32); 1913 break; 1914 case NVME_CR_ACQ_LOW: 1915 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 1916 (0xFFFFF000 & value); 1917 break; 1918 case NVME_CR_ACQ_HI: 1919 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 1920 (value << 32); 1921 break; 1922 default: 1923 DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d", 1924 __func__, offset, value, size)); 1925 } 1926 pthread_mutex_unlock(&sc->mtx); 1927 } 1928 1929 static void 1930 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 1931 int baridx, uint64_t offset, int size, uint64_t value) 1932 { 1933 struct pci_nvme_softc* sc = pi->pi_arg; 1934 1935 if (baridx == pci_msix_table_bar(pi) || 1936 baridx == pci_msix_pba_bar(pi)) { 1937 DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " 1938 " value 0x%lx", baridx, offset, size, value)); 1939 1940 pci_emul_msix_twrite(pi, offset, size, value); 1941 return; 1942 } 1943 1944 switch (baridx) { 1945 case 0: 1946 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 1947 break; 1948 1949 default: 1950 DPRINTF(("%s unknown baridx %d, val 0x%lx", 1951 __func__, baridx, value)); 1952 } 1953 } 1954 1955 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 1956 uint64_t offset, int size) 1957 { 1958 uint64_t value; 1959 1960 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 1961 1962 if (offset < NVME_DOORBELL_OFFSET) { 1963 void *p = &(sc->regs); 1964 pthread_mutex_lock(&sc->mtx); 1965 memcpy(&value, (void *)((uintptr_t)p + offset), size); 1966 pthread_mutex_unlock(&sc->mtx); 1967 } else { 1968 value = 0; 1969 WPRINTF(("pci_nvme: read invalid offset %ld", offset)); 1970 } 1971 1972 switch (size) { 1973 case 1: 1974 value &= 0xFF; 1975 break; 1976 case 2: 1977 value &= 0xFFFF; 1978 break; 1979 case 4: 1980 value &= 0xFFFFFFFF; 1981 break; 1982 } 1983 1984 DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x", 1985 offset, size, (uint32_t)value)); 1986 1987 return (value); 1988 } 1989 1990 1991 1992 static uint64_t 1993 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 1994 uint64_t offset, int size) 1995 { 1996 struct pci_nvme_softc* sc = pi->pi_arg; 1997 1998 if (baridx == pci_msix_table_bar(pi) || 1999 baridx == pci_msix_pba_bar(pi)) { 2000 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2001 baridx, offset, size)); 2002 2003 return pci_emul_msix_tread(pi, offset, size); 2004 } 2005 2006 switch (baridx) { 2007 case 0: 2008 return pci_nvme_read_bar_0(sc, offset, size); 2009 2010 default: 2011 DPRINTF(("unknown bar %d, 0x%lx", baridx, offset)); 2012 } 2013 2014 return (0); 2015 } 2016 2017 2018 static int 2019 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 2020 { 2021 char bident[sizeof("XX:X:X")]; 2022 char *uopt, *xopts, *config; 2023 uint32_t 
static int
pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
{
	char bident[sizeof("XX:X:X")];
	char *uopt, *xopts, *config;
	uint32_t sectsz;
	int optidx;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;

	uopt = strdup(opts);
	optidx = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	for (xopts = strtok(uopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {

		if ((config = strchr(xopts, '=')) != NULL)
			*config++ = '\0';

		if (!strcmp("maxq", xopts)) {
			sc->max_queues = atoi(config);
		} else if (!strcmp("qsz", xopts)) {
			sc->max_qentries = atoi(config);
		} else if (!strcmp("ioslots", xopts)) {
			sc->ioslots = atoi(config);
		} else if (!strcmp("sectsz", xopts)) {
			sectsz = atoi(config);
		} else if (!strcmp("ser", xopts)) {
			/*
			 * This field indicates the Product Serial Number in
			 * 7-bit ASCII; unused bytes should be space
			 * characters.  Ref: NVMe v1.3c.
			 */
			cpywithpad((char *)sc->ctrldata.sn,
			    sizeof(sc->ctrldata.sn), config, ' ');
		} else if (!strcmp("ram", xopts)) {
			/* &xopts[4] == config: the digits after "ram=" */
			uint64_t sz = strtoull(&xopts[4], NULL, 10);

			sc->nvstore.type = NVME_STOR_RAM;
			sc->nvstore.size = sz * 1024 * 1024;
			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
			sc->nvstore.sectsz = 4096;
			sc->nvstore.sectsz_bits = 12;
			if (sc->nvstore.ctx == NULL) {
				perror("Unable to allocate RAM");
				free(uopt);
				return (-1);
			}
		} else if (!strcmp("eui64", xopts)) {
			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
		} else if (!strcmp("dsm", xopts)) {
			if (!strcmp("auto", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
			else if (!strcmp("enable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
			else if (!strcmp("disable", config))
				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
		} else if (optidx == 0) {
			snprintf(bident, sizeof(bident), "%d:%d",
			    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
			sc->nvstore.ctx = blockif_open(xopts, bident);
			if (sc->nvstore.ctx == NULL) {
				perror("Could not open backing file");
				free(uopt);
				return (-1);
			}
			sc->nvstore.type = NVME_STOR_BLOCKIF;
			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
		} else {
			EPRINTLN("Invalid option %s", xopts);
			free(uopt);
			return (-1);
		}

		optidx++;
	}
	free(uopt);

	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
		EPRINTLN("backing store not specified");
		return (-1);
	}
	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	if (sc->max_qentries <= 0) {
		EPRINTLN("Invalid qsz option");
		return (-1);
	}
	if (sc->ioslots <= 0) {
		EPRINTLN("Invalid ioslots option");
		return (-1);
	}

	return (0);
}
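/*
 * Summary comment (added for clarity): device model init entry point.  It
 * parses the options, builds the free list of ioreq slots, programs the PCI
 * config-space IDs and class codes, sizes and allocates BAR0 along with the
 * MSI-X and PCI Express capabilities, then resets the emulated controller
 * and populates the namespace, controller and log-page data.
 */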
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_opts(sc, opts);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
	}
	sc->intr_coales_aggr_thresh = 1;

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.  (With the 16-queue maximum this sum is only a few KiB, so
	 * the MAX() below rounds the BAR up to NVME_MMIO_SPACE_MIN.)
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF(("nvme membar size: %u", pci_membar_sz));

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF(("%s pci alloc mem bar failed", __func__));
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF(("%s pci add msixcap failed", __func__));
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF(("%s pci add Express capability failed", __func__));
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_reset(sc);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);
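/*
 * Note (added for clarity): the pci_devemu registration above hooks this
 * model into bhyve's PCI emulation core; pe_init runs when the device is
 * instantiated, and pe_barread/pe_barwrite dispatch guest accesses to BAR0
 * and the MSI-X BAR to the handlers defined in this file.
 */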