1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * 7 * Function crc16 Copyright (c) 2017, Fedor Uporov 8 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 /* 33 * bhyve PCIe-NVMe device emulation. 
34 * 35 * options: 36 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=# 37 * 38 * accepted devpath: 39 * /dev/blockdev 40 * /path/to/image 41 * ram=size_in_MiB 42 * 43 * maxq = max number of queues 44 * qsz = max elements in each queue 45 * ioslots = max number of concurrent io requests 46 * sectsz = sector size (defaults to blockif sector size) 47 * ser = serial number (20-chars max) 48 * eui64 = IEEE Extended Unique Identifier (8 byte value) 49 * 50 */ 51 52 /* TODO: 53 - create async event for smart and log 54 - intr coalesce 55 */ 56 57 #include <sys/cdefs.h> 58 __FBSDID("$FreeBSD$"); 59 60 #include <sys/types.h> 61 #include <net/ieee_oui.h> 62 #ifndef __FreeBSD__ 63 #include <endian.h> 64 #endif 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <semaphore.h> 69 #include <stdbool.h> 70 #include <stddef.h> 71 #include <stdint.h> 72 #include <stdio.h> 73 #include <stdlib.h> 74 #include <string.h> 75 76 #include <machine/atomic.h> 77 #include <machine/vmm.h> 78 #include <vmmapi.h> 79 80 #include <dev/nvme/nvme.h> 81 82 #include "bhyverun.h" 83 #include "block_if.h" 84 #include "debug.h" 85 #include "pci_emul.h" 86 87 88 static int nvme_debug = 0; 89 #define DPRINTF(params) if (nvme_debug) PRINTLN params 90 #define WPRINTF(params) PRINTLN params 91 92 /* defaults; can be overridden */ 93 #define NVME_MSIX_BAR 4 94 95 #define NVME_IOSLOTS 8 96 97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 98 #define NVME_MMIO_SPACE_MIN (1 << 14) 99 100 #define NVME_QUEUES 16 101 #define NVME_MAX_QENTRIES 2048 102 103 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 104 #define NVME_MAX_BLOCKIOVS 512 105 106 /* This is a synthetic status code to indicate there is no status */ 107 #define NVME_NO_STATUS 0xffff 108 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 109 110 /* helpers */ 111 112 /* Convert a zero-based value into a one-based value */ 113 #define ONE_BASED(zero) ((zero) + 1) 114 /* Convert a one-based value 
into a zero-based value */ 115 #define ZERO_BASED(one) ((one) - 1) 116 117 /* Encode number of SQ's and CQ's for Set/Get Features */ 118 #define NVME_FEATURE_NUM_QUEUES(sc) \ 119 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 120 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 121 122 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 123 124 enum nvme_controller_register_offsets { 125 NVME_CR_CAP_LOW = 0x00, 126 NVME_CR_CAP_HI = 0x04, 127 NVME_CR_VS = 0x08, 128 NVME_CR_INTMS = 0x0c, 129 NVME_CR_INTMC = 0x10, 130 NVME_CR_CC = 0x14, 131 NVME_CR_CSTS = 0x1c, 132 NVME_CR_NSSR = 0x20, 133 NVME_CR_AQA = 0x24, 134 NVME_CR_ASQ_LOW = 0x28, 135 NVME_CR_ASQ_HI = 0x2c, 136 NVME_CR_ACQ_LOW = 0x30, 137 NVME_CR_ACQ_HI = 0x34, 138 }; 139 140 enum nvme_cmd_cdw11 { 141 NVME_CMD_CDW11_PC = 0x0001, 142 NVME_CMD_CDW11_IEN = 0x0002, 143 NVME_CMD_CDW11_IV = 0xFFFF0000, 144 }; 145 146 enum nvme_copy_dir { 147 NVME_COPY_TO_PRP, 148 NVME_COPY_FROM_PRP, 149 }; 150 151 #define NVME_CQ_INTEN 0x01 152 #define NVME_CQ_INTCOAL 0x02 153 154 struct nvme_completion_queue { 155 struct nvme_completion *qbase; 156 uint32_t size; 157 uint16_t tail; /* nvme progress */ 158 uint16_t head; /* guest progress */ 159 uint16_t intr_vec; 160 uint32_t intr_en; 161 pthread_mutex_t mtx; 162 }; 163 164 struct nvme_submission_queue { 165 struct nvme_command *qbase; 166 uint32_t size; 167 uint16_t head; /* nvme progress */ 168 uint16_t tail; /* guest progress */ 169 uint16_t cqid; /* completion queue id */ 170 int busy; /* queue is being processed */ 171 int qpriority; 172 }; 173 174 enum nvme_storage_type { 175 NVME_STOR_BLOCKIF = 0, 176 NVME_STOR_RAM = 1, 177 }; 178 179 struct pci_nvme_blockstore { 180 enum nvme_storage_type type; 181 void *ctx; 182 uint64_t size; 183 uint32_t sectsz; 184 uint32_t sectsz_bits; 185 uint64_t eui64; 186 uint32_t deallocate:1; 187 }; 188 189 struct pci_nvme_ioreq { 190 struct pci_nvme_softc *sc; 191 STAILQ_ENTRY(pci_nvme_ioreq) link; 192 struct nvme_submission_queue 
*nvme_sq; 193 uint16_t sqid; 194 195 /* command information */ 196 uint16_t opc; 197 uint16_t cid; 198 uint32_t nsid; 199 200 uint64_t prev_gpaddr; 201 size_t prev_size; 202 203 /* 204 * lock if all iovs consumed (big IO); 205 * complete transaction before continuing 206 */ 207 pthread_mutex_t mtx; 208 pthread_cond_t cv; 209 210 struct blockif_req io_req; 211 212 /* pad to fit up to 512 page descriptors from guest IO request */ 213 struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; 214 }; 215 216 enum nvme_dsm_type { 217 /* Dataset Management bit in ONCS reflects backing storage capability */ 218 NVME_DATASET_MANAGEMENT_AUTO, 219 /* Unconditionally set Dataset Management bit in ONCS */ 220 NVME_DATASET_MANAGEMENT_ENABLE, 221 /* Unconditionally clear Dataset Management bit in ONCS */ 222 NVME_DATASET_MANAGEMENT_DISABLE, 223 }; 224 225 struct pci_nvme_softc { 226 struct pci_devinst *nsc_pi; 227 228 pthread_mutex_t mtx; 229 230 struct nvme_registers regs; 231 232 struct nvme_namespace_data nsdata; 233 struct nvme_controller_data ctrldata; 234 struct nvme_error_information_entry err_log; 235 struct nvme_health_information_page health_log; 236 struct nvme_firmware_page fw_log; 237 238 struct pci_nvme_blockstore nvstore; 239 240 uint16_t max_qentries; /* max entries per queue */ 241 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 242 uint32_t num_cqueues; 243 uint32_t num_squeues; 244 245 struct pci_nvme_ioreq *ioreqs; 246 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 247 uint32_t pending_ios; 248 uint32_t ioslots; 249 sem_t iosemlock; 250 251 /* 252 * Memory mapped Submission and Completion queues 253 * Each array includes both Admin and IO queues 254 */ 255 struct nvme_completion_queue *compl_queues; 256 struct nvme_submission_queue *submit_queues; 257 258 /* controller features */ 259 uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ 260 uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ 261 uint32_t 
async_ev_config;   /* 0x0B: async event config */

	enum nvme_dsm_type dataset_management;
};


static void pci_nvme_io_partial(struct blockif_req *br, int err);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

/* CC bits the guest may write at any time */
#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

/* CC bits writable only while the controller is Not ENabled */
#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

/*
 * Copy 'src' into the fixed-width field 'dst', filling any remaining bytes
 * with 'pad' (identify-data string fields are space-padded, not
 * NUL-terminated).  'src' longer than 'dst_size' is silently truncated.
 */
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

/*
 * Set the Status Code Type and Status Code fields of a completion status
 * word, preserving the other bits (e.g. the Phase bit).
 */
static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

/* Convenience wrapper for Generic Command status codes */
static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/* Flip the completion Phase bit to the opposite of 'prev' */
static __inline void
pci_nvme_toggle_phase(uint16_t *status, int prev)
{

	if (prev)
		*status &= ~NVME_STATUS_P;
	else
		*status |= NVME_STATUS_P;
}

/*
 * Populate the Identify Controller data structure with this emulation's
 * fixed capabilities (single namespace, NVMe 1.3, optional DSM support).
 */
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab   = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states support */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	/* SQ entries are 2^6 = 64 bytes, CQ entries 2^4 = 16 bytes */
	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	/* Advertise DSM per policy (AUTO follows backing-store capability) */
	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	/* Format NVM: supports secure erase of all namespaces */
	cd->fna = 0x03;

	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	/* Standard byte-at-a-time table-driven CRC update */
	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

/*
 * Populate the Identify Namespace data structure for 'nsid' from the
 * backing store geometry; also probes blockif for TRIM/delete support.
 */
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e.
 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		/* Derive a stable value from VM name + PCI b/s/f */
		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		/* Low 16 bits carry the namespace id */
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

/* Clear all log pages reported via Get Log Page */
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
}

/*
 * Reset controller registers and queue state to power-on values.
 * Caller must hold sc->mtx (see pci_nvme_reset()).
 */
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	DPRINTF(("%s", __func__));

	/* MQES (zero-based), contiguous queues required, 30 sec timeout */
	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	sc->num_cqueues = sc->num_squeues = sc->max_queues;
	if (sc->submit_queues != NULL) {
		for (int i = 0; i < sc->num_squeues + 1; i++) {
			/*
			 * The Admin Submission Queue is at index 0.
			 * It must not be changed at reset otherwise the
			 * emulation will be out of sync with the guest.
			 */
			if (i != 0) {
				sc->submit_queues[i].qbase = NULL;
				sc->submit_queues[i].size = 0;
				sc->submit_queues[i].cqid = 0;
			}
			sc->submit_queues[i].tail = 0;
			sc->submit_queues[i].head = 0;
			sc->submit_queues[i].busy = 0;
		}
	} else
		/* NOTE(review): calloc result is not checked for NULL */
		sc->submit_queues = calloc(sc->num_squeues + 1,
		    sizeof(struct nvme_submission_queue));

	if (sc->compl_queues != NULL) {
		for (int i = 0; i < sc->num_cqueues + 1; i++) {
			/* See Admin Submission Queue note above */
			if (i != 0) {
				sc->compl_queues[i].qbase = NULL;
				sc->compl_queues[i].size = 0;
			}

			sc->compl_queues[i].tail = 0;
			sc->compl_queues[i].head = 0;
		}
	} else {
		/* NOTE(review): calloc result is not checked for NULL */
		sc->compl_queues = calloc(sc->num_cqueues + 1,
		    sizeof(struct nvme_completion_queue));

		for (int i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
	}
}

/* Locked wrapper around pci_nvme_reset_locked() */
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Map the guest's Admin SQ and CQ (from AQA/ASQ/ACQ registers) into host
 * memory.  Called when the guest sets CC.EN.
 */
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF(("%s", __func__));

	/* AQA sizes are zero-based */
	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	            sizeof(struct nvme_command) * asqs);

	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	         sizeof(struct nvme_completion) * acqs);
	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1,
    uint64_t prp2, uint8_t *b,
	size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	/* Two PRP entries cover at most 2 pages; reject larger transfers */
	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	/* Remainder comes from the single page addressed by prp2 */
	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Delete I/O Submission Queue (admin opcode 0x00).
 * Returns 1 (completion is valid); status in compl->status.
 */
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
	/* qid 0 is the Admin SQ and may not be deleted */
	if (qid == 0 || qid > sc->num_squeues) {
		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
		        __func__, qid, sc->num_squeues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

/*
 * Create I/O Submission Queue (admin opcode 0x01).
 * Only physically-contiguous queues are supported.
 */
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues)) {
			WPRINTF(("%s queue index %u > num_squeues %u",
			        __func__, qid, sc->num_squeues));
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		/* Queue size in CDW10 31:16 is zero-based */
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		              sizeof(struct nvme_command) * (size_t)nsq->size);
		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
		        qid, nsq->size, nsq->qbase, nsq->cqid));

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF(("%s completed creating IOSQ qid %u",
		         __func__, qid));
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF(("%s unsupported non-contig (list-based) "
		         "create i/o submission queue", __func__));

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

/*
 * Delete I/O Completion Queue (admin opcode 0x04).
 * qid 0 is the Admin CQ and may not be deleted.
 */
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
	if (qid == 0 || qid > sc->num_cqueues) {
		WPRINTF(("%s queue index %u / num_cqueues %u",
		        __func__, qid, sc->num_cqueues));
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

/*
 * Create I/O Completion Queue (admin opcode 0x05).
 * Only physically-contiguous queues are supported.
 */
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_completion_queue *ncq;

		if ((qid == 0) || (qid > sc->num_cqueues)) {
			WPRINTF(("%s queue index %u > num_cqueues %u",
			        __func__, qid, sc->num_cqueues));
			pci_nvme_status_tc(&compl->status,
NVME_SCT_COMMAND_SPECIFIC, 732 NVME_SC_INVALID_QUEUE_IDENTIFIER); 733 return (1); 734 } 735 736 ncq = &sc->compl_queues[qid]; 737 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 738 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 739 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 740 741 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 742 command->prp1, 743 sizeof(struct nvme_command) * (size_t)ncq->size); 744 745 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 746 } else { 747 /* 748 * Non-contig completion queue unsupported. 749 */ 750 WPRINTF(("%s unsupported non-contig (list-based) " 751 "create i/o completion queue", 752 __func__)); 753 754 /* 0x12 = Invalid Use of Controller Memory Buffer */ 755 pci_nvme_status_genc(&compl->status, 0x12); 756 } 757 758 return (1); 759 } 760 761 static int 762 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 763 struct nvme_completion* compl) 764 { 765 uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; 766 uint8_t logpage = command->cdw10 & 0xFF; 767 768 DPRINTF(("%s log page %u len %u", __func__, logpage, logsize)); 769 770 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 771 772 switch (logpage) { 773 case NVME_LOG_ERROR: 774 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 775 command->prp2, (uint8_t *)&sc->err_log, logsize, 776 NVME_COPY_TO_PRP); 777 break; 778 case NVME_LOG_HEALTH_INFORMATION: 779 /* TODO: present some smart info */ 780 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 781 command->prp2, (uint8_t *)&sc->health_log, logsize, 782 NVME_COPY_TO_PRP); 783 break; 784 case NVME_LOG_FIRMWARE_SLOT: 785 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 786 command->prp2, (uint8_t *)&sc->fw_log, logsize, 787 NVME_COPY_TO_PRP); 788 break; 789 default: 790 WPRINTF(("%s get log page %x command not supported", 791 __func__, logpage)); 792 793 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 794 NVME_SC_INVALID_LOG_PAGE); 795 
} 796 797 return (1); 798 } 799 800 static int 801 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 802 struct nvme_completion* compl) 803 { 804 void *dest; 805 806 DPRINTF(("%s identify 0x%x nsid 0x%x", __func__, 807 command->cdw10 & 0xFF, command->nsid)); 808 809 switch (command->cdw10 & 0xFF) { 810 case 0x00: /* return Identify Namespace data structure */ 811 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 812 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 813 NVME_COPY_TO_PRP); 814 break; 815 case 0x01: /* return Identify Controller data structure */ 816 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 817 command->prp2, (uint8_t *)&sc->ctrldata, 818 sizeof(sc->ctrldata), 819 NVME_COPY_TO_PRP); 820 break; 821 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 822 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 823 sizeof(uint32_t) * 1024); 824 ((uint32_t *)dest)[0] = 1; 825 ((uint32_t *)dest)[1] = 0; 826 break; 827 case 0x11: 828 pci_nvme_status_genc(&compl->status, 829 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 830 return (1); 831 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 832 case 0x10: 833 case 0x12: 834 case 0x13: 835 case 0x14: 836 case 0x15: 837 default: 838 DPRINTF(("%s unsupported identify command requested 0x%x", 839 __func__, command->cdw10 & 0xFF)); 840 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 841 return (1); 842 } 843 844 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 845 return (1); 846 } 847 848 static int 849 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, 850 struct nvme_completion* compl) 851 { 852 uint16_t nqr; /* Number of Queues Requested */ 853 854 nqr = command->cdw11 & 0xFFFF; 855 if (nqr == 0xffff) { 856 WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr)); 857 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 858 return (-1); 859 } 860 861 sc->num_squeues = ONE_BASED(nqr); 862 if 
(sc->num_squeues > sc->max_queues) { 863 DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues, 864 sc->max_queues)); 865 sc->num_squeues = sc->max_queues; 866 } 867 868 nqr = (command->cdw11 >> 16) & 0xFFFF; 869 if (nqr == 0xffff) { 870 WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr)); 871 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 872 return (-1); 873 } 874 875 sc->num_cqueues = ONE_BASED(nqr); 876 if (sc->num_cqueues > sc->max_queues) { 877 DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues, 878 sc->max_queues)); 879 sc->num_cqueues = sc->max_queues; 880 } 881 882 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 883 884 return (0); 885 } 886 887 static int 888 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, 889 struct nvme_completion* compl) 890 { 891 int feature = command->cdw10 & 0xFF; 892 uint32_t iv; 893 894 DPRINTF(("%s feature 0x%x", __func__, feature)); 895 compl->cdw0 = 0; 896 897 switch (feature) { 898 case NVME_FEAT_ARBITRATION: 899 DPRINTF((" arbitration 0x%x", command->cdw11)); 900 break; 901 case NVME_FEAT_POWER_MANAGEMENT: 902 DPRINTF((" power management 0x%x", command->cdw11)); 903 break; 904 case NVME_FEAT_LBA_RANGE_TYPE: 905 DPRINTF((" lba range 0x%x", command->cdw11)); 906 break; 907 case NVME_FEAT_TEMPERATURE_THRESHOLD: 908 DPRINTF((" temperature threshold 0x%x", command->cdw11)); 909 break; 910 case NVME_FEAT_ERROR_RECOVERY: 911 DPRINTF((" error recovery 0x%x", command->cdw11)); 912 break; 913 case NVME_FEAT_VOLATILE_WRITE_CACHE: 914 DPRINTF((" volatile write cache 0x%x", command->cdw11)); 915 break; 916 case NVME_FEAT_NUMBER_OF_QUEUES: 917 nvme_set_feature_queues(sc, command, compl); 918 break; 919 case NVME_FEAT_INTERRUPT_COALESCING: 920 DPRINTF((" interrupt coalescing 0x%x", command->cdw11)); 921 922 /* in uS */ 923 sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; 924 925 sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; 926 break; 927 case 
 NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		iv = command->cdw11 & 0xFFFF;

		DPRINTF((" interrupt vector configuration 0x%x",
		        command->cdw11));

		/* Apply Coalescing Disable (CDW11 bit 16) to every CQ on
		 * the matching interrupt vector */
		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
			if (sc->compl_queues[i].intr_vec == iv) {
				if (command->cdw11 & (1 << 16))
					sc->compl_queues[i].intr_en |=
					                      NVME_CQ_INTCOAL;
				else
					sc->compl_queues[i].intr_en &=
					                     ~NVME_CQ_INTCOAL;
			}
		}
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity 0x%x", command->cdw11));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration 0x%x",
		        command->cdw11));
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker 0x%x",
		        command->cdw11));
		break;
	case 0x0C:
		DPRINTF((" autonomous power state transition 0x%x",
		        command->cdw11));
		break;
	default:
		WPRINTF(("%s invalid feature", __func__));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

/*
 * Get Features (admin opcode 0x0A): report current feature values in
 * compl->cdw0.  Most features report zero.
 */
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	int feature = command->cdw10 & 0xFF;

	DPRINTF(("%s feature 0x%x", __func__, feature));

	compl->cdw0 = 0;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
		DPRINTF((" arbitration"));
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		DPRINTF((" power management"));
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		DPRINTF((" lba range"));
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		DPRINTF((" temperature threshold"));
		/* THSEL (CDW11 21:20) selects over/under threshold */
		switch ((command->cdw11 >> 20) & 0x3) {
		case 0:
			/* Over temp threshold */
			compl->cdw0 = 0xFFFF;
			break;
		case 1:
			/* Under temp threshold */
			compl->cdw0 = 0;
			break;
		default:
			WPRINTF((" invalid threshold type select"));
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			return (1);
		}
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		DPRINTF((" error recovery"));
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		DPRINTF((" volatile write cache"));
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

		DPRINTF((" number of queues (submit %u, completion %u)",
		        compl->cdw0 & 0xFFFF,
		        (compl->cdw0 >> 16) & 0xFFFF));

		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		DPRINTF((" interrupt coalescing"));
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		DPRINTF((" interrupt vector configuration"));
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		DPRINTF((" write atomicity"));
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		DPRINTF((" async event configuration"));
		/*
		 * NOTE(review): a Get Features handler writing controller
		 * state from CDW11 looks wrong - Get should be read-only;
		 * confirm against the Set Features path.
		 */
		sc->async_ev_config = command->cdw11;
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		DPRINTF((" software progress marker"));
		break;
	case 0x0C:
		DPRINTF((" autonomous power state transition"));
		break;
	default:
		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

/*
 * Abort (admin opcode 0x08).  The abort itself is not implemented; DW0
 * bit 0 set to 1 reports "command not aborted" per the spec.
 */
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

#ifdef __FreeBSD__
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
#ifdef __FreeBSD__
/*
 * Admin Asynchronous Event Request handler.
 *
 * No events are ever raised, so the request is rejected with
 * ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED. Returns 0: no completion is
 * posted now (AERs complete only when an event actually occurs).
 */
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));

	/*
	 * TODO: raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
	return (0);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif /* __FreeBSD__ */

/*
 * Process all pending entries on the admin submission queue (SQ 0),
 * from the current head up to the doorbell-written tail, posting a
 * completion on CQ 0 for each command that produced one.
 *
 * 'value' is the raw doorbell value (new SQ tail); it is only used for
 * the debug print — the tail itself is re-read atomically below.
 */
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF(("%s index %u", __func__, (uint32_t)value));

	/* Admin commands always use SQ 0 / CQ 0 */
	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	sqhead = atomic_load_acq_short(&sq->head);

	/* Single-consumer guard: bail if another thread is draining SQ 0 */
	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s SQ busy, head %u, tail %u",
		        __func__, sqhead, sq->tail));
		return;
	}

	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));

	/* Re-read the tail each iteration so new doorbell writes are seen */
	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF(("%s command DELETE_IO_SQ", __func__));
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF(("%s command CREATE_IO_SQ", __func__));
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF(("%s command DELETE_IO_CQ", __func__));
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF(("%s command CREATE_IO_CQ", __func__));
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF(("%s command GET_LOG_PAGE", __func__));
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF(("%s command IDENTIFY", __func__));
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF(("%s command ABORT", __func__));
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF(("%s command SET_FEATURES", __func__));
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF(("%s command GET_FEATURES", __func__));
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
			/* XXX dont care, unhandled for now
			nvme_opc_async_event_req(sc, cmd, &compl);
			*/
			/* NVME_NO_STATUS suppresses the completion entry */
			compl.status = NVME_NO_STATUS;
			break;
		default:
			WPRINTF(("0x%x command is not implemented",
			    cmd->opc));
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			struct nvme_completion *cp;
			int phase;

			cp = &(cq->qbase)[cq->tail];
			cp->cdw0 = compl.cdw0;
			cp->sqid = 0;
			cp->sqhd = sqhead;
			cp->cid = cmd->cid;

			/*
			 * Flip the phase tag relative to the entry's prior
			 * value so the guest can detect the new completion.
			 * The phase must be read before status is overwritten.
			 */
			phase = NVME_STATUS_GET_P(cp->status);
			cp->status = compl.status;
			pci_nvme_toggle_phase(&cp->status, phase);

			cq->tail = (cq->tail + 1) % cq->size;
		}
	}

	DPRINTF(("setting sqhead %u", sqhead));
	/* Publish the consumed head, then release the busy flag */
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);

	/* Admin CQ interrupts always use MSI-X vector 0 */
	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

}
iovidx; 1197 1198 if (req != NULL) { 1199 /* concatenate contig block-iovs to minimize number of iovs */ 1200 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1201 iovidx = req->io_req.br_iovcnt - 1; 1202 1203 req->io_req.br_iov[iovidx].iov_base = 1204 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1205 req->prev_gpaddr, size); 1206 1207 req->prev_size += size; 1208 req->io_req.br_resid += size; 1209 1210 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1211 } else { 1212 pthread_mutex_lock(&req->mtx); 1213 1214 iovidx = req->io_req.br_iovcnt; 1215 if (iovidx == NVME_MAX_BLOCKIOVS) { 1216 int err = 0; 1217 1218 DPRINTF(("large I/O, doing partial req")); 1219 1220 iovidx = 0; 1221 req->io_req.br_iovcnt = 0; 1222 1223 req->io_req.br_callback = pci_nvme_io_partial; 1224 1225 if (!do_write) 1226 err = blockif_read(sc->nvstore.ctx, 1227 &req->io_req); 1228 else 1229 err = blockif_write(sc->nvstore.ctx, 1230 &req->io_req); 1231 1232 /* wait until req completes before cont */ 1233 if (err == 0) 1234 pthread_cond_wait(&req->cv, &req->mtx); 1235 } 1236 if (iovidx == 0) { 1237 req->io_req.br_offset = lba; 1238 req->io_req.br_resid = 0; 1239 req->io_req.br_param = req; 1240 } 1241 1242 req->io_req.br_iov[iovidx].iov_base = 1243 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1244 gpaddr, size); 1245 1246 req->io_req.br_iov[iovidx].iov_len = size; 1247 1248 req->prev_gpaddr = gpaddr; 1249 req->prev_size = size; 1250 req->io_req.br_resid += size; 1251 1252 req->io_req.br_iovcnt++; 1253 1254 pthread_mutex_unlock(&req->mtx); 1255 } 1256 } else { 1257 /* RAM buffer: read/write directly */ 1258 void *p = sc->nvstore.ctx; 1259 void *gptr; 1260 1261 if ((lba + size) > sc->nvstore.size) { 1262 WPRINTF(("%s write would overflow RAM", __func__)); 1263 return (-1); 1264 } 1265 1266 p = (void *)((uintptr_t)p + (uintptr_t)lba); 1267 gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); 1268 if (do_write) 1269 memcpy(p, gptr, size); 1270 else 1271 memcpy(gptr, p, size); 1272 } 1273 
/*
 * Post a completion entry for command 'cid' (from submission queue
 * 'sqid') onto the CQ associated with 'sq', and fire the queue's MSI-X
 * vector if interrupts are enabled for it. The CQ mutex serializes
 * concurrent producers.
 *
 * NOTE(review): 'ignore_busy' is accepted but unused here — confirm
 * against callers before removing.
 */
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status, int ignore_busy)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
	struct nvme_completion *compl;
	int phase;

	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
		 NVME_STATUS_GET_SC(status)));

	pthread_mutex_lock(&cq->mtx);

	assert(cq->qbase != NULL);

	compl = &cq->qbase[cq->tail];

	compl->cdw0 = cdw0;
	compl->sqid = sqid;
	/* report current SQ head so the guest can reclaim SQ slots */
	compl->sqhd = atomic_load_acq_short(&sq->head);
	compl->cid = cid;

	// toggle phase
	/* read the slot's old phase bit before overwriting status */
	phase = NVME_STATUS_GET_P(compl->status);
	compl->status = status;
	pci_nvme_toggle_phase(&compl->status, phase);

	cq->tail = (cq->tail + 1) % cq->size;

	pthread_mutex_unlock(&cq->mtx);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF(("%s: CQ%u interrupt disabled\n",
						__func__, sq->cqid));
		}
	}
}

/*
 * Return an io request slot to the free list and release one iosemlock
 * unit. If the controller was re-enabled while I/O was still draining,
 * the last completion flips CSTS.RDY back on here.
 */
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
sem_wait(&sc->iosemlock); 1347 pthread_mutex_lock(&sc->mtx); 1348 1349 req = STAILQ_FIRST(&sc->ioreqs_free); 1350 assert(req != NULL); 1351 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 1352 1353 req->sc = sc; 1354 1355 sc->pending_ios++; 1356 1357 pthread_mutex_unlock(&sc->mtx); 1358 1359 req->io_req.br_iovcnt = 0; 1360 req->io_req.br_offset = 0; 1361 req->io_req.br_resid = 0; 1362 req->io_req.br_param = req; 1363 req->prev_gpaddr = 0; 1364 req->prev_size = 0; 1365 1366 return req; 1367 } 1368 1369 static void 1370 pci_nvme_io_done(struct blockif_req *br, int err) 1371 { 1372 struct pci_nvme_ioreq *req = br->br_param; 1373 struct nvme_submission_queue *sq = req->nvme_sq; 1374 uint16_t code, status = 0; 1375 1376 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1377 1378 /* TODO return correct error */ 1379 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1380 pci_nvme_status_genc(&status, code); 1381 1382 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); 1383 pci_nvme_release_ioreq(req->sc, req); 1384 } 1385 1386 static void 1387 pci_nvme_io_partial(struct blockif_req *br, int err) 1388 { 1389 struct pci_nvme_ioreq *req = br->br_param; 1390 1391 DPRINTF(("%s error %d %s", __func__, err, strerror(err))); 1392 1393 pthread_cond_signal(&req->cv); 1394 } 1395 1396 static void 1397 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 1398 { 1399 struct pci_nvme_ioreq *req = br->br_param; 1400 struct pci_nvme_softc *sc = req->sc; 1401 bool done = true; 1402 #ifdef __FreeBSD__ 1403 uint16_t status; 1404 #else 1405 uint16_t status = 0; 1406 #endif 1407 1408 if (err) { 1409 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 1410 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 1411 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1412 } else { 1413 struct iovec *iov = req->io_req.br_iov; 1414 1415 req->prev_gpaddr++; 1416 iov += req->prev_gpaddr; 1417 1418 /* The iov_* values already include the sector size */ 
/*
 * blockif completion callback driving a multi-range Dataset Management
 * (deallocate/TRIM) state machine. nvme_opc_dataset_mgmt() repurposes
 * request fields: prev_gpaddr is the index of the range just completed,
 * prev_size is the total number of ranges, and each br_iov[] entry
 * holds a byte offset (iov_base) and length (iov_len) rather than a
 * host address. Each invocation issues the next blockif_delete() until
 * all ranges are done or an error occurs.
 */
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
#ifdef __FreeBSD__
	uint16_t status;
#else
	uint16_t status = 0;
#endif

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		/* last range completed successfully */
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		/* advance to the next stored range */
		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status, 0);
		pci_nvme_release_ioreq(sc, req);
	}
}
/*
 * NVMe Dataset Management (DSM) command. Only the Deallocate (TRIM)
 * attribute is acted upon; everything else succeeds as a no-op, which
 * the spec allows since DSM is advisory.
 *
 * Returns 0 when an async blockif_delete was issued (completion comes
 * via callback), non-zero when the caller must post '*status' itself.
 */
static int
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	int err = -1;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		struct nvme_dsm_range *range;
		uint32_t nr, r;
		int sectsz = sc->nvstore.sectsz;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* RAM-backed stores have no req and nothing to TRIM */
		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		/* copy locally because a range entry could straddle PRPs */
		range = calloc(1, NVME_MAX_DSM_TRIM);
		if (range == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}
		/* NOTE(review): return value unchecked — confirm helper
		 * cannot fail for NVME_MAX_DSM_TRIM-sized copies. */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;
		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		nr = cmd->cdw10 & 0xff;

		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = range[0].starting_lba * sectsz;
		req->io_req.br_resid = range[0].length * sectsz;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			/* stash byte offset/length pairs, not addresses */
			for (r = 0; r <= nr; r++) {
				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
				iov[r].iov_len = range[r].length * sectsz;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = r;
		}

		/* kick off the first delete; the rest chain via callback */
		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);

		free(range);
	}
out:
	return (err);
}
/*
 * Drain the I/O submission queue 'idx': for each command between head
 * and tail, translate its PRP entries into iovs (or direct RAM copies)
 * and hand the request to blockif. Completions for blockif-backed
 * commands are posted asynchronously from pci_nvme_io_done().
 */
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status = 0;
	uint16_t sqhead;
	int err;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	/* single-consumer guard against concurrent doorbell handling */
	if (atomic_testandset_int(&sq->busy, 1)) {
		DPRINTF(("%s sqid %u busy", __func__, idx));
		return;
	}

	sqhead = atomic_load_acq_short(&sq->head);

	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	         idx, sqhead, sq->tail, sq->qbase));

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req = NULL;
		uint64_t lba;
		uint64_t nblocks, bytes, size, cpsz;

		/* TODO: support scatter gather list handling */

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		/* starting LBA is CDW11:CDW10 */
		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;

		if (cmd->opc == NVME_OPC_FLUSH) {
			/* data already reaches the backend; report success */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			                        status, 1);

			continue;
		} else if (cmd->opc == 0x08) {
			/* opcode 0x08 is Write Zeroes */
			/* TODO: write zeroes */
			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
			        __func__, lba, cmd->cdw12 & 0xFFFF));
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			                        status, 1);

			continue;
		}

		/* only blockif-backed storage uses async io requests */
		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
			req = pci_nvme_get_ioreq(sc);
			req->nvme_sq = sq;
			req->sqid = idx;
		}

		if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
			/* non-zero return: post the completion ourselves */
			if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
			    &status)) {
				pci_nvme_set_completion(sc, sq, idx, cmd->cid,
				    0, status, 1);
				if (req)
					pci_nvme_release_ioreq(sc, req);
			}
			continue;
		}

		/* CDW12[15:0] is a zero-based block count */
		nblocks = (cmd->cdw12 & 0xFFFF) + 1;

		bytes = nblocks * sc->nvstore.sectsz;

		/*
		 * If data starts mid-page and flows into the next page, then
		 * increase page count
		 */

		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
		         "(%lu-bytes)",
		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
		         cmd->opc == NVME_OPC_WRITE ? "WRITE" : "READ",
		         lba, nblocks, bytes));

		/* PRP entries: low two bits are reserved, mask them off */
		cmd->prp1 &= ~(0x03UL);
		cmd->prp2 &= ~(0x03UL);

		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));

		size = bytes;
		/* convert LBA to a byte offset for the backend */
		lba *= sc->nvstore.sectsz;

		/* PRP1 may start mid-page; first chunk runs to page end */
		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);

		if (cpsz > bytes)
			cpsz = bytes;

		if (req != NULL) {
			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
			                        cmd->cdw10;
			req->opc = cmd->opc;
			req->cid = cmd->cid;
			req->nsid = cmd->nsid;
		}

		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
		    cmd->opc == NVME_OPC_WRITE, lba);
		lba += cpsz;
		size -= cpsz;

		if (size == 0)
			goto iodone;

		if (size <= PAGE_SIZE) {
			/* prp2 is second (and final) page in transfer */

			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
			    size,
			    cmd->opc == NVME_OPC_WRITE,
			    lba);
		} else {
			uint64_t *prp_list;
			int i;

			/* prp2 is pointer to a physical region page list */
			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
			                            cmd->prp2, PAGE_SIZE);

			i = 0;
			while (size != 0) {
				cpsz = MIN(size, PAGE_SIZE);

				/*
				 * Move to linked physical region page list
				 * in last item.
				 */
				if (i == (NVME_PRP2_ITEMS-1) &&
				    size > PAGE_SIZE) {
					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
					prp_list = paddr_guest2host(
					              sc->nsc_pi->pi_vmctx,
					              prp_list[i], PAGE_SIZE);
					i = 0;
				}
				if (prp_list[i] == 0) {
					WPRINTF(("PRP2[%d] = 0 !!!", i));
					err = 1;
					break;
				}

				err = pci_nvme_append_iov_req(sc, req,
				    prp_list[i], cpsz,
				    cmd->opc == NVME_OPC_WRITE, lba);
				if (err)
					break;

				lba += cpsz;
				size -= cpsz;
				i++;
			}
		}

	iodone:
		/* RAM-backed: data already moved, complete synchronously */
		if (sc->nvstore.type == NVME_STOR_RAM) {
			uint16_t code, status = 0;

			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
			    NVME_SC_SUCCESS;
			pci_nvme_status_genc(&status, code);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			                        status, 1);

			continue;
		}


		if (err)
			goto do_error;

		req->io_req.br_callback = pci_nvme_io_done;

		err = 0;
		switch (cmd->opc) {
		case NVME_OPC_READ:
			err = blockif_read(sc->nvstore.ctx, &req->io_req);
			break;
		case NVME_OPC_WRITE:
			err = blockif_write(sc->nvstore.ctx, &req->io_req);
			break;
		default:
			WPRINTF(("%s unhandled io command 0x%x",
				 __func__, cmd->opc));
			err = 1;
		}

do_error:
		if (err) {
			uint16_t status = 0;

			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);

			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			                        status, 1);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	/* publish consumed head, then release the busy flag */
	atomic_store_short(&sq->head, sqhead);
	atomic_store_int(&sq->busy, 0);
}
"SQ" : "CQ", value & 0xFFFF)); 1741 1742 if (is_sq) { 1743 atomic_store_short(&sc->submit_queues[idx].tail, 1744 (uint16_t)value); 1745 1746 if (idx == 0) { 1747 pci_nvme_handle_admin_cmd(sc, value); 1748 } else { 1749 /* submission queue; handle new entries in SQ */ 1750 if (idx > sc->num_squeues) { 1751 WPRINTF(("%s SQ index %lu overflow from " 1752 "guest (max %u)", 1753 __func__, idx, sc->num_squeues)); 1754 return; 1755 } 1756 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 1757 } 1758 } else { 1759 if (idx > sc->num_cqueues) { 1760 WPRINTF(("%s queue index %lu overflow from " 1761 "guest (max %u)", 1762 __func__, idx, sc->num_cqueues)); 1763 return; 1764 } 1765 1766 sc->compl_queues[idx].head = (uint16_t)value; 1767 } 1768 } 1769 1770 static void 1771 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 1772 { 1773 const char *s = iswrite ? "WRITE" : "READ"; 1774 1775 switch (offset) { 1776 case NVME_CR_CAP_LOW: 1777 DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s)); 1778 break; 1779 case NVME_CR_CAP_HI: 1780 DPRINTF(("%s %s NVME_CR_CAP_HI", func, s)); 1781 break; 1782 case NVME_CR_VS: 1783 DPRINTF(("%s %s NVME_CR_VS", func, s)); 1784 break; 1785 case NVME_CR_INTMS: 1786 DPRINTF(("%s %s NVME_CR_INTMS", func, s)); 1787 break; 1788 case NVME_CR_INTMC: 1789 DPRINTF(("%s %s NVME_CR_INTMC", func, s)); 1790 break; 1791 case NVME_CR_CC: 1792 DPRINTF(("%s %s NVME_CR_CC", func, s)); 1793 break; 1794 case NVME_CR_CSTS: 1795 DPRINTF(("%s %s NVME_CR_CSTS", func, s)); 1796 break; 1797 case NVME_CR_NSSR: 1798 DPRINTF(("%s %s NVME_CR_NSSR", func, s)); 1799 break; 1800 case NVME_CR_AQA: 1801 DPRINTF(("%s %s NVME_CR_AQA", func, s)); 1802 break; 1803 case NVME_CR_ASQ_LOW: 1804 DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s)); 1805 break; 1806 case NVME_CR_ASQ_HI: 1807 DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s)); 1808 break; 1809 case NVME_CR_ACQ_LOW: 1810 DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s)); 1811 break; 1812 case NVME_CR_ACQ_HI: 1813 DPRINTF(("%s %s NVME_CR_ACQ_HI", 
/*
 * Handle a guest write to BAR0. Offsets at or beyond the doorbell area
 * are routed to pci_nvme_handle_doorbell(); everything else is a
 * controller register write. Only 4-byte register writes are accepted.
 */
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		/* reject doorbells beyond the advertised queue count */
		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF(("guest attempted an overflow write offset "
			         "0x%lx, val 0x%lx in %s",
			         offset, value, __func__));
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
	        offset, size, value));

	if (size != 4) {
		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
		         "val 0x%lx) to bar0 in %s",
		         size, offset, value, __func__));
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		         "iocqes %u",
		        __func__,
			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
			 NVME_CC_GET_IOCQES(ccreg)));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		/* act only on EN transitions (0->1 init, 1->0 reset) */
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1-> causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			/* RDY is deferred until pending I/O drains */
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		/* admin SQ base must be 4K aligned; low 12 bits reserved */
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
		         __func__, offset, value, size));
	}
	pthread_mutex_unlock(&sc->mtx);
}
offset, size, value)); 1950 1951 pci_emul_msix_twrite(pi, offset, size, value); 1952 return; 1953 } 1954 1955 switch (baridx) { 1956 case 0: 1957 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 1958 break; 1959 1960 default: 1961 DPRINTF(("%s unknown baridx %d, val 0x%lx", 1962 __func__, baridx, value)); 1963 } 1964 } 1965 1966 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 1967 uint64_t offset, int size) 1968 { 1969 uint64_t value; 1970 1971 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 1972 1973 if (offset < NVME_DOORBELL_OFFSET) { 1974 void *p = &(sc->regs); 1975 pthread_mutex_lock(&sc->mtx); 1976 memcpy(&value, (void *)((uintptr_t)p + offset), size); 1977 pthread_mutex_unlock(&sc->mtx); 1978 } else { 1979 value = 0; 1980 WPRINTF(("pci_nvme: read invalid offset %ld", offset)); 1981 } 1982 1983 switch (size) { 1984 case 1: 1985 value &= 0xFF; 1986 break; 1987 case 2: 1988 value &= 0xFFFF; 1989 break; 1990 case 4: 1991 value &= 0xFFFFFFFF; 1992 break; 1993 } 1994 1995 DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x", 1996 offset, size, (uint32_t)value)); 1997 1998 return (value); 1999 } 2000 2001 2002 2003 static uint64_t 2004 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2005 uint64_t offset, int size) 2006 { 2007 struct pci_nvme_softc* sc = pi->pi_arg; 2008 2009 if (baridx == pci_msix_table_bar(pi) || 2010 baridx == pci_msix_pba_bar(pi)) { 2011 DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2012 baridx, offset, size)); 2013 2014 return pci_emul_msix_tread(pi, offset, size); 2015 } 2016 2017 switch (baridx) { 2018 case 0: 2019 return pci_nvme_read_bar_0(sc, offset, size); 2020 2021 default: 2022 DPRINTF(("unknown bar %d, 0x%lx", baridx, offset)); 2023 } 2024 2025 return (0); 2026 } 2027 2028 2029 static int 2030 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 2031 { 2032 char bident[sizeof("XX:X:X")]; 2033 char *uopt, *xopts, *config; 2034 uint32_t sectsz; 2035 int optidx; 2036 
2037 sc->max_queues = NVME_QUEUES; 2038 sc->max_qentries = NVME_MAX_QENTRIES; 2039 sc->ioslots = NVME_IOSLOTS; 2040 sc->num_squeues = sc->max_queues; 2041 sc->num_cqueues = sc->max_queues; 2042 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2043 sectsz = 0; 2044 2045 uopt = strdup(opts); 2046 optidx = 0; 2047 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2048 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2049 for (xopts = strtok(uopt, ","); 2050 xopts != NULL; 2051 xopts = strtok(NULL, ",")) { 2052 2053 if ((config = strchr(xopts, '=')) != NULL) 2054 *config++ = '\0'; 2055 2056 if (!strcmp("maxq", xopts)) { 2057 sc->max_queues = atoi(config); 2058 } else if (!strcmp("qsz", xopts)) { 2059 sc->max_qentries = atoi(config); 2060 } else if (!strcmp("ioslots", xopts)) { 2061 sc->ioslots = atoi(config); 2062 } else if (!strcmp("sectsz", xopts)) { 2063 sectsz = atoi(config); 2064 } else if (!strcmp("ser", xopts)) { 2065 /* 2066 * This field indicates the Product Serial Number in 2067 * 7-bit ASCII, unused bytes should be space characters. 2068 * Ref: NVMe v1.3c. 
2069 */ 2070 cpywithpad((char *)sc->ctrldata.sn, 2071 sizeof(sc->ctrldata.sn), config, ' '); 2072 } else if (!strcmp("ram", xopts)) { 2073 uint64_t sz = strtoull(&xopts[4], NULL, 10); 2074 2075 sc->nvstore.type = NVME_STOR_RAM; 2076 sc->nvstore.size = sz * 1024 * 1024; 2077 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2078 sc->nvstore.sectsz = 4096; 2079 sc->nvstore.sectsz_bits = 12; 2080 if (sc->nvstore.ctx == NULL) { 2081 perror("Unable to allocate RAM"); 2082 free(uopt); 2083 return (-1); 2084 } 2085 } else if (!strcmp("eui64", xopts)) { 2086 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 2087 } else if (!strcmp("dsm", xopts)) { 2088 if (!strcmp("auto", config)) 2089 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2090 else if (!strcmp("enable", config)) 2091 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2092 else if (!strcmp("disable", config)) 2093 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2094 } else if (optidx == 0) { 2095 snprintf(bident, sizeof(bident), "%d:%d", 2096 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2097 sc->nvstore.ctx = blockif_open(xopts, bident); 2098 if (sc->nvstore.ctx == NULL) { 2099 perror("Could not open backing file"); 2100 free(uopt); 2101 return (-1); 2102 } 2103 sc->nvstore.type = NVME_STOR_BLOCKIF; 2104 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2105 } else { 2106 EPRINTLN("Invalid option %s", xopts); 2107 free(uopt); 2108 return (-1); 2109 } 2110 2111 optidx++; 2112 } 2113 free(uopt); 2114 2115 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 2116 EPRINTLN("backing store not specified"); 2117 return (-1); 2118 } 2119 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2120 sc->nvstore.sectsz = sectsz; 2121 else if (sc->nvstore.type != NVME_STOR_RAM) 2122 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2123 for (sc->nvstore.sectsz_bits = 9; 2124 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2125 sc->nvstore.sectsz_bits++); 2126 2127 if (sc->max_queues <= 0 || 
sc->max_queues > NVME_QUEUES) 2128 sc->max_queues = NVME_QUEUES; 2129 2130 if (sc->max_qentries <= 0) { 2131 EPRINTLN("Invalid qsz option"); 2132 return (-1); 2133 } 2134 if (sc->ioslots <= 0) { 2135 EPRINTLN("Invalid ioslots option"); 2136 return (-1); 2137 } 2138 2139 return (0); 2140 } 2141 2142 static int 2143 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 2144 { 2145 struct pci_nvme_softc *sc; 2146 uint32_t pci_membar_sz; 2147 int error; 2148 2149 error = 0; 2150 2151 sc = calloc(1, sizeof(struct pci_nvme_softc)); 2152 pi->pi_arg = sc; 2153 sc->nsc_pi = pi; 2154 2155 error = pci_nvme_parse_opts(sc, opts); 2156 if (error < 0) 2157 goto done; 2158 else 2159 error = 0; 2160 2161 STAILQ_INIT(&sc->ioreqs_free); 2162 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 2163 for (int i = 0; i < sc->ioslots; i++) { 2164 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 2165 pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); 2166 pthread_cond_init(&sc->ioreqs[i].cv, NULL); 2167 } 2168 sc->intr_coales_aggr_thresh = 1; 2169 2170 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 2171 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 2172 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 2173 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 2174 pci_set_cfgdata8(pi, PCIR_PROGIF, 2175 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 2176 2177 /* 2178 * Allocate size of NVMe registers + doorbell space for all queues. 2179 * 2180 * The specification requires a minimum memory I/O window size of 16K. 2181 * The Windows driver will refuse to start a device with a smaller 2182 * window. 
2183 */ 2184 pci_membar_sz = sizeof(struct nvme_registers) + 2185 2 * sizeof(uint32_t) * (sc->max_queues + 1); 2186 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 2187 2188 DPRINTF(("nvme membar size: %u", pci_membar_sz)); 2189 2190 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2191 if (error) { 2192 WPRINTF(("%s pci alloc mem bar failed", __func__)); 2193 goto done; 2194 } 2195 2196 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2197 if (error) { 2198 WPRINTF(("%s pci add msixcap failed", __func__)); 2199 goto done; 2200 } 2201 2202 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2203 if (error) { 2204 WPRINTF(("%s pci add Express capability failed", __func__)); 2205 goto done; 2206 } 2207 2208 pthread_mutex_init(&sc->mtx, NULL); 2209 sem_init(&sc->iosemlock, 0, sc->ioslots); 2210 2211 pci_nvme_reset(sc); 2212 /* 2213 * Controller data depends on Namespace data so initialize Namespace 2214 * data first. 2215 */ 2216 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 2217 pci_nvme_init_ctrldata(sc); 2218 pci_nvme_init_logpages(sc); 2219 2220 pci_lintr_request(pi); 2221 2222 done: 2223 return (error); 2224 } 2225 2226 2227 struct pci_devemu pci_de_nvme = { 2228 .pe_emu = "nvme", 2229 .pe_init = pci_nvme_init, 2230 .pe_barwrite = pci_nvme_write, 2231 .pe_barread = pci_nvme_read 2232 }; 2233 PCI_EMUL_SET(pci_de_nvme); 2234