1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <semaphore.h> 69 #include <stdbool.h> 70 #include <stddef.h> 71 #include <stdint.h> 72 #include <stdio.h> 73 #include <stdlib.h> 74 #include <string.h> 75 76 #include <machine/atomic.h> 77 #include <machine/vmm.h> 78 #include <vmmapi.h> 79 80 #include <dev/nvme/nvme.h> 81 82 #include "bhyverun.h" 83 #include "block_if.h" 84 #include "debug.h" 85 #include "pci_emul.h" 86 87 88 static int nvme_debug = 0; 89 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 90 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 91 92 /* defaults; can be overridden */ 93 #define NVME_MSIX_BAR 4 94 95 #define NVME_IOSLOTS 8 96 97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 98 #define NVME_MMIO_SPACE_MIN (1 << 14) 99 100 #define NVME_QUEUES 16 101 #define NVME_MAX_QENTRIES 2048 102 /* Memory Page size Minimum reported in CAP register */ 103 #define NVME_MPSMIN 0 104 /* MPSMIN converted to bytes */ 105 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 106 107 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 108 #define NVME_MDTS 9 109 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 110 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 111 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 112 113 /* This is a synthetic status code to indicate there is no status */ 114 #define NVME_NO_STATUS 0xffff 115 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 116 117 /* helpers */ 118 119 /* Convert a zero-based value into a one-based value */ 120 #define ONE_BASED(zero) ((zero) + 1) 121 /* Convert a one-based value into a zero-based value */ 122 #define ZERO_BASED(one) ((one) - 1) 123 124 /* Encode number of SQ's and CQ's for Set/Get Features */ 125 #define NVME_FEATURE_NUM_QUEUES(sc) \ 126 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 127 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 128 129 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 130 131 enum nvme_controller_register_offsets { 132 NVME_CR_CAP_LOW = 0x00, 133 NVME_CR_CAP_HI = 0x04, 134 NVME_CR_VS = 0x08, 135 NVME_CR_INTMS = 0x0c, 136 NVME_CR_INTMC = 0x10, 137 NVME_CR_CC = 0x14, 138 NVME_CR_CSTS = 0x1c, 139 NVME_CR_NSSR = 0x20, 140 NVME_CR_AQA = 0x24, 141 NVME_CR_ASQ_LOW = 0x28, 142 NVME_CR_ASQ_HI = 0x2c, 143 NVME_CR_ACQ_LOW = 0x30, 144 NVME_CR_ACQ_HI = 0x34, 145 }; 146 147 enum nvme_cmd_cdw11 { 148 NVME_CMD_CDW11_PC = 0x0001, 149 NVME_CMD_CDW11_IEN = 0x0002, 150 NVME_CMD_CDW11_IV = 0xFFFF0000, 151 }; 152 153 enum nvme_copy_dir { 154 NVME_COPY_TO_PRP, 155 NVME_COPY_FROM_PRP, 156 }; 157 158 #define NVME_CQ_INTEN 0x01 159 #define NVME_CQ_INTCOAL 0x02 160 161 struct nvme_completion_queue { 162 struct nvme_completion *qbase; 163 pthread_mutex_t mtx; 164 uint32_t size; 165 uint16_t tail; /* nvme progress */ 166 uint16_t head; /* guest progress */ 167 uint16_t intr_vec; 168 uint32_t intr_en; 169 }; 170 171 struct nvme_submission_queue { 172 struct nvme_command *qbase; 173 pthread_mutex_t mtx; 174 uint32_t size; 175 uint16_t head; /* nvme progress */ 176 uint16_t tail; /* guest progress */ 177 uint16_t cqid; /* completion queue id */ 178 int qpriority; 179 }; 180 181 enum nvme_storage_type { 182 NVME_STOR_BLOCKIF = 0, 183 NVME_STOR_RAM = 1, 184 }; 185 186 struct pci_nvme_blockstore { 187 enum nvme_storage_type type; 188 void *ctx; 189 uint64_t size; 190 uint32_t sectsz; 191 uint32_t sectsz_bits; 192 uint64_t eui64; 193 uint32_t deallocate:1; 194 }; 195 196 /* 197 * Calculate the number of additional page descriptors for guest IO requests 198 * based on the advertised Max Data Transfer (MDTS) and given the number of 199 * default iovec's in a struct blockif_req. 200 * 201 * Note the + 1 allows for the initial descriptor to not be page aligned. 202 */ 203 #define MDTS_PAD_SIZE \ 204 NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 205 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 206 0 207 208 struct pci_nvme_ioreq { 209 struct pci_nvme_softc *sc; 210 STAILQ_ENTRY(pci_nvme_ioreq) link; 211 struct nvme_submission_queue *nvme_sq; 212 uint16_t sqid; 213 214 /* command information */ 215 uint16_t opc; 216 uint16_t cid; 217 uint32_t nsid; 218 219 uint64_t prev_gpaddr; 220 size_t prev_size; 221 size_t bytes; 222 223 struct blockif_req io_req; 224 225 struct iovec iovpadding[MDTS_PAD_SIZE]; 226 }; 227 228 enum nvme_dsm_type { 229 /* Dataset Management bit in ONCS reflects backing storage capability */ 230 NVME_DATASET_MANAGEMENT_AUTO, 231 /* Unconditionally set Dataset Management bit in ONCS */ 232 NVME_DATASET_MANAGEMENT_ENABLE, 233 /* Unconditionally clear Dataset Management bit in ONCS */ 234 NVME_DATASET_MANAGEMENT_DISABLE, 235 }; 236 237 struct pci_nvme_softc; 238 struct nvme_feature_obj; 239 240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 241 struct nvme_feature_obj *, 242 struct nvme_command *, 243 struct nvme_completion *); 244 245 struct nvme_feature_obj { 246 uint32_t cdw11; 247 nvme_feature_cb set; 248 nvme_feature_cb get; 249 bool namespace_specific; 250 }; 251 252 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 253 254 struct pci_nvme_aer { 255 STAILQ_ENTRY(pci_nvme_aer) link; 256 uint16_t cid; /* Command ID of the submitted AER */ 257 }; 258 259 struct pci_nvme_softc { 260 struct pci_devinst *nsc_pi; 261 262 pthread_mutex_t mtx; 263 264 struct nvme_registers regs; 265 266 struct nvme_namespace_data nsdata; 267 struct nvme_controller_data ctrldata; 268 struct nvme_error_information_entry err_log; 269 struct nvme_health_information_page health_log; 270 struct nvme_firmware_page fw_log; 271 272 struct pci_nvme_blockstore nvstore; 273 274 uint16_t max_qentries; /* max entries per queue */ 275 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 276 uint32_t num_cqueues; 277 uint32_t num_squeues; 278 bool num_q_is_set; /* Has host set Number of Queues */ 279 280 struct pci_nvme_ioreq *ioreqs; 281 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 282 uint32_t pending_ios; 283 uint32_t ioslots; 284 sem_t iosemlock; 285 286 /* 287 * Memory mapped Submission and Completion queues 288 * Each array includes both Admin and IO queues 289 */ 290 struct nvme_completion_queue *compl_queues; 291 struct nvme_submission_queue *submit_queues; 292 293 struct nvme_feature_obj feat[NVME_FID_MAX]; 294 295 enum nvme_dsm_type dataset_management; 296 297 /* Accounting for SMART data */ 298 __uint128_t read_data_units; 299 __uint128_t write_data_units; 300 __uint128_t read_commands; 301 __uint128_t write_commands; 302 uint32_t read_dunits_remainder; 303 uint32_t write_dunits_remainder; 304 305 STAILQ_HEAD(, pci_nvme_aer) aer_list; 306 uint32_t aer_count; 307 }; 308 309 310 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 311 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 312 static void pci_nvme_io_done(struct blockif_req *, int); 313 314 /* Controller Configuration utils */ 315 #define NVME_CC_GET_EN(cc) \ 316 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 317 #define NVME_CC_GET_CSS(cc) \ 318 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 319 #define NVME_CC_GET_SHN(cc) \ 320 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 321 #define NVME_CC_GET_IOSQES(cc) \ 322 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 323 #define NVME_CC_GET_IOCQES(cc) \ 324 ((cc) >> NVME_CC_REG_IOCQES_SHIFT 
	 & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
   ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
    (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
    (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
   ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
    (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
    (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
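 *
 * Note: the queue arrays below are sized one larger than requested
 * (nsq + 1 and ncq + 1) so that index 0 holds the Admin SQ/CQ while
 * indices 1..n hold the IO queues; requests above NVME_QUEUES are clamped.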
393 */ 394 static void 395 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 396 { 397 uint32_t i; 398 399 /* 400 * Allocate and initialize the Submission Queues 401 */ 402 if (nsq > NVME_QUEUES) { 403 WPRINTF("%s: clamping number of SQ from %u to %u", 404 __func__, nsq, NVME_QUEUES); 405 nsq = NVME_QUEUES; 406 } 407 408 sc->num_squeues = nsq; 409 410 sc->submit_queues = calloc(sc->num_squeues + 1, 411 sizeof(struct nvme_submission_queue)); 412 if (sc->submit_queues == NULL) { 413 WPRINTF("%s: SQ allocation failed", __func__); 414 sc->num_squeues = 0; 415 } else { 416 struct nvme_submission_queue *sq = sc->submit_queues; 417 418 for (i = 0; i < sc->num_squeues; i++) 419 pthread_mutex_init(&sq[i].mtx, NULL); 420 } 421 422 /* 423 * Allocate and initialize the Completion Queues 424 */ 425 if (ncq > NVME_QUEUES) { 426 WPRINTF("%s: clamping number of CQ from %u to %u", 427 __func__, ncq, NVME_QUEUES); 428 ncq = NVME_QUEUES; 429 } 430 431 sc->num_cqueues = ncq; 432 433 sc->compl_queues = calloc(sc->num_cqueues + 1, 434 sizeof(struct nvme_completion_queue)); 435 if (sc->compl_queues == NULL) { 436 WPRINTF("%s: CQ allocation failed", __func__); 437 sc->num_cqueues = 0; 438 } else { 439 struct nvme_completion_queue *cq = sc->compl_queues; 440 441 for (i = 0; i < sc->num_cqueues; i++) 442 pthread_mutex_init(&cq[i].mtx, NULL); 443 } 444 } 445 446 static void 447 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 448 { 449 struct nvme_controller_data *cd = &sc->ctrldata; 450 451 cd->vid = 0xFB5D; 452 cd->ssvid = 0x0000; 453 454 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 455 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 456 457 /* Num of submission commands that we can handle at a time (2^rab) */ 458 cd->rab = 4; 459 460 /* FreeBSD OUI */ 461 cd->ieee[0] = 0x58; 462 cd->ieee[1] = 0x9c; 463 cd->ieee[2] = 0xfc; 464 465 cd->mic = 0; 466 467 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 468 469 cd->ver = 0x00010300; 470 471 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 472 cd->acl = 2; 473 cd->aerl = 4; 474 475 /* Advertise 1, Read-only firmware slot */ 476 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 477 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 478 cd->lpa = 0; /* TODO: support some simple things like SMART */ 479 cd->elpe = 0; /* max error log page entries */ 480 cd->npss = 1; /* number of power states support */ 481 482 /* Warning Composite Temperature Threshold */ 483 cd->wctemp = 0x0157; 484 485 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 486 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 487 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 488 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 489 cd->nn = 1; /* number of namespaces */ 490 491 cd->oncs = 0; 492 switch (sc->dataset_management) { 493 case NVME_DATASET_MANAGEMENT_AUTO: 494 if (sc->nvstore.deallocate) 495 cd->oncs |= NVME_ONCS_DSM; 496 break; 497 case NVME_DATASET_MANAGEMENT_ENABLE: 498 cd->oncs |= NVME_ONCS_DSM; 499 break; 500 default: 501 break; 502 } 503 504 cd->fna = 0x03; 505 506 cd->power_state[0].mp = 10; 507 } 508 509 /* 510 * Calculate the CRC-16 of the given buffer 511 * See copyright attribution at top of file 512 */ 513 static uint16_t 514 crc16(uint16_t crc, const void *buffer, unsigned int len) 515 { 516 const unsigned char *cp = buffer; 517 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
*/ 518 static uint16_t const crc16_table[256] = { 519 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 520 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 521 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 522 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 523 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 524 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 525 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 526 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 527 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 528 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 529 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 530 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 531 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 532 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 533 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 534 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 535 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 536 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 537 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 538 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 539 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 540 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 541 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 542 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 543 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 544 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 545 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 546 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 547 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 548 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 549 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 550 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 551 }; 552 553 while (len--) 554 crc = (((crc >> 8) & 0xffU) ^ 555 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 556 return crc; 557 } 558 559 static void 560 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 561 struct nvme_namespace_data *nd, uint32_t nsid, 562 struct pci_nvme_blockstore *nvstore) 563 { 564 565 /* Get capacity and block size information from backing store */ 566 nd->nsze = nvstore->size / nvstore->sectsz; 567 nd->ncap = nd->nsze; 568 nd->nuse = nd->nsze; 569 570 if (nvstore->type == NVME_STOR_BLOCKIF) 571 nvstore->deallocate = blockif_candelete(nvstore->ctx); 572 573 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ 574 nd->flbas = 0; 575 576 /* Create an EUI-64 if user did not provide one */ 577 if (nvstore->eui64 == 0) { 578 char *data = NULL; 579 uint64_t eui64 = nvstore->eui64; 580 581 asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus, 582 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 583 584 if (data != NULL) { 585 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 586 free(data); 587 } 588 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 589 } 590 be64enc(nd->eui64, nvstore->eui64); 591 592 /* LBA data-sz = 2^lbads */ 593 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 594 } 595 596 static void 597 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 598 { 599 600 memset(&sc->err_log, 0, sizeof(sc->err_log)); 601 memset(&sc->health_log, 0, sizeof(sc->health_log)); 602 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 603 604 /* Set read/write remainder to round up according to spec */ 605 sc->read_dunits_remainder = 999; 606 sc->write_dunits_remainder = 999; 607 } 608 609 static void 610 pci_nvme_init_features(struct pci_nvme_softc *sc) 611 { 612 613 sc->feat[0].set = nvme_feature_invalid_cb; 614 sc->feat[0].get = nvme_feature_invalid_cb; 615 616 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 617 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 618 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 619 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 620 nvme_feature_iv_config; 621 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 622 nvme_feature_invalid_cb; 623 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 624 nvme_feature_invalid_cb; 625 } 626 627 static void 628 pci_nvme_aer_init(struct pci_nvme_softc *sc) 629 { 630 631 STAILQ_INIT(&sc->aer_list); 632 sc->aer_count = 0; 633 } 634 635 static void 636 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 637 { 638 struct pci_nvme_aer *aer = NULL; 639 640 while (!STAILQ_EMPTY(&sc->aer_list)) { 641 aer = STAILQ_FIRST(&sc->aer_list); 642 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 643 free(aer); 644 } 645 646 pci_nvme_aer_init(sc); 647 } 648 649 static bool 650 pci_nvme_aer_available(struct pci_nvme_softc *sc) 651 { 652 653 return (!STAILQ_EMPTY(&sc->aer_list)); 654 } 655 656 static bool 657 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 658 { 659 struct nvme_controller_data *cd = &sc->ctrldata; 660 661 /* AERL is a zero based value while aer_count is one's based */ 662 return (sc->aer_count == (cd->aerl + 1)); 663 } 664 665 /* 666 * Add an Async Event Request 667 * 668 * Stores an AER to be returned later if the Controller needs to notify the 669 * host of an event. 670 * Note that while the NVMe spec doesn't require Controllers to return AER's 671 * in order, this implementation does preserve the order. 672 */ 673 static int 674 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 675 { 676 struct pci_nvme_aer *aer = NULL; 677 678 if (pci_nvme_aer_limit_reached(sc)) 679 return (-1); 680 681 aer = calloc(1, sizeof(struct pci_nvme_aer)); 682 if (aer == NULL) 683 return (-1); 684 685 sc->aer_count++; 686 687 /* Save the Command ID for use in the completion message */ 688 aer->cid = cid; 689 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 690 691 return (0); 692 } 693 694 /* 695 * Get an Async Event Request structure 696 * 697 * Returns a pointer to an AER previously submitted by the host or NULL if 698 * no AER's exist. Caller is responsible for freeing the returned struct. 
699 */ 700 static struct pci_nvme_aer * 701 pci_nvme_aer_get(struct pci_nvme_softc *sc) 702 { 703 struct pci_nvme_aer *aer = NULL; 704 705 aer = STAILQ_FIRST(&sc->aer_list); 706 if (aer != NULL) { 707 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 708 sc->aer_count--; 709 } 710 711 return (aer); 712 } 713 714 static void 715 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 716 { 717 uint32_t i; 718 719 DPRINTF("%s", __func__); 720 721 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 722 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 723 (60 << NVME_CAP_LO_REG_TO_SHIFT); 724 725 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 726 727 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 728 729 sc->regs.cc = 0; 730 sc->regs.csts = 0; 731 732 assert(sc->submit_queues != NULL); 733 734 for (i = 0; i < sc->num_squeues + 1; i++) { 735 sc->submit_queues[i].qbase = NULL; 736 sc->submit_queues[i].size = 0; 737 sc->submit_queues[i].cqid = 0; 738 sc->submit_queues[i].tail = 0; 739 sc->submit_queues[i].head = 0; 740 } 741 742 assert(sc->compl_queues != NULL); 743 744 for (i = 0; i < sc->num_cqueues + 1; i++) { 745 sc->compl_queues[i].qbase = NULL; 746 sc->compl_queues[i].size = 0; 747 sc->compl_queues[i].tail = 0; 748 sc->compl_queues[i].head = 0; 749 } 750 751 sc->num_q_is_set = false; 752 753 pci_nvme_aer_destroy(sc); 754 } 755 756 static void 757 pci_nvme_reset(struct pci_nvme_softc *sc) 758 { 759 pthread_mutex_lock(&sc->mtx); 760 pci_nvme_reset_locked(sc); 761 pthread_mutex_unlock(&sc->mtx); 762 } 763 764 static void 765 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 766 { 767 uint16_t acqs, asqs; 768 769 DPRINTF("%s", __func__); 770 771 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 772 sc->submit_queues[0].size = asqs; 773 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 774 sizeof(struct nvme_command) * asqs); 775 776 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 777 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 778 779 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 780 NVME_AQA_REG_ACQS_MASK) + 1; 781 sc->compl_queues[0].size = acqs; 782 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 783 sizeof(struct nvme_completion) * acqs); 784 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 785 786 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 787 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 788 } 789 790 static int 791 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 792 size_t len, enum nvme_copy_dir dir) 793 { 794 uint8_t *p; 795 size_t bytes; 796 797 if (len > (8 * 1024)) { 798 return (-1); 799 } 800 801 /* Copy from the start of prp1 to the end of the physical page */ 802 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 803 bytes = MIN(bytes, len); 804 805 p = vm_map_gpa(ctx, prp1, bytes); 806 if (p == NULL) { 807 return (-1); 808 } 809 810 if (dir == NVME_COPY_TO_PRP) 811 memcpy(p, b, bytes); 812 else 813 memcpy(b, p, bytes); 814 815 b += bytes; 816 817 len -= bytes; 818 if (len == 0) { 819 return (0); 820 } 821 822 len = MIN(len, PAGE_SIZE); 823 824 p = vm_map_gpa(ctx, prp2, len); 825 if (p == NULL) { 826 return (-1); 827 } 828 829 if (dir == NVME_COPY_TO_PRP) 830 memcpy(p, b, len); 831 else 832 memcpy(b, p, len); 833 834 return (0); 835 } 836 837 /* 838 * Write a Completion Queue Entry update 839 * 840 * Write the completion and update the doorbell value 841 */ 842 static void 843 pci_nvme_cq_update(struct pci_nvme_softc *sc, 844 struct nvme_completion_queue *cq, 845 uint32_t cdw0, 846 uint16_t 
cid, 847 uint16_t sqid, 848 uint16_t status) 849 { 850 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 851 struct nvme_completion *cqe; 852 853 assert(cq->qbase != NULL); 854 855 pthread_mutex_lock(&cq->mtx); 856 857 cqe = &cq->qbase[cq->tail]; 858 859 /* Flip the phase bit */ 860 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 861 862 cqe->cdw0 = cdw0; 863 cqe->sqhd = sq->head; 864 cqe->sqid = sqid; 865 cqe->cid = cid; 866 cqe->status = status; 867 868 cq->tail++; 869 if (cq->tail >= cq->size) { 870 cq->tail = 0; 871 } 872 873 pthread_mutex_unlock(&cq->mtx); 874 } 875 876 static int 877 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 878 struct nvme_completion* compl) 879 { 880 uint16_t qid = command->cdw10 & 0xffff; 881 882 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 883 if (qid == 0 || qid > sc->num_squeues || 884 (sc->submit_queues[qid].qbase == NULL)) { 885 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 886 __func__, qid, sc->num_squeues); 887 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 888 NVME_SC_INVALID_QUEUE_IDENTIFIER); 889 return (1); 890 } 891 892 sc->submit_queues[qid].qbase = NULL; 893 sc->submit_queues[qid].cqid = 0; 894 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 895 return (1); 896 } 897 898 static int 899 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 900 struct nvme_completion* compl) 901 { 902 if (command->cdw11 & NVME_CMD_CDW11_PC) { 903 uint16_t qid = command->cdw10 & 0xffff; 904 struct nvme_submission_queue *nsq; 905 906 if ((qid == 0) || (qid > sc->num_squeues) || 907 (sc->submit_queues[qid].qbase != NULL)) { 908 WPRINTF("%s queue index %u > num_squeues %u", 909 __func__, qid, sc->num_squeues); 910 pci_nvme_status_tc(&compl->status, 911 NVME_SCT_COMMAND_SPECIFIC, 912 NVME_SC_INVALID_QUEUE_IDENTIFIER); 913 return (1); 914 } 915 916 nsq = &sc->submit_queues[qid]; 917 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 918 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 919 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 920 /* 921 * Queues must specify at least two entries 922 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 923 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 924 */ 925 pci_nvme_status_tc(&compl->status, 926 NVME_SCT_COMMAND_SPECIFIC, 927 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 928 return (1); 929 } 930 931 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 932 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 933 pci_nvme_status_tc(&compl->status, 934 NVME_SCT_COMMAND_SPECIFIC, 935 NVME_SC_INVALID_QUEUE_IDENTIFIER); 936 return (1); 937 } 938 939 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 940 pci_nvme_status_tc(&compl->status, 941 NVME_SCT_COMMAND_SPECIFIC, 942 NVME_SC_COMPLETION_QUEUE_INVALID); 943 return (1); 944 } 945 946 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 947 948 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 949 sizeof(struct nvme_command) * (size_t)nsq->size); 950 951 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 952 qid, nsq->size, nsq->qbase, nsq->cqid); 953 954 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 955 956 DPRINTF("%s completed creating IOSQ qid %u", 957 __func__, qid); 958 } else { 959 /* 960 * Guest sent non-cont submission queue request. 961 * This setting is unsupported by this emulation. 
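 * CAP.CQR (Contiguous Queues Required) is reported as 1 by
 * pci_nvme_reset_locked(), so a spec-compliant host should never
 * request PC=0 here.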
962 */ 963 WPRINTF("%s unsupported non-contig (list-based) " 964 "create i/o submission queue", __func__); 965 966 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 967 } 968 return (1); 969 } 970 971 static int 972 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 973 struct nvme_completion* compl) 974 { 975 uint16_t qid = command->cdw10 & 0xffff; 976 uint16_t sqid; 977 978 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 979 if (qid == 0 || qid > sc->num_cqueues || 980 (sc->compl_queues[qid].qbase == NULL)) { 981 WPRINTF("%s queue index %u / num_cqueues %u", 982 __func__, qid, sc->num_cqueues); 983 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 984 NVME_SC_INVALID_QUEUE_IDENTIFIER); 985 return (1); 986 } 987 988 /* Deleting an Active CQ is an error */ 989 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 990 if (sc->submit_queues[sqid].cqid == qid) { 991 pci_nvme_status_tc(&compl->status, 992 NVME_SCT_COMMAND_SPECIFIC, 993 NVME_SC_INVALID_QUEUE_DELETION); 994 return (1); 995 } 996 997 sc->compl_queues[qid].qbase = NULL; 998 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 999 return (1); 1000 } 1001 1002 static int 1003 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1004 struct nvme_completion* compl) 1005 { 1006 struct nvme_completion_queue *ncq; 1007 uint16_t qid = command->cdw10 & 0xffff; 1008 1009 /* Only support Physically Contiguous queues */ 1010 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1011 WPRINTF("%s unsupported non-contig (list-based) " 1012 "create i/o completion queue", 1013 __func__); 1014 1015 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1016 return (1); 1017 } 1018 1019 if ((qid == 0) || (qid > sc->num_cqueues) || 1020 (sc->compl_queues[qid].qbase != NULL)) { 1021 WPRINTF("%s queue index %u > num_cqueues %u", 1022 __func__, qid, sc->num_cqueues); 1023 pci_nvme_status_tc(&compl->status, 1024 NVME_SCT_COMMAND_SPECIFIC, 1025 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1026 return (1); 1027 } 1028 1029 ncq = &sc->compl_queues[qid]; 1030 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1031 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1032 if (ncq->intr_vec > (sc->max_queues + 1)) { 1033 pci_nvme_status_tc(&compl->status, 1034 NVME_SCT_COMMAND_SPECIFIC, 1035 NVME_SC_INVALID_INTERRUPT_VECTOR); 1036 return (1); 1037 } 1038 1039 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1040 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1041 /* 1042 * Queues must specify at least two entries 1043 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1044 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1045 */ 1046 pci_nvme_status_tc(&compl->status, 1047 NVME_SCT_COMMAND_SPECIFIC, 1048 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1049 return (1); 1050 } 1051 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1052 command->prp1, 1053 sizeof(struct nvme_command) * (size_t)ncq->size); 1054 1055 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1056 1057 1058 return (1); 1059 } 1060 1061 static int 1062 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1063 struct nvme_completion* compl) 1064 { 1065 uint32_t logsize; 1066 uint8_t logpage = command->cdw10 & 0xFF; 1067 1068 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1069 1070 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1071 1072 /* 1073 * Command specifies the number of dwords to return in fields NUMDU 1074 * and NUMDL. This is a zero-based value. 
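 * For example, NUMDL = 0x3f with NUMDU = 0 requests (0x3f + 1) * 4 = 256
 * bytes; the copies below clamp this to the size of the emulated log page.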
1075 */ 1076 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1077 logsize *= sizeof(uint32_t); 1078 1079 switch (logpage) { 1080 case NVME_LOG_ERROR: 1081 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1082 command->prp2, (uint8_t *)&sc->err_log, 1083 MIN(logsize, sizeof(sc->err_log)), 1084 NVME_COPY_TO_PRP); 1085 break; 1086 case NVME_LOG_HEALTH_INFORMATION: 1087 pthread_mutex_lock(&sc->mtx); 1088 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1089 sizeof(sc->health_log.data_units_read)); 1090 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1091 sizeof(sc->health_log.data_units_written)); 1092 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1093 sizeof(sc->health_log.host_read_commands)); 1094 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1095 sizeof(sc->health_log.host_write_commands)); 1096 pthread_mutex_unlock(&sc->mtx); 1097 1098 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1099 command->prp2, (uint8_t *)&sc->health_log, 1100 MIN(logsize, sizeof(sc->health_log)), 1101 NVME_COPY_TO_PRP); 1102 break; 1103 case NVME_LOG_FIRMWARE_SLOT: 1104 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1105 command->prp2, (uint8_t *)&sc->fw_log, 1106 MIN(logsize, sizeof(sc->fw_log)), 1107 NVME_COPY_TO_PRP); 1108 break; 1109 default: 1110 DPRINTF("%s get log page %x command not supported", 1111 __func__, logpage); 1112 1113 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1114 NVME_SC_INVALID_LOG_PAGE); 1115 } 1116 1117 return (1); 1118 } 1119 1120 static int 1121 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1122 struct nvme_completion* compl) 1123 { 1124 void *dest; 1125 uint16_t status; 1126 1127 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1128 command->cdw10 & 0xFF, command->nsid); 1129 1130 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1131 1132 switch (command->cdw10 & 0xFF) { 1133 case 0x00: /* return Identify Namespace data structure */ 1134 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1135 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1136 NVME_COPY_TO_PRP); 1137 break; 1138 case 0x01: /* return Identify Controller data structure */ 1139 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1140 command->prp2, (uint8_t *)&sc->ctrldata, 1141 sizeof(sc->ctrldata), 1142 NVME_COPY_TO_PRP); 1143 break; 1144 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1145 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1146 sizeof(uint32_t) * 1024); 1147 /* All unused entries shall be zero */ 1148 bzero(dest, sizeof(uint32_t) * 1024); 1149 ((uint32_t *)dest)[0] = 1; 1150 break; 1151 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1152 if (command->nsid != 1) { 1153 pci_nvme_status_genc(&status, 1154 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1155 break; 1156 } 1157 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1158 sizeof(uint32_t) * 1024); 1159 /* All bytes after the descriptor shall be zero */ 1160 bzero(dest, sizeof(uint32_t) * 1024); 1161 1162 /* Return NIDT=1 (i.e. 
EUI64) descriptor */ 1163 ((uint8_t *)dest)[0] = 1; 1164 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1165 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1166 break; 1167 default: 1168 DPRINTF("%s unsupported identify command requested 0x%x", 1169 __func__, command->cdw10 & 0xFF); 1170 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1171 break; 1172 } 1173 1174 compl->status = status; 1175 return (1); 1176 } 1177 1178 static const char * 1179 nvme_fid_to_name(uint8_t fid) 1180 { 1181 const char *name; 1182 1183 switch (fid) { 1184 case NVME_FEAT_ARBITRATION: 1185 name = "Arbitration"; 1186 break; 1187 case NVME_FEAT_POWER_MANAGEMENT: 1188 name = "Power Management"; 1189 break; 1190 case NVME_FEAT_LBA_RANGE_TYPE: 1191 name = "LBA Range Type"; 1192 break; 1193 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1194 name = "Temperature Threshold"; 1195 break; 1196 case NVME_FEAT_ERROR_RECOVERY: 1197 name = "Error Recovery"; 1198 break; 1199 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1200 name = "Volatile Write Cache"; 1201 break; 1202 case NVME_FEAT_NUMBER_OF_QUEUES: 1203 name = "Number of Queues"; 1204 break; 1205 case NVME_FEAT_INTERRUPT_COALESCING: 1206 name = "Interrupt Coalescing"; 1207 break; 1208 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1209 name = "Interrupt Vector Configuration"; 1210 break; 1211 case NVME_FEAT_WRITE_ATOMICITY: 1212 name = "Write Atomicity Normal"; 1213 break; 1214 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1215 name = "Asynchronous Event Configuration"; 1216 break; 1217 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1218 name = "Autonomous Power State Transition"; 1219 break; 1220 case NVME_FEAT_HOST_MEMORY_BUFFER: 1221 name = "Host Memory Buffer"; 1222 break; 1223 case NVME_FEAT_TIMESTAMP: 1224 name = "Timestamp"; 1225 break; 1226 case NVME_FEAT_KEEP_ALIVE_TIMER: 1227 name = "Keep Alive Timer"; 1228 break; 1229 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1230 name = "Host Controlled Thermal Management"; 1231 break; 1232 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1233 name = "Non-Operation Power State Config"; 1234 break; 1235 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1236 name = "Read Recovery Level Config"; 1237 break; 1238 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1239 name = "Predictable Latency Mode Config"; 1240 break; 1241 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1242 name = "Predictable Latency Mode Window"; 1243 break; 1244 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1245 name = "LBA Status Information Report Interval"; 1246 break; 1247 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1248 name = "Host Behavior Support"; 1249 break; 1250 case NVME_FEAT_SANITIZE_CONFIG: 1251 name = "Sanitize Config"; 1252 break; 1253 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1254 name = "Endurance Group Event Configuration"; 1255 break; 1256 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1257 name = "Software Progress Marker"; 1258 break; 1259 case NVME_FEAT_HOST_IDENTIFIER: 1260 name = "Host Identifier"; 1261 break; 1262 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1263 name = "Reservation Notification Mask"; 1264 break; 1265 case NVME_FEAT_RESERVATION_PERSISTENCE: 1266 name = "Reservation Persistence"; 1267 break; 1268 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1269 name = "Namespace Write Protection Config"; 1270 break; 1271 default: 1272 name = "Unknown"; 1273 break; 1274 } 1275 1276 return (name); 1277 } 1278 1279 static void 1280 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1281 struct nvme_feature_obj *feat, 1282 struct 
nvme_command *command, 1283 struct nvme_completion *compl) 1284 { 1285 1286 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1287 } 1288 1289 static void 1290 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1291 struct nvme_feature_obj *feat, 1292 struct nvme_command *command, 1293 struct nvme_completion *compl) 1294 { 1295 uint32_t i; 1296 uint32_t cdw11 = command->cdw11; 1297 uint16_t iv; 1298 bool cd; 1299 1300 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1301 1302 iv = cdw11 & 0xffff; 1303 cd = cdw11 & (1 << 16); 1304 1305 if (iv > (sc->max_queues + 1)) { 1306 return; 1307 } 1308 1309 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1310 if ((iv == 0) && !cd) 1311 return; 1312 1313 /* Requested Interrupt Vector must be used by a CQ */ 1314 for (i = 0; i < sc->num_cqueues + 1; i++) { 1315 if (sc->compl_queues[i].intr_vec == iv) { 1316 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1317 } 1318 } 1319 1320 } 1321 1322 static void 1323 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1324 struct nvme_feature_obj *feat, 1325 struct nvme_command *command, 1326 struct nvme_completion *compl) 1327 { 1328 uint16_t nqr; /* Number of Queues Requested */ 1329 1330 if (sc->num_q_is_set) { 1331 WPRINTF("%s: Number of Queues already set", __func__); 1332 pci_nvme_status_genc(&compl->status, 1333 NVME_SC_COMMAND_SEQUENCE_ERROR); 1334 return; 1335 } 1336 1337 nqr = command->cdw11 & 0xFFFF; 1338 if (nqr == 0xffff) { 1339 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1340 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1341 return; 1342 } 1343 1344 sc->num_squeues = ONE_BASED(nqr); 1345 if (sc->num_squeues > sc->max_queues) { 1346 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1347 sc->max_queues); 1348 sc->num_squeues = sc->max_queues; 1349 } 1350 1351 nqr = (command->cdw11 >> 16) & 0xFFFF; 1352 if (nqr == 0xffff) { 1353 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1354 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1355 return; 1356 } 1357 1358 sc->num_cqueues = ONE_BASED(nqr); 1359 if (sc->num_cqueues > sc->max_queues) { 1360 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1361 sc->max_queues); 1362 sc->num_cqueues = sc->max_queues; 1363 } 1364 1365 /* Patch the command value which will be saved on callback's return */ 1366 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1367 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1368 1369 sc->num_q_is_set = true; 1370 } 1371 1372 static int 1373 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1374 struct nvme_completion *compl) 1375 { 1376 struct nvme_feature_obj *feat; 1377 uint32_t nsid = command->nsid; 1378 uint8_t fid = command->cdw10 & 0xFF; 1379 1380 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1381 1382 if (fid >= NVME_FID_MAX) { 1383 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1384 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1385 return (1); 1386 } 1387 feat = &sc->feat[fid]; 1388 1389 if (!feat->namespace_specific && 1390 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1391 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1392 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1393 return (1); 1394 } 1395 1396 compl->cdw0 = 0; 1397 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1398 1399 if (feat->set) 1400 feat->set(sc, feat, command, compl); 1401 1402 if (compl->status == NVME_SC_SUCCESS) 1403 feat->cdw11 = command->cdw11; 1404 
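	/*
	 * The cdw11 value saved here is what a later Get Features for this
	 * Feature ID reports back in Completion Dword 0 (see
	 * nvme_opc_get_features()).
	 */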
1405 return (0); 1406 } 1407 1408 static int 1409 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1410 struct nvme_completion* compl) 1411 { 1412 struct nvme_feature_obj *feat; 1413 uint8_t fid = command->cdw10 & 0xFF; 1414 1415 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1416 1417 if (fid >= NVME_FID_MAX) { 1418 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1419 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1420 return (1); 1421 } 1422 1423 compl->cdw0 = 0; 1424 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1425 1426 feat = &sc->feat[fid]; 1427 if (feat->get) { 1428 feat->get(sc, feat, command, compl); 1429 } 1430 1431 if (compl->status == NVME_SC_SUCCESS) { 1432 compl->cdw0 = feat->cdw11; 1433 } 1434 1435 return (0); 1436 } 1437 1438 static int 1439 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1440 struct nvme_completion* compl) 1441 { 1442 uint8_t ses, lbaf, pi; 1443 1444 /* Only supports Secure Erase Setting - User Data Erase */ 1445 ses = (command->cdw10 >> 9) & 0x7; 1446 if (ses > 0x1) { 1447 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1448 return (1); 1449 } 1450 1451 /* Only supports a single LBA Format */ 1452 lbaf = command->cdw10 & 0xf; 1453 if (lbaf != 0) { 1454 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1455 NVME_SC_INVALID_FORMAT); 1456 return (1); 1457 } 1458 1459 /* Doesn't support Protection Infomation */ 1460 pi = (command->cdw10 >> 5) & 0x7; 1461 if (pi != 0) { 1462 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1463 return (1); 1464 } 1465 1466 if (sc->nvstore.type == NVME_STOR_RAM) { 1467 if (sc->nvstore.ctx) 1468 free(sc->nvstore.ctx); 1469 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1470 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1471 } else { 1472 struct pci_nvme_ioreq *req; 1473 int err; 1474 1475 req = pci_nvme_get_ioreq(sc); 1476 if (req == NULL) { 1477 pci_nvme_status_genc(&compl->status, 1478 NVME_SC_INTERNAL_DEVICE_ERROR); 1479 WPRINTF("%s: unable to allocate IO req", __func__); 1480 return (1); 1481 } 1482 req->nvme_sq = &sc->submit_queues[0]; 1483 req->sqid = 0; 1484 req->opc = command->opc; 1485 req->cid = command->cid; 1486 req->nsid = command->nsid; 1487 1488 req->io_req.br_offset = 0; 1489 req->io_req.br_resid = sc->nvstore.size; 1490 req->io_req.br_callback = pci_nvme_io_done; 1491 1492 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1493 if (err) { 1494 pci_nvme_status_genc(&compl->status, 1495 NVME_SC_INTERNAL_DEVICE_ERROR); 1496 pci_nvme_release_ioreq(sc, req); 1497 } 1498 } 1499 1500 return (1); 1501 } 1502 1503 static int 1504 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1505 struct nvme_completion* compl) 1506 { 1507 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1508 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1509 1510 /* TODO: search for the command ID and abort it */ 1511 1512 compl->cdw0 = 1; 1513 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1514 return (1); 1515 } 1516 1517 static int 1518 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1519 struct nvme_command* command, struct nvme_completion* compl) 1520 { 1521 DPRINTF("%s async event request 0x%x", __func__, command->cdw11); 1522 1523 /* Don't exceed the Async Event Request Limit (AERL). 
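 * AERL is zero-based, so at most (ctrldata.aerl + 1) AERs may be
 * outstanding at once; pci_nvme_aer_limit_reached() enforces this.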
*/ 1524 if (pci_nvme_aer_limit_reached(sc)) { 1525 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1526 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1527 return (1); 1528 } 1529 1530 if (pci_nvme_aer_add(sc, command->cid)) { 1531 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1532 NVME_SC_INTERNAL_DEVICE_ERROR); 1533 return (1); 1534 } 1535 1536 /* 1537 * Raise events when they happen based on the Set Features cmd. 1538 * These events happen async, so only set completion successful if 1539 * there is an event reflective of the request to get event. 1540 */ 1541 compl->status = NVME_NO_STATUS; 1542 1543 return (0); 1544 } 1545 1546 static void 1547 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1548 { 1549 struct nvme_completion compl; 1550 struct nvme_command *cmd; 1551 struct nvme_submission_queue *sq; 1552 struct nvme_completion_queue *cq; 1553 uint16_t sqhead; 1554 1555 DPRINTF("%s index %u", __func__, (uint32_t)value); 1556 1557 sq = &sc->submit_queues[0]; 1558 cq = &sc->compl_queues[0]; 1559 1560 pthread_mutex_lock(&sq->mtx); 1561 1562 sqhead = sq->head; 1563 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1564 1565 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1566 cmd = &(sq->qbase)[sqhead]; 1567 compl.cdw0 = 0; 1568 compl.status = 0; 1569 1570 switch (cmd->opc) { 1571 case NVME_OPC_DELETE_IO_SQ: 1572 DPRINTF("%s command DELETE_IO_SQ", __func__); 1573 nvme_opc_delete_io_sq(sc, cmd, &compl); 1574 break; 1575 case NVME_OPC_CREATE_IO_SQ: 1576 DPRINTF("%s command CREATE_IO_SQ", __func__); 1577 nvme_opc_create_io_sq(sc, cmd, &compl); 1578 break; 1579 case NVME_OPC_DELETE_IO_CQ: 1580 DPRINTF("%s command DELETE_IO_CQ", __func__); 1581 nvme_opc_delete_io_cq(sc, cmd, &compl); 1582 break; 1583 case NVME_OPC_CREATE_IO_CQ: 1584 DPRINTF("%s command CREATE_IO_CQ", __func__); 1585 nvme_opc_create_io_cq(sc, cmd, &compl); 1586 break; 1587 case NVME_OPC_GET_LOG_PAGE: 1588 DPRINTF("%s command GET_LOG_PAGE", __func__); 1589 nvme_opc_get_log_page(sc, cmd, &compl); 1590 break; 1591 case NVME_OPC_IDENTIFY: 1592 DPRINTF("%s command IDENTIFY", __func__); 1593 nvme_opc_identify(sc, cmd, &compl); 1594 break; 1595 case NVME_OPC_ABORT: 1596 DPRINTF("%s command ABORT", __func__); 1597 nvme_opc_abort(sc, cmd, &compl); 1598 break; 1599 case NVME_OPC_SET_FEATURES: 1600 DPRINTF("%s command SET_FEATURES", __func__); 1601 nvme_opc_set_features(sc, cmd, &compl); 1602 break; 1603 case NVME_OPC_GET_FEATURES: 1604 DPRINTF("%s command GET_FEATURES", __func__); 1605 nvme_opc_get_features(sc, cmd, &compl); 1606 break; 1607 case NVME_OPC_FIRMWARE_ACTIVATE: 1608 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1609 pci_nvme_status_tc(&compl.status, 1610 NVME_SCT_COMMAND_SPECIFIC, 1611 NVME_SC_INVALID_FIRMWARE_SLOT); 1612 break; 1613 case NVME_OPC_ASYNC_EVENT_REQUEST: 1614 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1615 nvme_opc_async_event_req(sc, cmd, &compl); 1616 break; 1617 case NVME_OPC_FORMAT_NVM: 1618 DPRINTF("%s command FORMAT_NVM", __func__); 1619 if ((sc->ctrldata.oacs & 1620 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1621 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1622 } 1623 compl.status = NVME_NO_STATUS; 1624 nvme_opc_format_nvm(sc, cmd, &compl); 1625 break; 1626 default: 1627 DPRINTF("0x%x command is not implemented", 1628 cmd->opc); 1629 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1630 } 1631 sqhead = (sqhead + 1) % sq->size; 1632 1633 if (NVME_COMPLETION_VALID(compl)) { 1634 pci_nvme_cq_update(sc, 
	    &sc->compl_queues[0],
	    compl.cdw0,
	    cmd->cid,
	    0,		/* SQID */
	    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit covers 1 - 1,000 512 byte blocks; 3 data units cover
 * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing
 * the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
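 *
 * For example, with 512 byte sectors (sectsz_bits == 9), any slba with a
 * bit set in its upper 9 bits would overflow the byte offset calculation,
 * so such requests are rejected before the shift is performed.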
1699 */ 1700 static bool 1701 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 1702 uint32_t nlb) 1703 { 1704 size_t offset, bytes; 1705 1706 /* Overflow check of multiplying Starting LBA by the sector size */ 1707 if (slba >> (64 - nvstore->sectsz_bits)) 1708 return (true); 1709 1710 offset = slba << nvstore->sectsz_bits; 1711 bytes = nlb << nvstore->sectsz_bits; 1712 1713 /* Overflow check of Number of Logical Blocks */ 1714 if ((nvstore->size - offset) < bytes) 1715 return (true); 1716 1717 return (false); 1718 } 1719 1720 static int 1721 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 1722 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 1723 { 1724 int iovidx; 1725 1726 if (req == NULL) 1727 return (-1); 1728 1729 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 1730 return (-1); 1731 } 1732 1733 /* concatenate contig block-iovs to minimize number of iovs */ 1734 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 1735 iovidx = req->io_req.br_iovcnt - 1; 1736 1737 req->io_req.br_iov[iovidx].iov_base = 1738 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1739 req->prev_gpaddr, size); 1740 1741 req->prev_size += size; 1742 req->io_req.br_resid += size; 1743 1744 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 1745 } else { 1746 iovidx = req->io_req.br_iovcnt; 1747 if (iovidx == 0) { 1748 req->io_req.br_offset = lba; 1749 req->io_req.br_resid = 0; 1750 req->io_req.br_param = req; 1751 } 1752 1753 req->io_req.br_iov[iovidx].iov_base = 1754 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 1755 gpaddr, size); 1756 1757 req->io_req.br_iov[iovidx].iov_len = size; 1758 1759 req->prev_gpaddr = gpaddr; 1760 req->prev_size = size; 1761 req->io_req.br_resid += size; 1762 1763 req->io_req.br_iovcnt++; 1764 } 1765 1766 return (0); 1767 } 1768 1769 static void 1770 pci_nvme_set_completion(struct pci_nvme_softc *sc, 1771 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 1772 uint32_t cdw0, uint16_t status) 1773 { 1774 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 1775 1776 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 1777 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 1778 NVME_STATUS_GET_SC(status)); 1779 1780 pci_nvme_cq_update(sc, cq, 1781 0, /* CDW0 */ 1782 cid, 1783 sqid, 1784 status); 1785 1786 if (cq->head != cq->tail) { 1787 if (cq->intr_en & NVME_CQ_INTEN) { 1788 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 1789 } else { 1790 DPRINTF("%s: CQ%u interrupt disabled", 1791 __func__, sq->cqid); 1792 } 1793 } 1794 } 1795 1796 static void 1797 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 1798 { 1799 req->sc = NULL; 1800 req->nvme_sq = NULL; 1801 req->sqid = 0; 1802 1803 pthread_mutex_lock(&sc->mtx); 1804 1805 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 1806 sc->pending_ios--; 1807 1808 /* when no more IO pending, can set to ready if device reset/enabled */ 1809 if (sc->pending_ios == 0 && 1810 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 1811 sc->regs.csts |= NVME_CSTS_RDY; 1812 1813 pthread_mutex_unlock(&sc->mtx); 1814 1815 sem_post(&sc->iosemlock); 1816 } 1817 1818 static struct pci_nvme_ioreq * 1819 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 1820 { 1821 struct pci_nvme_ioreq *req = NULL;; 1822 1823 sem_wait(&sc->iosemlock); 1824 pthread_mutex_lock(&sc->mtx); 1825 1826 req = STAILQ_FIRST(&sc->ioreqs_free); 1827 assert(req != NULL); 1828 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 1829 1830 req->sc = sc; 1831 1832 
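	/*
	 * The sem_wait() above limits outstanding requests to the configured
	 * ioslots, which is why the free list is expected to be non-empty
	 * here (see the assert above).
	 */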
sc->pending_ios++; 1833 1834 pthread_mutex_unlock(&sc->mtx); 1835 1836 req->io_req.br_iovcnt = 0; 1837 req->io_req.br_offset = 0; 1838 req->io_req.br_resid = 0; 1839 req->io_req.br_param = req; 1840 req->prev_gpaddr = 0; 1841 req->prev_size = 0; 1842 1843 return req; 1844 } 1845 1846 static void 1847 pci_nvme_io_done(struct blockif_req *br, int err) 1848 { 1849 struct pci_nvme_ioreq *req = br->br_param; 1850 struct nvme_submission_queue *sq = req->nvme_sq; 1851 uint16_t code, status; 1852 1853 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 1854 1855 /* TODO return correct error */ 1856 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 1857 pci_nvme_status_genc(&status, code); 1858 1859 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 1860 pci_nvme_stats_write_read_update(req->sc, req->opc, 1861 req->bytes, status); 1862 pci_nvme_release_ioreq(req->sc, req); 1863 } 1864 1865 /* 1866 * Implements the Flush command. The specification states: 1867 * If a volatile write cache is not present, Flush commands complete 1868 * successfully and have no effect 1869 * in the description of the Volatile Write Cache (VWC) field of the Identify 1870 * Controller data. Therefore, set status to Success if the command is 1871 * not supported (i.e. RAM or as indicated by the blockif). 1872 */ 1873 static bool 1874 nvme_opc_flush(struct pci_nvme_softc *sc, 1875 struct nvme_command *cmd, 1876 struct pci_nvme_blockstore *nvstore, 1877 struct pci_nvme_ioreq *req, 1878 uint16_t *status) 1879 { 1880 bool pending = false; 1881 1882 if (nvstore->type == NVME_STOR_RAM) { 1883 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1884 } else { 1885 int err; 1886 1887 req->io_req.br_callback = pci_nvme_io_done; 1888 1889 err = blockif_flush(nvstore->ctx, &req->io_req); 1890 switch (err) { 1891 case 0: 1892 pending = true; 1893 break; 1894 case EOPNOTSUPP: 1895 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 1896 break; 1897 default: 1898 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 1899 } 1900 } 1901 1902 return (pending); 1903 } 1904 1905 static uint16_t 1906 nvme_write_read_ram(struct pci_nvme_softc *sc, 1907 struct pci_nvme_blockstore *nvstore, 1908 uint64_t prp1, uint64_t prp2, 1909 size_t offset, uint64_t bytes, 1910 bool is_write) 1911 { 1912 uint8_t *buf = nvstore->ctx; 1913 enum nvme_copy_dir dir; 1914 uint16_t status; 1915 1916 if (is_write) 1917 dir = NVME_COPY_TO_PRP; 1918 else 1919 dir = NVME_COPY_FROM_PRP; 1920 1921 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 1922 buf + offset, bytes, dir)) 1923 pci_nvme_status_genc(&status, 1924 NVME_SC_DATA_TRANSFER_ERROR); 1925 else 1926 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1927 1928 return (status); 1929 } 1930 1931 static uint16_t 1932 nvme_write_read_blockif(struct pci_nvme_softc *sc, 1933 struct pci_nvme_blockstore *nvstore, 1934 struct pci_nvme_ioreq *req, 1935 uint64_t prp1, uint64_t prp2, 1936 size_t offset, uint64_t bytes, 1937 bool is_write) 1938 { 1939 uint64_t size; 1940 int err; 1941 uint16_t status = NVME_NO_STATUS; 1942 1943 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 1944 if (pci_nvme_append_iov_req(sc, req, prp1, 1945 size, is_write, offset)) { 1946 pci_nvme_status_genc(&status, 1947 NVME_SC_DATA_TRANSFER_ERROR); 1948 goto out; 1949 } 1950 1951 offset += size; 1952 bytes -= size; 1953 1954 if (bytes == 0) { 1955 ; 1956 } else if (bytes <= PAGE_SIZE) { 1957 size = bytes; 1958 if (pci_nvme_append_iov_req(sc, req, prp2, 1959 size, is_write, offset)) { 1960 
pci_nvme_status_genc(&status, 1961 NVME_SC_DATA_TRANSFER_ERROR); 1962 goto out; 1963 } 1964 } else { 1965 void *vmctx = sc->nsc_pi->pi_vmctx; 1966 uint64_t *prp_list = &prp2; 1967 uint64_t *last = prp_list; 1968 1969 /* PRP2 is pointer to a physical region page list */ 1970 while (bytes) { 1971 /* Last entry in list points to the next list */ 1972 if (prp_list == last) { 1973 uint64_t prp = *prp_list; 1974 1975 prp_list = paddr_guest2host(vmctx, prp, 1976 PAGE_SIZE - (prp % PAGE_SIZE)); 1977 last = prp_list + (NVME_PRP2_ITEMS - 1); 1978 } 1979 1980 size = MIN(bytes, PAGE_SIZE); 1981 1982 if (pci_nvme_append_iov_req(sc, req, *prp_list, 1983 size, is_write, offset)) { 1984 pci_nvme_status_genc(&status, 1985 NVME_SC_DATA_TRANSFER_ERROR); 1986 goto out; 1987 } 1988 1989 offset += size; 1990 bytes -= size; 1991 1992 prp_list++; 1993 } 1994 } 1995 req->io_req.br_callback = pci_nvme_io_done; 1996 if (is_write) 1997 err = blockif_write(nvstore->ctx, &req->io_req); 1998 else 1999 err = blockif_read(nvstore->ctx, &req->io_req); 2000 2001 if (err) 2002 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2003 out: 2004 return (status); 2005 } 2006 2007 static bool 2008 nvme_opc_write_read(struct pci_nvme_softc *sc, 2009 struct nvme_command *cmd, 2010 struct pci_nvme_blockstore *nvstore, 2011 struct pci_nvme_ioreq *req, 2012 uint16_t *status) 2013 { 2014 uint64_t lba, nblocks, bytes; 2015 size_t offset; 2016 bool is_write = cmd->opc == NVME_OPC_WRITE; 2017 bool pending = false; 2018 2019 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2020 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2021 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2022 WPRINTF("%s command would exceed LBA range", __func__); 2023 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2024 goto out; 2025 } 2026 2027 bytes = nblocks << nvstore->sectsz_bits; 2028 if (bytes > NVME_MAX_DATA_SIZE) { 2029 WPRINTF("%s command would exceed MDTS", __func__); 2030 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2031 goto out; 2032 } 2033 2034 offset = lba << nvstore->sectsz_bits; 2035 2036 req->bytes = bytes; 2037 req->io_req.br_offset = lba; 2038 2039 /* PRP bits 1:0 must be zero */ 2040 cmd->prp1 &= ~0x3UL; 2041 cmd->prp2 &= ~0x3UL; 2042 2043 if (nvstore->type == NVME_STOR_RAM) { 2044 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2045 cmd->prp2, offset, bytes, is_write); 2046 } else { 2047 *status = nvme_write_read_blockif(sc, nvstore, req, 2048 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2049 2050 if (*status == NVME_NO_STATUS) 2051 pending = true; 2052 } 2053 out: 2054 if (!pending) 2055 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2056 2057 return (pending); 2058 } 2059 2060 static void 2061 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2062 { 2063 struct pci_nvme_ioreq *req = br->br_param; 2064 struct pci_nvme_softc *sc = req->sc; 2065 bool done = true; 2066 uint16_t status; 2067 2068 if (err) { 2069 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2070 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2071 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2072 } else { 2073 struct iovec *iov = req->io_req.br_iov; 2074 2075 req->prev_gpaddr++; 2076 iov += req->prev_gpaddr; 2077 2078 /* The iov_* values already include the sector size */ 2079 req->io_req.br_offset = (off_t)iov->iov_base; 2080 req->io_req.br_resid = iov->iov_len; 2081 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2082 pci_nvme_status_genc(&status, 2083 NVME_SC_INTERNAL_DEVICE_ERROR); 2084 } else 
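			/*
			 * blockif_delete() accepted the next range; leave
			 * done false so completion is deferred until this
			 * callback runs again for that range.
			 */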
2085 done = false; 2086 } 2087 2088 if (done) { 2089 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2090 req->cid, 0, status); 2091 pci_nvme_release_ioreq(sc, req); 2092 } 2093 } 2094 2095 static bool 2096 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2097 struct nvme_command *cmd, 2098 struct pci_nvme_blockstore *nvstore, 2099 struct pci_nvme_ioreq *req, 2100 uint16_t *status) 2101 { 2102 struct nvme_dsm_range *range; 2103 uint32_t nr, r, non_zero, dr; 2104 int err; 2105 bool pending = false; 2106 2107 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2108 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2109 goto out; 2110 } 2111 2112 nr = cmd->cdw10 & 0xff; 2113 2114 /* copy locally because a range entry could straddle PRPs */ 2115 range = calloc(1, NVME_MAX_DSM_TRIM); 2116 if (range == NULL) { 2117 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2118 goto out; 2119 } 2120 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2121 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2122 2123 /* Check for invalid ranges and the number of non-zero lengths */ 2124 non_zero = 0; 2125 for (r = 0; r <= nr; r++) { 2126 if (pci_nvme_out_of_range(nvstore, 2127 range[r].starting_lba, range[r].length)) { 2128 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2129 goto out; 2130 } 2131 if (range[r].length != 0) 2132 non_zero++; 2133 } 2134 2135 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2136 size_t offset, bytes; 2137 int sectsz_bits = sc->nvstore.sectsz_bits; 2138 2139 /* 2140 * DSM calls are advisory only, and compliant controllers 2141 * may choose to take no actions (i.e. return Success). 2142 */ 2143 if (!nvstore->deallocate) { 2144 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2145 goto out; 2146 } 2147 2148 /* If all ranges have a zero length, return Success */ 2149 if (non_zero == 0) { 2150 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2151 goto out; 2152 } 2153 2154 if (req == NULL) { 2155 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2156 goto out; 2157 } 2158 2159 offset = range[0].starting_lba << sectsz_bits; 2160 bytes = range[0].length << sectsz_bits; 2161 2162 /* 2163 * If the request is for more than a single range, store 2164 * the ranges in the br_iov. Optimize for the common case 2165 * of a single range. 
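		 * For example, a command with three non-empty ranges (cdw10
		 * NR == 2) leaves dr == 3: the first delete is issued below
		 * using br_offset/br_resid for range 0, and
		 * pci_nvme_dealloc_sm() then walks iov[1] and iov[2].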
2166 * 2167 * Note that NVMe Number of Ranges is a zero based value 2168 */ 2169 req->io_req.br_iovcnt = 0; 2170 req->io_req.br_offset = offset; 2171 req->io_req.br_resid = bytes; 2172 2173 if (nr == 0) { 2174 req->io_req.br_callback = pci_nvme_io_done; 2175 } else { 2176 struct iovec *iov = req->io_req.br_iov; 2177 2178 for (r = 0, dr = 0; r <= nr; r++) { 2179 offset = range[r].starting_lba << sectsz_bits; 2180 bytes = range[r].length << sectsz_bits; 2181 if (bytes == 0) 2182 continue; 2183 2184 if ((nvstore->size - offset) < bytes) { 2185 pci_nvme_status_genc(status, 2186 NVME_SC_LBA_OUT_OF_RANGE); 2187 goto out; 2188 } 2189 iov[dr].iov_base = (void *)offset; 2190 iov[dr].iov_len = bytes; 2191 dr++; 2192 } 2193 req->io_req.br_callback = pci_nvme_dealloc_sm; 2194 2195 /* 2196 * Use prev_gpaddr to track the current entry and 2197 * prev_size to track the number of entries 2198 */ 2199 req->prev_gpaddr = 0; 2200 req->prev_size = dr; 2201 } 2202 2203 err = blockif_delete(nvstore->ctx, &req->io_req); 2204 if (err) 2205 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2206 else 2207 pending = true; 2208 } 2209 out: 2210 free(range); 2211 return (pending); 2212 } 2213 2214 static void 2215 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2216 { 2217 struct nvme_submission_queue *sq; 2218 uint16_t status; 2219 uint16_t sqhead; 2220 2221 /* handle all submissions up to sq->tail index */ 2222 sq = &sc->submit_queues[idx]; 2223 2224 pthread_mutex_lock(&sq->mtx); 2225 2226 sqhead = sq->head; 2227 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2228 idx, sqhead, sq->tail, sq->qbase); 2229 2230 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2231 struct nvme_command *cmd; 2232 struct pci_nvme_ioreq *req; 2233 uint32_t nsid; 2234 bool pending; 2235 2236 pending = false; 2237 req = NULL; 2238 status = 0; 2239 2240 cmd = &sq->qbase[sqhead]; 2241 sqhead = (sqhead + 1) % sq->size; 2242 2243 nsid = le32toh(cmd->nsid); 2244 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2245 pci_nvme_status_genc(&status, 2246 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2247 status |= 2248 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2249 goto complete; 2250 } 2251 2252 req = pci_nvme_get_ioreq(sc); 2253 if (req == NULL) { 2254 pci_nvme_status_genc(&status, 2255 NVME_SC_INTERNAL_DEVICE_ERROR); 2256 WPRINTF("%s: unable to allocate IO req", __func__); 2257 goto complete; 2258 } 2259 req->nvme_sq = sq; 2260 req->sqid = idx; 2261 req->opc = cmd->opc; 2262 req->cid = cmd->cid; 2263 req->nsid = cmd->nsid; 2264 2265 switch (cmd->opc) { 2266 case NVME_OPC_FLUSH: 2267 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2268 req, &status); 2269 break; 2270 case NVME_OPC_WRITE: 2271 case NVME_OPC_READ: 2272 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2273 req, &status); 2274 break; 2275 case NVME_OPC_WRITE_ZEROES: 2276 /* TODO: write zeroes 2277 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2278 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2279 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2280 break; 2281 case NVME_OPC_DATASET_MANAGEMENT: 2282 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2283 req, &status); 2284 break; 2285 default: 2286 WPRINTF("%s unhandled io command 0x%x", 2287 __func__, cmd->opc); 2288 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2289 } 2290 complete: 2291 if (!pending) { 2292 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2293 status); 2294 if (req != NULL) 2295 pci_nvme_release_ioreq(sc, req); 2296 } 2297 } 2298 2299 sq->head = sqhead; 2300 2301 
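	/*
	 * sq->head now reflects every submission entry consumed above; the
	 * completion path reports this value back to the guest as the SQ
	 * Head Pointer, which is what lets the guest reuse those SQ slots.
	 */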
pthread_mutex_unlock(&sq->mtx); 2302 } 2303 2304 static void 2305 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2306 uint64_t idx, int is_sq, uint64_t value) 2307 { 2308 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2309 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2310 2311 if (is_sq) { 2312 if (idx > sc->num_squeues) { 2313 WPRINTF("%s queue index %lu overflow from " 2314 "guest (max %u)", 2315 __func__, idx, sc->num_squeues); 2316 return; 2317 } 2318 2319 atomic_store_short(&sc->submit_queues[idx].tail, 2320 (uint16_t)value); 2321 2322 if (idx == 0) { 2323 pci_nvme_handle_admin_cmd(sc, value); 2324 } else { 2325 /* submission queue; handle new entries in SQ */ 2326 if (idx > sc->num_squeues) { 2327 WPRINTF("%s SQ index %lu overflow from " 2328 "guest (max %u)", 2329 __func__, idx, sc->num_squeues); 2330 return; 2331 } 2332 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2333 } 2334 } else { 2335 if (idx > sc->num_cqueues) { 2336 WPRINTF("%s queue index %lu overflow from " 2337 "guest (max %u)", 2338 __func__, idx, sc->num_cqueues); 2339 return; 2340 } 2341 2342 atomic_store_short(&sc->compl_queues[idx].head, 2343 (uint16_t)value); 2344 } 2345 } 2346 2347 static void 2348 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2349 { 2350 const char *s = iswrite ? "WRITE" : "READ"; 2351 2352 switch (offset) { 2353 case NVME_CR_CAP_LOW: 2354 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2355 break; 2356 case NVME_CR_CAP_HI: 2357 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2358 break; 2359 case NVME_CR_VS: 2360 DPRINTF("%s %s NVME_CR_VS", func, s); 2361 break; 2362 case NVME_CR_INTMS: 2363 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2364 break; 2365 case NVME_CR_INTMC: 2366 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2367 break; 2368 case NVME_CR_CC: 2369 DPRINTF("%s %s NVME_CR_CC", func, s); 2370 break; 2371 case NVME_CR_CSTS: 2372 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2373 break; 2374 case NVME_CR_NSSR: 2375 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2376 break; 2377 case NVME_CR_AQA: 2378 DPRINTF("%s %s NVME_CR_AQA", func, s); 2379 break; 2380 case NVME_CR_ASQ_LOW: 2381 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2382 break; 2383 case NVME_CR_ASQ_HI: 2384 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2385 break; 2386 case NVME_CR_ACQ_LOW: 2387 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2388 break; 2389 case NVME_CR_ACQ_HI: 2390 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2391 break; 2392 default: 2393 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2394 } 2395 2396 } 2397 2398 static void 2399 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2400 uint64_t offset, int size, uint64_t value) 2401 { 2402 uint32_t ccreg; 2403 2404 if (offset >= NVME_DOORBELL_OFFSET) { 2405 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2406 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2407 int is_sq = (belloffset % 8) < 4; 2408 2409 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2410 WPRINTF("guest attempted an overflow write offset " 2411 "0x%lx, val 0x%lx in %s", 2412 offset, value, __func__); 2413 return; 2414 } 2415 2416 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2417 return; 2418 } 2419 2420 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2421 offset, size, value); 2422 2423 if (size != 4) { 2424 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2425 "val 0x%lx) to bar0 in %s", 2426 size, offset, value, __func__); 2427 /* TODO: shutdown device */ 2428 return; 2429 } 2430 2431 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 
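	/*
	 * The doorbell decode above assumes the default 4-byte doorbell
	 * stride (CAP.DSTRD == 0): each queue pair owns 8 bytes starting at
	 * NVME_DOORBELL_OFFSET, e.g. with the standard register layout
	 * offset 0x1000 is the admin SQ tail, 0x1004 the admin CQ head,
	 * 0x1008 the SQ1 tail, and so on.  Everything below this point is a
	 * dword write to a control register, serialized by sc->mtx.
	 */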
2432 2433 pthread_mutex_lock(&sc->mtx); 2434 2435 switch (offset) { 2436 case NVME_CR_CAP_LOW: 2437 case NVME_CR_CAP_HI: 2438 /* readonly */ 2439 break; 2440 case NVME_CR_VS: 2441 /* readonly */ 2442 break; 2443 case NVME_CR_INTMS: 2444 /* MSI-X, so ignore */ 2445 break; 2446 case NVME_CR_INTMC: 2447 /* MSI-X, so ignore */ 2448 break; 2449 case NVME_CR_CC: 2450 ccreg = (uint32_t)value; 2451 2452 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2453 "iocqes %u", 2454 __func__, 2455 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2456 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2457 NVME_CC_GET_IOCQES(ccreg)); 2458 2459 if (NVME_CC_GET_SHN(ccreg)) { 2460 /* perform shutdown - flush out data to backend */ 2461 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2462 NVME_CSTS_REG_SHST_SHIFT); 2463 sc->regs.csts |= NVME_SHST_COMPLETE << 2464 NVME_CSTS_REG_SHST_SHIFT; 2465 } 2466 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2467 if (NVME_CC_GET_EN(ccreg) == 0) 2468 /* transition 1-> causes controller reset */ 2469 pci_nvme_reset_locked(sc); 2470 else 2471 pci_nvme_init_controller(ctx, sc); 2472 } 2473 2474 /* Insert the iocqes, iosqes and en bits from the write */ 2475 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2476 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2477 if (NVME_CC_GET_EN(ccreg) == 0) { 2478 /* Insert the ams, mps and css bit fields */ 2479 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2480 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2481 sc->regs.csts &= ~NVME_CSTS_RDY; 2482 } else if (sc->pending_ios == 0) { 2483 sc->regs.csts |= NVME_CSTS_RDY; 2484 } 2485 break; 2486 case NVME_CR_CSTS: 2487 break; 2488 case NVME_CR_NSSR: 2489 /* ignore writes; don't support subsystem reset */ 2490 break; 2491 case NVME_CR_AQA: 2492 sc->regs.aqa = (uint32_t)value; 2493 break; 2494 case NVME_CR_ASQ_LOW: 2495 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2496 (0xFFFFF000 & value); 2497 break; 2498 case NVME_CR_ASQ_HI: 2499 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2500 (value << 32); 2501 break; 2502 case NVME_CR_ACQ_LOW: 2503 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2504 (0xFFFFF000 & value); 2505 break; 2506 case NVME_CR_ACQ_HI: 2507 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2508 (value << 32); 2509 break; 2510 default: 2511 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2512 __func__, offset, value, size); 2513 } 2514 pthread_mutex_unlock(&sc->mtx); 2515 } 2516 2517 static void 2518 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2519 int baridx, uint64_t offset, int size, uint64_t value) 2520 { 2521 struct pci_nvme_softc* sc = pi->pi_arg; 2522 2523 if (baridx == pci_msix_table_bar(pi) || 2524 baridx == pci_msix_pba_bar(pi)) { 2525 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2526 " value 0x%lx", baridx, offset, size, value); 2527 2528 pci_emul_msix_twrite(pi, offset, size, value); 2529 return; 2530 } 2531 2532 switch (baridx) { 2533 case 0: 2534 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2535 break; 2536 2537 default: 2538 DPRINTF("%s unknown baridx %d, val 0x%lx", 2539 __func__, baridx, value); 2540 } 2541 } 2542 2543 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2544 uint64_t offset, int size) 2545 { 2546 uint64_t value; 2547 2548 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2549 2550 if (offset < NVME_DOORBELL_OFFSET) { 2551 void *p = &(sc->regs); 2552 pthread_mutex_lock(&sc->mtx); 2553 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2554 
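		/*
		 * Reads below the doorbell region are served from the shadow
		 * register file in sc->regs; the memcpy handles 1-, 2- and
		 * 4-byte accesses at arbitrary offsets, and the switch below
		 * masks the result down to the access size.
		 */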
pthread_mutex_unlock(&sc->mtx); 2555 } else { 2556 value = 0; 2557 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2558 } 2559 2560 switch (size) { 2561 case 1: 2562 value &= 0xFF; 2563 break; 2564 case 2: 2565 value &= 0xFFFF; 2566 break; 2567 case 4: 2568 value &= 0xFFFFFFFF; 2569 break; 2570 } 2571 2572 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2573 offset, size, (uint32_t)value); 2574 2575 return (value); 2576 } 2577 2578 2579 2580 static uint64_t 2581 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2582 uint64_t offset, int size) 2583 { 2584 struct pci_nvme_softc* sc = pi->pi_arg; 2585 2586 if (baridx == pci_msix_table_bar(pi) || 2587 baridx == pci_msix_pba_bar(pi)) { 2588 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2589 baridx, offset, size); 2590 2591 return pci_emul_msix_tread(pi, offset, size); 2592 } 2593 2594 switch (baridx) { 2595 case 0: 2596 return pci_nvme_read_bar_0(sc, offset, size); 2597 2598 default: 2599 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2600 } 2601 2602 return (0); 2603 } 2604 2605 2606 static int 2607 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) 2608 { 2609 char bident[sizeof("XX:X:X")]; 2610 char *uopt, *xopts, *config; 2611 uint32_t sectsz; 2612 int optidx; 2613 2614 sc->max_queues = NVME_QUEUES; 2615 sc->max_qentries = NVME_MAX_QENTRIES; 2616 sc->ioslots = NVME_IOSLOTS; 2617 sc->num_squeues = sc->max_queues; 2618 sc->num_cqueues = sc->max_queues; 2619 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2620 sectsz = 0; 2621 2622 uopt = strdup(opts); 2623 optidx = 0; 2624 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2625 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2626 for (xopts = strtok(uopt, ","); 2627 xopts != NULL; 2628 xopts = strtok(NULL, ",")) { 2629 2630 if ((config = strchr(xopts, '=')) != NULL) 2631 *config++ = '\0'; 2632 2633 if (!strcmp("maxq", xopts)) { 2634 sc->max_queues = atoi(config); 2635 } else if (!strcmp("qsz", xopts)) { 2636 sc->max_qentries = atoi(config); 2637 } else if (!strcmp("ioslots", xopts)) { 2638 sc->ioslots = atoi(config); 2639 } else if (!strcmp("sectsz", xopts)) { 2640 sectsz = atoi(config); 2641 } else if (!strcmp("ser", xopts)) { 2642 /* 2643 * This field indicates the Product Serial Number in 2644 * 7-bit ASCII, unused bytes should be space characters. 2645 * Ref: NVMe v1.3c. 
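			 * For example, ser=BHYVE1234 (a hypothetical value)
			 * is stored as "BHYVE1234" followed by eleven space
			 * characters in the 20-byte sn field.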
2646 */ 2647 cpywithpad((char *)sc->ctrldata.sn, 2648 sizeof(sc->ctrldata.sn), config, ' '); 2649 } else if (!strcmp("ram", xopts)) { 2650 uint64_t sz = strtoull(&xopts[4], NULL, 10); 2651 2652 sc->nvstore.type = NVME_STOR_RAM; 2653 sc->nvstore.size = sz * 1024 * 1024; 2654 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2655 sc->nvstore.sectsz = 4096; 2656 sc->nvstore.sectsz_bits = 12; 2657 if (sc->nvstore.ctx == NULL) { 2658 perror("Unable to allocate RAM"); 2659 free(uopt); 2660 return (-1); 2661 } 2662 } else if (!strcmp("eui64", xopts)) { 2663 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 2664 } else if (!strcmp("dsm", xopts)) { 2665 if (!strcmp("auto", config)) 2666 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2667 else if (!strcmp("enable", config)) 2668 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2669 else if (!strcmp("disable", config)) 2670 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2671 } else if (optidx == 0) { 2672 snprintf(bident, sizeof(bident), "%d:%d", 2673 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2674 sc->nvstore.ctx = blockif_open(xopts, bident); 2675 if (sc->nvstore.ctx == NULL) { 2676 perror("Could not open backing file"); 2677 free(uopt); 2678 return (-1); 2679 } 2680 sc->nvstore.type = NVME_STOR_BLOCKIF; 2681 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2682 } else { 2683 EPRINTLN("Invalid option %s", xopts); 2684 free(uopt); 2685 return (-1); 2686 } 2687 2688 optidx++; 2689 } 2690 free(uopt); 2691 2692 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 2693 EPRINTLN("backing store not specified"); 2694 return (-1); 2695 } 2696 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2697 sc->nvstore.sectsz = sectsz; 2698 else if (sc->nvstore.type != NVME_STOR_RAM) 2699 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2700 for (sc->nvstore.sectsz_bits = 9; 2701 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2702 sc->nvstore.sectsz_bits++); 2703 2704 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 2705 sc->max_queues = NVME_QUEUES; 2706 2707 if (sc->max_qentries <= 0) { 2708 EPRINTLN("Invalid qsz option"); 2709 return (-1); 2710 } 2711 if (sc->ioslots <= 0) { 2712 EPRINTLN("Invalid ioslots option"); 2713 return (-1); 2714 } 2715 2716 return (0); 2717 } 2718 2719 static int 2720 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 2721 { 2722 struct pci_nvme_softc *sc; 2723 uint32_t pci_membar_sz; 2724 int error; 2725 2726 error = 0; 2727 2728 sc = calloc(1, sizeof(struct pci_nvme_softc)); 2729 pi->pi_arg = sc; 2730 sc->nsc_pi = pi; 2731 2732 error = pci_nvme_parse_opts(sc, opts); 2733 if (error < 0) 2734 goto done; 2735 else 2736 error = 0; 2737 2738 STAILQ_INIT(&sc->ioreqs_free); 2739 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 2740 for (int i = 0; i < sc->ioslots; i++) { 2741 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 2742 } 2743 2744 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 2745 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 2746 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 2747 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 2748 pci_set_cfgdata8(pi, PCIR_PROGIF, 2749 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 2750 2751 /* 2752 * Allocate size of NVMe registers + doorbell space for all queues. 2753 * 2754 * The specification requires a minimum memory I/O window size of 16K. 2755 * The Windows driver will refuse to start a device with a smaller 2756 * window. 
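	 * With the default of 16 queue pairs, the computed size (the fixed
	 * register file plus 17 doorbell pairs) is well under 16 KiB, so the
	 * MAX() below rounds the BAR up to NVME_MMIO_SPACE_MIN.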
2757 */ 2758 pci_membar_sz = sizeof(struct nvme_registers) + 2759 2 * sizeof(uint32_t) * (sc->max_queues + 1); 2760 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 2761 2762 DPRINTF("nvme membar size: %u", pci_membar_sz); 2763 2764 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2765 if (error) { 2766 WPRINTF("%s pci alloc mem bar failed", __func__); 2767 goto done; 2768 } 2769 2770 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2771 if (error) { 2772 WPRINTF("%s pci add msixcap failed", __func__); 2773 goto done; 2774 } 2775 2776 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2777 if (error) { 2778 WPRINTF("%s pci add Express capability failed", __func__); 2779 goto done; 2780 } 2781 2782 pthread_mutex_init(&sc->mtx, NULL); 2783 sem_init(&sc->iosemlock, 0, sc->ioslots); 2784 2785 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 2786 /* 2787 * Controller data depends on Namespace data so initialize Namespace 2788 * data first. 2789 */ 2790 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 2791 pci_nvme_init_ctrldata(sc); 2792 pci_nvme_init_logpages(sc); 2793 pci_nvme_init_features(sc); 2794 2795 pci_nvme_aer_init(sc); 2796 2797 pci_nvme_reset(sc); 2798 2799 pci_lintr_request(pi); 2800 2801 done: 2802 return (error); 2803 } 2804 2805 2806 struct pci_devemu pci_de_nvme = { 2807 .pe_emu = "nvme", 2808 .pe_init = pci_nvme_init, 2809 .pe_barwrite = pci_nvme_write, 2810 .pe_barread = pci_nvme_read 2811 }; 2812 PCI_EMUL_SET(pci_de_nvme); 2813
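/*
 * PCI_EMUL_SET() places pci_de_nvme in the linker set consumed by the
 * generic PCI emulation code, so a device named "nvme" on the command line
 * (for example, a hypothetical "-s 3,nvme,/dev/nvd1,ser=NVME0001") is set up
 * by pci_nvme_init() and has its BAR accesses dispatched to pci_nvme_read()
 * and pci_nvme_write().
 */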