1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <semaphore.h> 70 #include <stdbool.h> 71 #include <stddef.h> 72 #include <stdint.h> 73 #include <stdio.h> 74 #include <stdlib.h> 75 #include <string.h> 76 77 #include <machine/atomic.h> 78 #include <machine/vmm.h> 79 #include <vmmapi.h> 80 81 #include <dev/nvme/nvme.h> 82 83 #include "bhyverun.h" 84 #include "block_if.h" 85 #include "config.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 89 90 static int nvme_debug = 0; 91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 92 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 93 94 /* defaults; can be overridden */ 95 #define NVME_MSIX_BAR 4 96 97 #define NVME_IOSLOTS 8 98 99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 100 #define NVME_MMIO_SPACE_MIN (1 << 14) 101 102 #define NVME_QUEUES 16 103 #define NVME_MAX_QENTRIES 2048 104 /* Memory Page size Minimum reported in CAP register */ 105 #define NVME_MPSMIN 0 106 /* MPSMIN converted to bytes */ 107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 108 109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 110 #define NVME_MDTS 9 111 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 114 115 /* This is a synthetic status code to indicate there is no status */ 116 #define NVME_NO_STATUS 0xffff 117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 118 119 /* helpers */ 120 121 /* Convert a zero-based value into a one-based value */ 122 #define ONE_BASED(zero) ((zero) + 1) 123 /* Convert a one-based value into a zero-based value */ 124 #define ZERO_BASED(one) ((one) - 1) 125 126 /* Encode number of SQ's and CQ's for Set/Get Features */ 127 #define NVME_FEATURE_NUM_QUEUES(sc) \ 128 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 129 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 130 131 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 132 133 enum nvme_controller_register_offsets { 134 NVME_CR_CAP_LOW = 0x00, 135 NVME_CR_CAP_HI = 0x04, 136 NVME_CR_VS = 0x08, 137 NVME_CR_INTMS = 0x0c, 138 NVME_CR_INTMC = 0x10, 139 NVME_CR_CC = 0x14, 140 NVME_CR_CSTS = 0x1c, 141 NVME_CR_NSSR = 0x20, 142 NVME_CR_AQA = 0x24, 143 NVME_CR_ASQ_LOW = 0x28, 144 NVME_CR_ASQ_HI = 0x2c, 145 NVME_CR_ACQ_LOW = 0x30, 146 NVME_CR_ACQ_HI = 0x34, 147 }; 148 149 enum nvme_cmd_cdw11 { 150 NVME_CMD_CDW11_PC = 0x0001, 151 NVME_CMD_CDW11_IEN = 0x0002, 152 NVME_CMD_CDW11_IV = 0xFFFF0000, 153 }; 154 155 enum nvme_copy_dir { 156 NVME_COPY_TO_PRP, 157 NVME_COPY_FROM_PRP, 158 }; 159 160 #define NVME_CQ_INTEN 0x01 161 #define NVME_CQ_INTCOAL 0x02 162 163 struct nvme_completion_queue { 164 struct nvme_completion *qbase; 165 pthread_mutex_t mtx; 166 uint32_t size; 167 uint16_t tail; /* nvme progress */ 168 uint16_t head; /* guest progress */ 169 uint16_t intr_vec; 170 uint32_t intr_en; 171 }; 172 173 struct nvme_submission_queue { 174 struct nvme_command *qbase; 175 pthread_mutex_t mtx; 176 uint32_t size; 177 uint16_t head; /* nvme progress */ 178 uint16_t tail; /* guest progress */ 179 uint16_t cqid; /* completion queue id */ 180 int qpriority; 181 }; 182 183 enum nvme_storage_type { 184 NVME_STOR_BLOCKIF = 0, 185 NVME_STOR_RAM = 1, 186 }; 187 188 struct pci_nvme_blockstore { 189 enum nvme_storage_type type; 190 void *ctx; 191 uint64_t size; 192 uint32_t sectsz; 193 uint32_t sectsz_bits; 194 uint64_t eui64; 195 uint32_t deallocate:1; 196 }; 197 198 /* 199 * Calculate the number of additional page descriptors for guest IO requests 200 * based on the advertised Max Data Transfer (MDTS) and given the number of 201 * default iovec's in a struct blockif_req. 202 */ 203 #define MDTS_PAD_SIZE \ 204 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 205 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 206 0 ) 207 208 struct pci_nvme_ioreq { 209 struct pci_nvme_softc *sc; 210 STAILQ_ENTRY(pci_nvme_ioreq) link; 211 struct nvme_submission_queue *nvme_sq; 212 uint16_t sqid; 213 214 /* command information */ 215 uint16_t opc; 216 uint16_t cid; 217 uint32_t nsid; 218 219 uint64_t prev_gpaddr; 220 size_t prev_size; 221 size_t bytes; 222 223 struct blockif_req io_req; 224 225 struct iovec iovpadding[MDTS_PAD_SIZE]; 226 }; 227 228 enum nvme_dsm_type { 229 /* Dataset Management bit in ONCS reflects backing storage capability */ 230 NVME_DATASET_MANAGEMENT_AUTO, 231 /* Unconditionally set Dataset Management bit in ONCS */ 232 NVME_DATASET_MANAGEMENT_ENABLE, 233 /* Unconditionally clear Dataset Management bit in ONCS */ 234 NVME_DATASET_MANAGEMENT_DISABLE, 235 }; 236 237 struct pci_nvme_softc; 238 struct nvme_feature_obj; 239 240 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 241 struct nvme_feature_obj *, 242 struct nvme_command *, 243 struct nvme_completion *); 244 245 struct nvme_feature_obj { 246 uint32_t cdw11; 247 nvme_feature_cb set; 248 nvme_feature_cb get; 249 bool namespace_specific; 250 }; 251 252 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 253 254 typedef enum { 255 PCI_NVME_AE_TYPE_ERROR = 0, 256 PCI_NVME_AE_TYPE_SMART, 257 PCI_NVME_AE_TYPE_NOTICE, 258 PCI_NVME_AE_TYPE_IO_CMD = 6, 259 PCI_NVME_AE_TYPE_VENDOR = 7, 260 PCI_NVME_AE_TYPE_MAX /* Must be last */ 261 } pci_nvme_async_type; 262 263 /* Asynchronous Event Requests */ 264 struct pci_nvme_aer { 265 STAILQ_ENTRY(pci_nvme_aer) link; 266 uint16_t cid; /* Command ID of the submitted AER */ 267 }; 268 269 typedef enum { 270 PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0, 271 PCI_NVME_AE_INFO_FW_ACTIVATION, 272 PCI_NVME_AE_INFO_TELEMETRY_CHANGE, 273 PCI_NVME_AE_INFO_ANA_CHANGE, 274 PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE, 275 PCI_NVME_AE_INFO_LBA_STATUS_ALERT, 276 PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE, 277 PCI_NVME_AE_INFO_MAX, 278 } pci_nvme_async_info; 279 280 /* Asynchronous Event Notifications */ 281 struct pci_nvme_aen { 282 pci_nvme_async_type atype; 283 uint32_t event_data; 284 bool posted; 285 }; 286 287 struct pci_nvme_softc { 288 struct pci_devinst *nsc_pi; 289 290 pthread_mutex_t mtx; 291 292 struct nvme_registers regs; 293 294 struct nvme_namespace_data nsdata; 295 struct nvme_controller_data ctrldata; 296 struct nvme_error_information_entry err_log; 297 struct nvme_health_information_page health_log; 298 struct nvme_firmware_page fw_log; 299 struct nvme_ns_list ns_log; 300 301 struct pci_nvme_blockstore nvstore; 302 303 uint16_t max_qentries; /* max entries per queue */ 304 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 305 uint32_t num_cqueues; 306 uint32_t num_squeues; 307 bool num_q_is_set; /* Has host set Number of Queues */ 308 309 struct pci_nvme_ioreq *ioreqs; 310 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 311 uint32_t pending_ios; 312 uint32_t ioslots; 313 sem_t iosemlock; 314 315 /* 316 * Memory mapped Submission and Completion queues 317 * Each array includes both Admin and IO queues 318 */ 319 struct nvme_completion_queue *compl_queues; 320 struct nvme_submission_queue *submit_queues; 321 322 struct nvme_feature_obj feat[NVME_FID_MAX]; 323 324 enum nvme_dsm_type dataset_management; 325 326 /* Accounting for SMART data */ 327 __uint128_t read_data_units; 328 __uint128_t write_data_units; 329 __uint128_t read_commands; 330 __uint128_t write_commands; 331 uint32_t read_dunits_remainder; 332 uint32_t 
write_dunits_remainder; 333 334 STAILQ_HEAD(, pci_nvme_aer) aer_list; 335 pthread_mutex_t aer_mtx; 336 uint32_t aer_count; 337 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 338 pthread_t aen_tid; 339 pthread_mutex_t aen_mtx; 340 pthread_cond_t aen_cond; 341 }; 342 343 344 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 345 struct nvme_completion_queue *cq, 346 uint32_t cdw0, 347 uint16_t cid, 348 uint16_t sqid, 349 uint16_t status); 350 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 351 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 352 static void pci_nvme_io_done(struct blockif_req *, int); 353 354 /* Controller Configuration utils */ 355 #define NVME_CC_GET_EN(cc) \ 356 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 357 #define NVME_CC_GET_CSS(cc) \ 358 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 359 #define NVME_CC_GET_SHN(cc) \ 360 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 361 #define NVME_CC_GET_IOSQES(cc) \ 362 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 363 #define NVME_CC_GET_IOCQES(cc) \ 364 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 365 366 #define NVME_CC_WRITE_MASK \ 367 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 368 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 369 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 370 371 #define NVME_CC_NEN_WRITE_MASK \ 372 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 373 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 374 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 375 376 /* Controller Status utils */ 377 #define NVME_CSTS_GET_RDY(sts) \ 378 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 379 380 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 381 382 /* Completion Queue status word utils */ 383 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 384 #define NVME_STATUS_MASK \ 385 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 386 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 387 388 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 389 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 390 391 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 392 struct nvme_feature_obj *, 393 struct nvme_command *, 394 struct nvme_completion *); 395 static void nvme_feature_num_queues(struct pci_nvme_softc *, 396 struct nvme_feature_obj *, 397 struct nvme_command *, 398 struct nvme_completion *); 399 static void nvme_feature_iv_config(struct pci_nvme_softc *, 400 struct nvme_feature_obj *, 401 struct nvme_command *, 402 struct nvme_completion *); 403 404 static void *aen_thr(void *arg); 405 406 static __inline void 407 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 408 { 409 size_t len; 410 411 len = strnlen(src, dst_size); 412 memset(dst, pad, dst_size); 413 memcpy(dst, src, len); 414 } 415 416 static __inline void 417 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 418 { 419 420 *status &= ~NVME_STATUS_MASK; 421 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 422 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 423 } 424 425 static __inline void 426 pci_nvme_status_genc(uint16_t *status, uint16_t code) 427 { 428 429 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 430 } 431 432 /* 433 * Initialize the requested number or IO Submission and Completion Queues. 434 * Admin queues are allocated implicitly. 
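 * The queue arrays are allocated with one extra element because index 0 in
 * each array is reserved for the Admin queue.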
435 */ 436 static void 437 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 438 { 439 uint32_t i; 440 441 /* 442 * Allocate and initialize the Submission Queues 443 */ 444 if (nsq > NVME_QUEUES) { 445 WPRINTF("%s: clamping number of SQ from %u to %u", 446 __func__, nsq, NVME_QUEUES); 447 nsq = NVME_QUEUES; 448 } 449 450 sc->num_squeues = nsq; 451 452 sc->submit_queues = calloc(sc->num_squeues + 1, 453 sizeof(struct nvme_submission_queue)); 454 if (sc->submit_queues == NULL) { 455 WPRINTF("%s: SQ allocation failed", __func__); 456 sc->num_squeues = 0; 457 } else { 458 struct nvme_submission_queue *sq = sc->submit_queues; 459 460 for (i = 0; i < sc->num_squeues; i++) 461 pthread_mutex_init(&sq[i].mtx, NULL); 462 } 463 464 /* 465 * Allocate and initialize the Completion Queues 466 */ 467 if (ncq > NVME_QUEUES) { 468 WPRINTF("%s: clamping number of CQ from %u to %u", 469 __func__, ncq, NVME_QUEUES); 470 ncq = NVME_QUEUES; 471 } 472 473 sc->num_cqueues = ncq; 474 475 sc->compl_queues = calloc(sc->num_cqueues + 1, 476 sizeof(struct nvme_completion_queue)); 477 if (sc->compl_queues == NULL) { 478 WPRINTF("%s: CQ allocation failed", __func__); 479 sc->num_cqueues = 0; 480 } else { 481 struct nvme_completion_queue *cq = sc->compl_queues; 482 483 for (i = 0; i < sc->num_cqueues; i++) 484 pthread_mutex_init(&cq[i].mtx, NULL); 485 } 486 } 487 488 static void 489 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 490 { 491 struct nvme_controller_data *cd = &sc->ctrldata; 492 493 cd->vid = 0xFB5D; 494 cd->ssvid = 0x0000; 495 496 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 497 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 498 499 /* Num of submission commands that we can handle at a time (2^rab) */ 500 cd->rab = 4; 501 502 /* FreeBSD OUI */ 503 cd->ieee[0] = 0x58; 504 cd->ieee[1] = 0x9c; 505 cd->ieee[2] = 0xfc; 506 507 cd->mic = 0; 508 509 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 510 511 cd->ver = 0x00010300; 512 513 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 514 cd->acl = 2; 515 cd->aerl = 4; 516 517 /* Advertise 1, Read-only firmware slot */ 518 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 519 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 520 cd->lpa = 0; /* TODO: support some simple things like SMART */ 521 cd->elpe = 0; /* max error log page entries */ 522 cd->npss = 1; /* number of power states support */ 523 524 /* Warning Composite Temperature Threshold */ 525 cd->wctemp = 0x0157; 526 527 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 528 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 529 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 530 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 531 cd->nn = 1; /* number of namespaces */ 532 533 cd->oncs = 0; 534 switch (sc->dataset_management) { 535 case NVME_DATASET_MANAGEMENT_AUTO: 536 if (sc->nvstore.deallocate) 537 cd->oncs |= NVME_ONCS_DSM; 538 break; 539 case NVME_DATASET_MANAGEMENT_ENABLE: 540 cd->oncs |= NVME_ONCS_DSM; 541 break; 542 default: 543 break; 544 } 545 546 cd->fna = 0x03; 547 548 cd->power_state[0].mp = 10; 549 } 550 551 /* 552 * Calculate the CRC-16 of the given buffer 553 * See copyright attribution at top of file 554 */ 555 static uint16_t 556 crc16(uint16_t crc, const void *buffer, unsigned int len) 557 { 558 const unsigned char *cp = buffer; 559 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). 
*/ 560 static uint16_t const crc16_table[256] = { 561 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 562 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 563 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 564 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 565 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 566 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 567 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 568 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 569 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 570 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 571 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 572 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 573 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 574 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 575 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 576 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 577 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 578 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 579 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 580 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 581 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 582 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 583 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 584 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 585 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 586 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 587 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 588 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 589 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 590 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 591 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 592 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 593 }; 594 595 while (len--) 596 crc = (((crc >> 8) & 0xffU) ^ 597 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 598 return crc; 599 } 600 601 static void 602 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 603 struct nvme_namespace_data *nd) 604 { 605 606 /* Get capacity and block size information from backing store */ 607 nd->nsze = nvstore->size / nvstore->sectsz; 608 nd->ncap = nd->nsze; 609 nd->nuse = nd->nsze; 610 } 611 612 static void 613 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 614 struct nvme_namespace_data *nd, uint32_t nsid, 615 struct pci_nvme_blockstore *nvstore) 616 { 617 618 pci_nvme_init_nsdata_size(nvstore, nd); 619 620 if (nvstore->type == NVME_STOR_BLOCKIF) 621 nvstore->deallocate = blockif_candelete(nvstore->ctx); 622 623 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 
1 LBA Format) */ 624 nd->flbas = 0; 625 626 /* Create an EUI-64 if user did not provide one */ 627 if (nvstore->eui64 == 0) { 628 char *data = NULL; 629 uint64_t eui64 = nvstore->eui64; 630 631 asprintf(&data, "%s%u%u%u", get_config_value("name"), 632 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 633 sc->nsc_pi->pi_func); 634 635 if (data != NULL) { 636 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 637 free(data); 638 } 639 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 640 } 641 be64enc(nd->eui64, nvstore->eui64); 642 643 /* LBA data-sz = 2^lbads */ 644 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 645 } 646 647 static void 648 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 649 { 650 651 memset(&sc->err_log, 0, sizeof(sc->err_log)); 652 memset(&sc->health_log, 0, sizeof(sc->health_log)); 653 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 654 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 655 656 /* Set read/write remainder to round up according to spec */ 657 sc->read_dunits_remainder = 999; 658 sc->write_dunits_remainder = 999; 659 660 /* Set nominal Health values checked by implementations */ 661 sc->health_log.temperature = 310; 662 sc->health_log.available_spare = 100; 663 sc->health_log.available_spare_threshold = 10; 664 } 665 666 static void 667 pci_nvme_init_features(struct pci_nvme_softc *sc) 668 { 669 670 sc->feat[0].set = nvme_feature_invalid_cb; 671 sc->feat[0].get = nvme_feature_invalid_cb; 672 673 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 674 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 675 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 676 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 677 nvme_feature_iv_config; 678 /* Enable all AENs by default */ 679 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f; 680 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 681 nvme_feature_invalid_cb; 682 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 683 nvme_feature_invalid_cb; 684 } 685 686 static void 687 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 688 { 689 690 STAILQ_INIT(&sc->aer_list); 691 sc->aer_count = 0; 692 } 693 694 static void 695 pci_nvme_aer_init(struct pci_nvme_softc *sc) 696 { 697 698 pthread_mutex_init(&sc->aer_mtx, NULL); 699 pci_nvme_aer_reset(sc); 700 } 701 702 static void 703 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 704 { 705 struct pci_nvme_aer *aer = NULL; 706 707 pthread_mutex_lock(&sc->aer_mtx); 708 while (!STAILQ_EMPTY(&sc->aer_list)) { 709 aer = STAILQ_FIRST(&sc->aer_list); 710 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 711 free(aer); 712 } 713 pthread_mutex_unlock(&sc->aer_mtx); 714 715 pci_nvme_aer_reset(sc); 716 } 717 718 static bool 719 pci_nvme_aer_available(struct pci_nvme_softc *sc) 720 { 721 722 return (sc->aer_count != 0); 723 } 724 725 static bool 726 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 727 { 728 struct nvme_controller_data *cd = &sc->ctrldata; 729 730 /* AERL is a zero based value while aer_count is one's based */ 731 return (sc->aer_count == (cd->aerl + 1)); 732 } 733 734 /* 735 * Add an Async Event Request 736 * 737 * Stores an AER to be returned later if the Controller needs to notify the 738 * host of an event. 739 * Note that while the NVMe spec doesn't require Controllers to return AER's 740 * in order, this implementation does preserve the order. 
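 * (Order is preserved by keeping the requests on a STAILQ: pci_nvme_aer_add()
 * appends at the tail and pci_nvme_aer_get() removes from the head.)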
741 */ 742 static int 743 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 744 { 745 struct pci_nvme_aer *aer = NULL; 746 747 if (pci_nvme_aer_limit_reached(sc)) 748 return (-1); 749 750 aer = calloc(1, sizeof(struct pci_nvme_aer)); 751 if (aer == NULL) 752 return (-1); 753 754 /* Save the Command ID for use in the completion message */ 755 aer->cid = cid; 756 757 pthread_mutex_lock(&sc->aer_mtx); 758 sc->aer_count++; 759 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 760 pthread_mutex_unlock(&sc->aer_mtx); 761 762 return (0); 763 } 764 765 /* 766 * Get an Async Event Request structure 767 * 768 * Returns a pointer to an AER previously submitted by the host or NULL if 769 * no AER's exist. Caller is responsible for freeing the returned struct. 770 */ 771 static struct pci_nvme_aer * 772 pci_nvme_aer_get(struct pci_nvme_softc *sc) 773 { 774 struct pci_nvme_aer *aer = NULL; 775 776 pthread_mutex_lock(&sc->aer_mtx); 777 aer = STAILQ_FIRST(&sc->aer_list); 778 if (aer != NULL) { 779 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 780 sc->aer_count--; 781 } 782 pthread_mutex_unlock(&sc->aer_mtx); 783 784 return (aer); 785 } 786 787 static void 788 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 789 { 790 uint32_t atype; 791 792 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 793 794 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 795 sc->aen[atype].atype = atype; 796 } 797 } 798 799 static void 800 pci_nvme_aen_init(struct pci_nvme_softc *sc) 801 { 802 char nstr[80]; 803 804 pci_nvme_aen_reset(sc); 805 806 pthread_mutex_init(&sc->aen_mtx, NULL); 807 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 808 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 809 sc->nsc_pi->pi_func); 810 pthread_set_name_np(sc->aen_tid, nstr); 811 } 812 813 static void 814 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 815 { 816 817 pci_nvme_aen_reset(sc); 818 } 819 820 /* Notify the AEN thread of pending work */ 821 static void 822 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 823 { 824 825 pthread_cond_signal(&sc->aen_cond); 826 } 827 828 /* 829 * Post an Asynchronous Event Notification 830 */ 831 static int32_t 832 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 833 uint32_t event_data) 834 { 835 struct pci_nvme_aen *aen; 836 837 if (atype >= PCI_NVME_AE_TYPE_MAX) { 838 return(EINVAL); 839 } 840 841 pthread_mutex_lock(&sc->aen_mtx); 842 aen = &sc->aen[atype]; 843 844 /* Has the controller already posted an event of this type? */ 845 if (aen->posted) { 846 pthread_mutex_unlock(&sc->aen_mtx); 847 return(EALREADY); 848 } 849 850 aen->event_data = event_data; 851 aen->posted = true; 852 pthread_mutex_unlock(&sc->aen_mtx); 853 854 pci_nvme_aen_notify(sc); 855 856 return(0); 857 } 858 859 static void 860 pci_nvme_aen_process(struct pci_nvme_softc *sc) 861 { 862 struct pci_nvme_aer *aer; 863 struct pci_nvme_aen *aen; 864 pci_nvme_async_type atype; 865 uint32_t mask; 866 uint16_t status; 867 uint8_t lid; 868 869 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 870 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 871 aen = &sc->aen[atype]; 872 /* Previous iterations may have depleted the available AER's */ 873 if (!pci_nvme_aer_available(sc)) { 874 DPRINTF("%s: no AER", __func__); 875 break; 876 } 877 878 if (!aen->posted) { 879 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 880 continue; 881 } 882 883 status = NVME_SC_SUCCESS; 884 885 /* Is the event masked? 
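		 * The Async Event Configuration feature (cdw11) gates delivery:
		 * bits 7:0 enable SMART/health warnings and the bits above them
		 * enable Notice events, which is why the code below masks with
		 * 0xff or shifts right by 8.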
*/ 886 mask = 887 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 888 889 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 890 switch (atype) { 891 case PCI_NVME_AE_TYPE_ERROR: 892 lid = NVME_LOG_ERROR; 893 break; 894 case PCI_NVME_AE_TYPE_SMART: 895 mask &= 0xff; 896 if ((mask & aen->event_data) == 0) 897 continue; 898 lid = NVME_LOG_HEALTH_INFORMATION; 899 break; 900 case PCI_NVME_AE_TYPE_NOTICE: 901 if (aen->event_data >= PCI_NVME_AE_INFO_MAX) { 902 EPRINTLN("%s unknown AEN notice type %u", 903 __func__, aen->event_data); 904 status = NVME_SC_INTERNAL_DEVICE_ERROR; 905 break; 906 } 907 mask >>= 8; 908 if (((1 << aen->event_data) & mask) == 0) 909 continue; 910 switch (aen->event_data) { 911 case PCI_NVME_AE_INFO_NS_ATTR_CHANGED: 912 lid = NVME_LOG_CHANGED_NAMESPACE; 913 break; 914 case PCI_NVME_AE_INFO_FW_ACTIVATION: 915 lid = NVME_LOG_FIRMWARE_SLOT; 916 break; 917 case PCI_NVME_AE_INFO_TELEMETRY_CHANGE: 918 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 919 break; 920 case PCI_NVME_AE_INFO_ANA_CHANGE: 921 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling 922 break; 923 case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE: 924 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 925 break; 926 case PCI_NVME_AE_INFO_LBA_STATUS_ALERT: 927 lid = NVME_LOG_LBA_STATUS_INFORMATION; 928 break; 929 case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE: 930 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 931 break; 932 default: 933 lid = 0; 934 } 935 break; 936 default: 937 /* bad type?!? */ 938 EPRINTLN("%s unknown AEN type %u", __func__, atype); 939 status = NVME_SC_INTERNAL_DEVICE_ERROR; 940 break; 941 } 942 943 aer = pci_nvme_aer_get(sc); 944 assert(aer != NULL); 945 946 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 947 pci_nvme_cq_update(sc, &sc->compl_queues[0], 948 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 949 aer->cid, 950 0, /* SQID */ 951 status); 952 953 aen->event_data = 0; 954 aen->posted = false; 955 956 pci_generate_msix(sc->nsc_pi, 0); 957 } 958 } 959 960 static void * 961 aen_thr(void *arg) 962 { 963 struct pci_nvme_softc *sc; 964 965 sc = arg; 966 967 pthread_mutex_lock(&sc->aen_mtx); 968 for (;;) { 969 pci_nvme_aen_process(sc); 970 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 971 } 972 pthread_mutex_unlock(&sc->aen_mtx); 973 974 pthread_exit(NULL); 975 return (NULL); 976 } 977 978 static void 979 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 980 { 981 uint32_t i; 982 983 DPRINTF("%s", __func__); 984 985 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 986 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 987 (60 << NVME_CAP_LO_REG_TO_SHIFT); 988 989 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 990 991 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 992 993 sc->regs.cc = 0; 994 sc->regs.csts = 0; 995 996 assert(sc->submit_queues != NULL); 997 998 for (i = 0; i < sc->num_squeues + 1; i++) { 999 sc->submit_queues[i].qbase = NULL; 1000 sc->submit_queues[i].size = 0; 1001 sc->submit_queues[i].cqid = 0; 1002 sc->submit_queues[i].tail = 0; 1003 sc->submit_queues[i].head = 0; 1004 } 1005 1006 assert(sc->compl_queues != NULL); 1007 1008 for (i = 0; i < sc->num_cqueues + 1; i++) { 1009 sc->compl_queues[i].qbase = NULL; 1010 sc->compl_queues[i].size = 0; 1011 sc->compl_queues[i].tail = 0; 1012 sc->compl_queues[i].head = 0; 1013 } 1014 1015 sc->num_q_is_set = false; 1016 1017 pci_nvme_aer_destroy(sc); 1018 pci_nvme_aen_destroy(sc); 1019 } 1020 1021 static void 
1022 pci_nvme_reset(struct pci_nvme_softc *sc) 1023 { 1024 pthread_mutex_lock(&sc->mtx); 1025 pci_nvme_reset_locked(sc); 1026 pthread_mutex_unlock(&sc->mtx); 1027 } 1028 1029 static void 1030 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1031 { 1032 uint16_t acqs, asqs; 1033 1034 DPRINTF("%s", __func__); 1035 1036 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1037 sc->submit_queues[0].size = asqs; 1038 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1039 sizeof(struct nvme_command) * asqs); 1040 1041 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1042 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1043 1044 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1045 NVME_AQA_REG_ACQS_MASK) + 1; 1046 sc->compl_queues[0].size = acqs; 1047 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1048 sizeof(struct nvme_completion) * acqs); 1049 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1050 1051 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1052 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1053 } 1054 1055 static int 1056 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1057 size_t len, enum nvme_copy_dir dir) 1058 { 1059 uint8_t *p; 1060 size_t bytes; 1061 1062 if (len > (8 * 1024)) { 1063 return (-1); 1064 } 1065 1066 /* Copy from the start of prp1 to the end of the physical page */ 1067 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1068 bytes = MIN(bytes, len); 1069 1070 p = vm_map_gpa(ctx, prp1, bytes); 1071 if (p == NULL) { 1072 return (-1); 1073 } 1074 1075 if (dir == NVME_COPY_TO_PRP) 1076 memcpy(p, b, bytes); 1077 else 1078 memcpy(b, p, bytes); 1079 1080 b += bytes; 1081 1082 len -= bytes; 1083 if (len == 0) { 1084 return (0); 1085 } 1086 1087 len = MIN(len, PAGE_SIZE); 1088 1089 p = vm_map_gpa(ctx, prp2, len); 1090 if (p == NULL) { 1091 return (-1); 1092 } 1093 1094 if (dir == NVME_COPY_TO_PRP) 1095 memcpy(p, b, len); 1096 else 1097 memcpy(b, p, len); 1098 1099 return (0); 1100 } 1101 1102 /* 1103 * Write a Completion Queue Entry update 1104 * 1105 * Write the completion and update the doorbell value 1106 */ 1107 static void 1108 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1109 struct nvme_completion_queue *cq, 1110 uint32_t cdw0, 1111 uint16_t cid, 1112 uint16_t sqid, 1113 uint16_t status) 1114 { 1115 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1116 struct nvme_completion *cqe; 1117 1118 assert(cq->qbase != NULL); 1119 1120 pthread_mutex_lock(&cq->mtx); 1121 1122 cqe = &cq->qbase[cq->tail]; 1123 1124 /* Flip the phase bit */ 1125 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1126 1127 cqe->cdw0 = cdw0; 1128 cqe->sqhd = sq->head; 1129 cqe->sqid = sqid; 1130 cqe->cid = cid; 1131 cqe->status = status; 1132 1133 cq->tail++; 1134 if (cq->tail >= cq->size) { 1135 cq->tail = 0; 1136 } 1137 1138 pthread_mutex_unlock(&cq->mtx); 1139 } 1140 1141 static int 1142 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1143 struct nvme_completion* compl) 1144 { 1145 uint16_t qid = command->cdw10 & 0xffff; 1146 1147 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1148 if (qid == 0 || qid > sc->num_squeues || 1149 (sc->submit_queues[qid].qbase == NULL)) { 1150 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1151 __func__, qid, sc->num_squeues); 1152 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1153 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1154 return (1); 1155 } 1156 1157 sc->submit_queues[qid].qbase = NULL; 1158 
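	/*
	 * Clear the SQ to CQ mapping as well so that a later Delete I/O CQ
	 * does not see this submission queue as an active user of the CQ.
	 */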
sc->submit_queues[qid].cqid = 0; 1159 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1160 return (1); 1161 } 1162 1163 static int 1164 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1165 struct nvme_completion* compl) 1166 { 1167 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1168 uint16_t qid = command->cdw10 & 0xffff; 1169 struct nvme_submission_queue *nsq; 1170 1171 if ((qid == 0) || (qid > sc->num_squeues) || 1172 (sc->submit_queues[qid].qbase != NULL)) { 1173 WPRINTF("%s queue index %u > num_squeues %u", 1174 __func__, qid, sc->num_squeues); 1175 pci_nvme_status_tc(&compl->status, 1176 NVME_SCT_COMMAND_SPECIFIC, 1177 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1178 return (1); 1179 } 1180 1181 nsq = &sc->submit_queues[qid]; 1182 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1183 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1184 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1185 /* 1186 * Queues must specify at least two entries 1187 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1188 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1189 */ 1190 pci_nvme_status_tc(&compl->status, 1191 NVME_SCT_COMMAND_SPECIFIC, 1192 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1193 return (1); 1194 } 1195 nsq->head = nsq->tail = 0; 1196 1197 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1198 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1199 pci_nvme_status_tc(&compl->status, 1200 NVME_SCT_COMMAND_SPECIFIC, 1201 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1202 return (1); 1203 } 1204 1205 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1206 pci_nvme_status_tc(&compl->status, 1207 NVME_SCT_COMMAND_SPECIFIC, 1208 NVME_SC_COMPLETION_QUEUE_INVALID); 1209 return (1); 1210 } 1211 1212 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1213 1214 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1215 sizeof(struct nvme_command) * (size_t)nsq->size); 1216 1217 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1218 qid, nsq->size, nsq->qbase, nsq->cqid); 1219 1220 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1221 1222 DPRINTF("%s completed creating IOSQ qid %u", 1223 __func__, qid); 1224 } else { 1225 /* 1226 * Guest sent non-cont submission queue request. 1227 * This setting is unsupported by this emulation. 
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	DPRINTF("%s log page %u", __func__, logpage);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in
fields NUMDU 1341 * and NUMDL. This is a zero-based value. 1342 */ 1343 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1344 logsize *= sizeof(uint32_t); 1345 1346 switch (logpage) { 1347 case NVME_LOG_ERROR: 1348 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1349 command->prp2, (uint8_t *)&sc->err_log, 1350 MIN(logsize, sizeof(sc->err_log)), 1351 NVME_COPY_TO_PRP); 1352 break; 1353 case NVME_LOG_HEALTH_INFORMATION: 1354 pthread_mutex_lock(&sc->mtx); 1355 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1356 sizeof(sc->health_log.data_units_read)); 1357 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1358 sizeof(sc->health_log.data_units_written)); 1359 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1360 sizeof(sc->health_log.host_read_commands)); 1361 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1362 sizeof(sc->health_log.host_write_commands)); 1363 pthread_mutex_unlock(&sc->mtx); 1364 1365 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1366 command->prp2, (uint8_t *)&sc->health_log, 1367 MIN(logsize, sizeof(sc->health_log)), 1368 NVME_COPY_TO_PRP); 1369 break; 1370 case NVME_LOG_FIRMWARE_SLOT: 1371 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1372 command->prp2, (uint8_t *)&sc->fw_log, 1373 MIN(logsize, sizeof(sc->fw_log)), 1374 NVME_COPY_TO_PRP); 1375 break; 1376 case NVME_LOG_CHANGED_NAMESPACE: 1377 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1378 command->prp2, (uint8_t *)&sc->ns_log, 1379 MIN(logsize, sizeof(sc->ns_log)), 1380 NVME_COPY_TO_PRP); 1381 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1382 break; 1383 default: 1384 DPRINTF("%s get log page %x command not supported", 1385 __func__, logpage); 1386 1387 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1388 NVME_SC_INVALID_LOG_PAGE); 1389 } 1390 1391 return (1); 1392 } 1393 1394 static int 1395 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1396 struct nvme_completion* compl) 1397 { 1398 void *dest; 1399 uint16_t status; 1400 1401 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1402 command->cdw10 & 0xFF, command->nsid); 1403 1404 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1405 1406 switch (command->cdw10 & 0xFF) { 1407 case 0x00: /* return Identify Namespace data structure */ 1408 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1409 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1410 NVME_COPY_TO_PRP); 1411 break; 1412 case 0x01: /* return Identify Controller data structure */ 1413 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1414 command->prp2, (uint8_t *)&sc->ctrldata, 1415 sizeof(sc->ctrldata), 1416 NVME_COPY_TO_PRP); 1417 break; 1418 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1419 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1420 sizeof(uint32_t) * 1024); 1421 /* All unused entries shall be zero */ 1422 bzero(dest, sizeof(uint32_t) * 1024); 1423 ((uint32_t *)dest)[0] = 1; 1424 break; 1425 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1426 if (command->nsid != 1) { 1427 pci_nvme_status_genc(&status, 1428 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1429 break; 1430 } 1431 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1432 sizeof(uint32_t) * 1024); 1433 /* All bytes after the descriptor shall be zero */ 1434 bzero(dest, sizeof(uint32_t) * 1024); 1435 1436 /* Return NIDT=1 (i.e. 
EUI64) descriptor */ 1437 ((uint8_t *)dest)[0] = 1; 1438 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1439 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1440 break; 1441 default: 1442 DPRINTF("%s unsupported identify command requested 0x%x", 1443 __func__, command->cdw10 & 0xFF); 1444 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1445 break; 1446 } 1447 1448 compl->status = status; 1449 return (1); 1450 } 1451 1452 static const char * 1453 nvme_fid_to_name(uint8_t fid) 1454 { 1455 const char *name; 1456 1457 switch (fid) { 1458 case NVME_FEAT_ARBITRATION: 1459 name = "Arbitration"; 1460 break; 1461 case NVME_FEAT_POWER_MANAGEMENT: 1462 name = "Power Management"; 1463 break; 1464 case NVME_FEAT_LBA_RANGE_TYPE: 1465 name = "LBA Range Type"; 1466 break; 1467 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1468 name = "Temperature Threshold"; 1469 break; 1470 case NVME_FEAT_ERROR_RECOVERY: 1471 name = "Error Recovery"; 1472 break; 1473 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1474 name = "Volatile Write Cache"; 1475 break; 1476 case NVME_FEAT_NUMBER_OF_QUEUES: 1477 name = "Number of Queues"; 1478 break; 1479 case NVME_FEAT_INTERRUPT_COALESCING: 1480 name = "Interrupt Coalescing"; 1481 break; 1482 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1483 name = "Interrupt Vector Configuration"; 1484 break; 1485 case NVME_FEAT_WRITE_ATOMICITY: 1486 name = "Write Atomicity Normal"; 1487 break; 1488 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1489 name = "Asynchronous Event Configuration"; 1490 break; 1491 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1492 name = "Autonomous Power State Transition"; 1493 break; 1494 case NVME_FEAT_HOST_MEMORY_BUFFER: 1495 name = "Host Memory Buffer"; 1496 break; 1497 case NVME_FEAT_TIMESTAMP: 1498 name = "Timestamp"; 1499 break; 1500 case NVME_FEAT_KEEP_ALIVE_TIMER: 1501 name = "Keep Alive Timer"; 1502 break; 1503 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1504 name = "Host Controlled Thermal Management"; 1505 break; 1506 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1507 name = "Non-Operation Power State Config"; 1508 break; 1509 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1510 name = "Read Recovery Level Config"; 1511 break; 1512 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1513 name = "Predictable Latency Mode Config"; 1514 break; 1515 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1516 name = "Predictable Latency Mode Window"; 1517 break; 1518 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1519 name = "LBA Status Information Report Interval"; 1520 break; 1521 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1522 name = "Host Behavior Support"; 1523 break; 1524 case NVME_FEAT_SANITIZE_CONFIG: 1525 name = "Sanitize Config"; 1526 break; 1527 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1528 name = "Endurance Group Event Configuration"; 1529 break; 1530 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1531 name = "Software Progress Marker"; 1532 break; 1533 case NVME_FEAT_HOST_IDENTIFIER: 1534 name = "Host Identifier"; 1535 break; 1536 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1537 name = "Reservation Notification Mask"; 1538 break; 1539 case NVME_FEAT_RESERVATION_PERSISTENCE: 1540 name = "Reservation Persistence"; 1541 break; 1542 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1543 name = "Namespace Write Protection Config"; 1544 break; 1545 default: 1546 name = "Unknown"; 1547 break; 1548 } 1549 1550 return (name); 1551 } 1552 1553 static void 1554 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1555 struct nvme_feature_obj *feat, 1556 struct 
nvme_command *command, 1557 struct nvme_completion *compl) 1558 { 1559 1560 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1561 } 1562 1563 static void 1564 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1565 struct nvme_feature_obj *feat, 1566 struct nvme_command *command, 1567 struct nvme_completion *compl) 1568 { 1569 uint32_t i; 1570 uint32_t cdw11 = command->cdw11; 1571 uint16_t iv; 1572 bool cd; 1573 1574 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1575 1576 iv = cdw11 & 0xffff; 1577 cd = cdw11 & (1 << 16); 1578 1579 if (iv > (sc->max_queues + 1)) { 1580 return; 1581 } 1582 1583 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1584 if ((iv == 0) && !cd) 1585 return; 1586 1587 /* Requested Interrupt Vector must be used by a CQ */ 1588 for (i = 0; i < sc->num_cqueues + 1; i++) { 1589 if (sc->compl_queues[i].intr_vec == iv) { 1590 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1591 } 1592 } 1593 1594 } 1595 1596 static void 1597 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1598 struct nvme_feature_obj *feat, 1599 struct nvme_command *command, 1600 struct nvme_completion *compl) 1601 { 1602 uint16_t nqr; /* Number of Queues Requested */ 1603 1604 if (sc->num_q_is_set) { 1605 WPRINTF("%s: Number of Queues already set", __func__); 1606 pci_nvme_status_genc(&compl->status, 1607 NVME_SC_COMMAND_SEQUENCE_ERROR); 1608 return; 1609 } 1610 1611 nqr = command->cdw11 & 0xFFFF; 1612 if (nqr == 0xffff) { 1613 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1614 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1615 return; 1616 } 1617 1618 sc->num_squeues = ONE_BASED(nqr); 1619 if (sc->num_squeues > sc->max_queues) { 1620 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1621 sc->max_queues); 1622 sc->num_squeues = sc->max_queues; 1623 } 1624 1625 nqr = (command->cdw11 >> 16) & 0xFFFF; 1626 if (nqr == 0xffff) { 1627 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1628 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1629 return; 1630 } 1631 1632 sc->num_cqueues = ONE_BASED(nqr); 1633 if (sc->num_cqueues > sc->max_queues) { 1634 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1635 sc->max_queues); 1636 sc->num_cqueues = sc->max_queues; 1637 } 1638 1639 /* Patch the command value which will be saved on callback's return */ 1640 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1641 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1642 1643 sc->num_q_is_set = true; 1644 } 1645 1646 static int 1647 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1648 struct nvme_completion *compl) 1649 { 1650 struct nvme_feature_obj *feat; 1651 uint32_t nsid = command->nsid; 1652 uint8_t fid = command->cdw10 & 0xFF; 1653 1654 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1655 1656 if (fid >= NVME_FID_MAX) { 1657 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1658 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1659 return (1); 1660 } 1661 feat = &sc->feat[fid]; 1662 1663 if (!feat->namespace_specific && 1664 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1665 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1666 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1667 return (1); 1668 } 1669 1670 compl->cdw0 = 0; 1671 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1672 1673 if (feat->set) 1674 feat->set(sc, feat, command, compl); 1675 1676 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 
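	/*
	 * On success the new cdw11 value is cached in the feature object;
	 * nvme_opc_get_features() later returns that cached value in
	 * Completion Dword 0.
	 */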
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
	    sc->aer_count, sc->ctrldata.aerl, command->cid);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return (1);
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return (1);
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;
	pci_nvme_aen_notify(sc);

	return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status,
			    NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit covers 1 - 1,000 512 byte blocks and 3 data units cover
 * 2,001 - 3,000 512 byte blocks. Rounding up is achieved by initializing the
 * remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
1980 */ 1981 static bool 1982 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 1983 uint32_t nlb) 1984 { 1985 size_t offset, bytes; 1986 1987 /* Overflow check of multiplying Starting LBA by the sector size */ 1988 if (slba >> (64 - nvstore->sectsz_bits)) 1989 return (true); 1990 1991 offset = slba << nvstore->sectsz_bits; 1992 bytes = nlb << nvstore->sectsz_bits; 1993 1994 /* Overflow check of Number of Logical Blocks */ 1995 if ((nvstore->size - offset) < bytes) 1996 return (true); 1997 1998 return (false); 1999 } 2000 2001 static int 2002 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2003 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 2004 { 2005 int iovidx; 2006 2007 if (req == NULL) 2008 return (-1); 2009 2010 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2011 return (-1); 2012 } 2013 2014 /* concatenate contig block-iovs to minimize number of iovs */ 2015 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 2016 iovidx = req->io_req.br_iovcnt - 1; 2017 2018 req->io_req.br_iov[iovidx].iov_base = 2019 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2020 req->prev_gpaddr, size); 2021 2022 req->prev_size += size; 2023 req->io_req.br_resid += size; 2024 2025 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2026 } else { 2027 iovidx = req->io_req.br_iovcnt; 2028 if (iovidx == 0) { 2029 req->io_req.br_offset = lba; 2030 req->io_req.br_resid = 0; 2031 req->io_req.br_param = req; 2032 } 2033 2034 req->io_req.br_iov[iovidx].iov_base = 2035 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2036 gpaddr, size); 2037 2038 req->io_req.br_iov[iovidx].iov_len = size; 2039 2040 req->prev_gpaddr = gpaddr; 2041 req->prev_size = size; 2042 req->io_req.br_resid += size; 2043 2044 req->io_req.br_iovcnt++; 2045 } 2046 2047 return (0); 2048 } 2049 2050 static void 2051 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2052 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2053 uint32_t cdw0, uint16_t status) 2054 { 2055 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2056 2057 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2058 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2059 NVME_STATUS_GET_SC(status)); 2060 2061 pci_nvme_cq_update(sc, cq, 2062 0, /* CDW0 */ 2063 cid, 2064 sqid, 2065 status); 2066 2067 if (cq->head != cq->tail) { 2068 if (cq->intr_en & NVME_CQ_INTEN) { 2069 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2070 } else { 2071 DPRINTF("%s: CQ%u interrupt disabled", 2072 __func__, sq->cqid); 2073 } 2074 } 2075 } 2076 2077 static void 2078 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2079 { 2080 req->sc = NULL; 2081 req->nvme_sq = NULL; 2082 req->sqid = 0; 2083 2084 pthread_mutex_lock(&sc->mtx); 2085 2086 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2087 sc->pending_ios--; 2088 2089 /* when no more IO pending, can set to ready if device reset/enabled */ 2090 if (sc->pending_ios == 0 && 2091 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2092 sc->regs.csts |= NVME_CSTS_RDY; 2093 2094 pthread_mutex_unlock(&sc->mtx); 2095 2096 sem_post(&sc->iosemlock); 2097 } 2098 2099 static struct pci_nvme_ioreq * 2100 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2101 { 2102 struct pci_nvme_ioreq *req = NULL; 2103 2104 sem_wait(&sc->iosemlock); 2105 pthread_mutex_lock(&sc->mtx); 2106 2107 req = STAILQ_FIRST(&sc->ioreqs_free); 2108 assert(req != NULL); 2109 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2110 2111 req->sc = sc; 2112 2113 
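	/*
	 * Note: pending_ios, together with the CC.EN/CSTS.RDY checks in
	 * pci_nvme_release_ioreq(), is what delays reporting the controller
	 * as ready again until all outstanding I/O has drained.
	 */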
sc->pending_ios++; 2114 2115 pthread_mutex_unlock(&sc->mtx); 2116 2117 req->io_req.br_iovcnt = 0; 2118 req->io_req.br_offset = 0; 2119 req->io_req.br_resid = 0; 2120 req->io_req.br_param = req; 2121 req->prev_gpaddr = 0; 2122 req->prev_size = 0; 2123 2124 return req; 2125 } 2126 2127 static void 2128 pci_nvme_io_done(struct blockif_req *br, int err) 2129 { 2130 struct pci_nvme_ioreq *req = br->br_param; 2131 struct nvme_submission_queue *sq = req->nvme_sq; 2132 uint16_t code, status; 2133 2134 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2135 2136 /* TODO return correct error */ 2137 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2138 pci_nvme_status_genc(&status, code); 2139 2140 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 2141 pci_nvme_stats_write_read_update(req->sc, req->opc, 2142 req->bytes, status); 2143 pci_nvme_release_ioreq(req->sc, req); 2144 } 2145 2146 /* 2147 * Implements the Flush command. The specification states: 2148 * If a volatile write cache is not present, Flush commands complete 2149 * successfully and have no effect 2150 * in the description of the Volatile Write Cache (VWC) field of the Identify 2151 * Controller data. Therefore, set status to Success if the command is 2152 * not supported (i.e. RAM or as indicated by the blockif). 2153 */ 2154 static bool 2155 nvme_opc_flush(struct pci_nvme_softc *sc, 2156 struct nvme_command *cmd, 2157 struct pci_nvme_blockstore *nvstore, 2158 struct pci_nvme_ioreq *req, 2159 uint16_t *status) 2160 { 2161 bool pending = false; 2162 2163 if (nvstore->type == NVME_STOR_RAM) { 2164 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2165 } else { 2166 int err; 2167 2168 req->io_req.br_callback = pci_nvme_io_done; 2169 2170 err = blockif_flush(nvstore->ctx, &req->io_req); 2171 switch (err) { 2172 case 0: 2173 pending = true; 2174 break; 2175 case EOPNOTSUPP: 2176 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2177 break; 2178 default: 2179 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2180 } 2181 } 2182 2183 return (pending); 2184 } 2185 2186 static uint16_t 2187 nvme_write_read_ram(struct pci_nvme_softc *sc, 2188 struct pci_nvme_blockstore *nvstore, 2189 uint64_t prp1, uint64_t prp2, 2190 size_t offset, uint64_t bytes, 2191 bool is_write) 2192 { 2193 uint8_t *buf = nvstore->ctx; 2194 enum nvme_copy_dir dir; 2195 uint16_t status; 2196 2197 if (is_write) 2198 dir = NVME_COPY_TO_PRP; 2199 else 2200 dir = NVME_COPY_FROM_PRP; 2201 2202 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2203 buf + offset, bytes, dir)) 2204 pci_nvme_status_genc(&status, 2205 NVME_SC_DATA_TRANSFER_ERROR); 2206 else 2207 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2208 2209 return (status); 2210 } 2211 2212 static uint16_t 2213 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2214 struct pci_nvme_blockstore *nvstore, 2215 struct pci_nvme_ioreq *req, 2216 uint64_t prp1, uint64_t prp2, 2217 size_t offset, uint64_t bytes, 2218 bool is_write) 2219 { 2220 uint64_t size; 2221 int err; 2222 uint16_t status = NVME_NO_STATUS; 2223 2224 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2225 if (pci_nvme_append_iov_req(sc, req, prp1, 2226 size, is_write, offset)) { 2227 pci_nvme_status_genc(&status, 2228 NVME_SC_DATA_TRANSFER_ERROR); 2229 goto out; 2230 } 2231 2232 offset += size; 2233 bytes -= size; 2234 2235 if (bytes == 0) { 2236 ; 2237 } else if (bytes <= PAGE_SIZE) { 2238 size = bytes; 2239 if (pci_nvme_append_iov_req(sc, req, prp2, 2240 size, is_write, offset)) { 2241 
pci_nvme_status_genc(&status, 2242 NVME_SC_DATA_TRANSFER_ERROR); 2243 goto out; 2244 } 2245 } else { 2246 void *vmctx = sc->nsc_pi->pi_vmctx; 2247 uint64_t *prp_list = &prp2; 2248 uint64_t *last = prp_list; 2249 2250 /* PRP2 is pointer to a physical region page list */ 2251 while (bytes) { 2252 /* Last entry in list points to the next list */ 2253 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2254 uint64_t prp = *prp_list; 2255 2256 prp_list = paddr_guest2host(vmctx, prp, 2257 PAGE_SIZE - (prp % PAGE_SIZE)); 2258 last = prp_list + (NVME_PRP2_ITEMS - 1); 2259 } 2260 2261 size = MIN(bytes, PAGE_SIZE); 2262 2263 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2264 size, is_write, offset)) { 2265 pci_nvme_status_genc(&status, 2266 NVME_SC_DATA_TRANSFER_ERROR); 2267 goto out; 2268 } 2269 2270 offset += size; 2271 bytes -= size; 2272 2273 prp_list++; 2274 } 2275 } 2276 req->io_req.br_callback = pci_nvme_io_done; 2277 if (is_write) 2278 err = blockif_write(nvstore->ctx, &req->io_req); 2279 else 2280 err = blockif_read(nvstore->ctx, &req->io_req); 2281 2282 if (err) 2283 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2284 out: 2285 return (status); 2286 } 2287 2288 static bool 2289 nvme_opc_write_read(struct pci_nvme_softc *sc, 2290 struct nvme_command *cmd, 2291 struct pci_nvme_blockstore *nvstore, 2292 struct pci_nvme_ioreq *req, 2293 uint16_t *status) 2294 { 2295 uint64_t lba, nblocks, bytes; 2296 size_t offset; 2297 bool is_write = cmd->opc == NVME_OPC_WRITE; 2298 bool pending = false; 2299 2300 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2301 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2302 2303 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2304 WPRINTF("%s command would exceed LBA range", __func__); 2305 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2306 goto out; 2307 } 2308 2309 bytes = nblocks << nvstore->sectsz_bits; 2310 if (bytes > NVME_MAX_DATA_SIZE) { 2311 WPRINTF("%s command would exceed MDTS", __func__); 2312 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2313 goto out; 2314 } 2315 2316 offset = lba << nvstore->sectsz_bits; 2317 2318 req->bytes = bytes; 2319 req->io_req.br_offset = lba; 2320 2321 /* PRP bits 1:0 must be zero */ 2322 cmd->prp1 &= ~0x3UL; 2323 cmd->prp2 &= ~0x3UL; 2324 2325 if (nvstore->type == NVME_STOR_RAM) { 2326 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2327 cmd->prp2, offset, bytes, is_write); 2328 } else { 2329 *status = nvme_write_read_blockif(sc, nvstore, req, 2330 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2331 2332 if (*status == NVME_NO_STATUS) 2333 pending = true; 2334 } 2335 out: 2336 if (!pending) 2337 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2338 2339 return (pending); 2340 } 2341 2342 static void 2343 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2344 { 2345 struct pci_nvme_ioreq *req = br->br_param; 2346 struct pci_nvme_softc *sc = req->sc; 2347 bool done = true; 2348 uint16_t status; 2349 2350 if (err) { 2351 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2352 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2353 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2354 } else { 2355 struct iovec *iov = req->io_req.br_iov; 2356 2357 req->prev_gpaddr++; 2358 iov += req->prev_gpaddr; 2359 2360 /* The iov_* values already include the sector size */ 2361 req->io_req.br_offset = (off_t)iov->iov_base; 2362 req->io_req.br_resid = iov->iov_len; 2363 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2364 pci_nvme_status_genc(&status, 2365 
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}

static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;	/* NULL so early exits can free() safely */
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
2448 * 2449 * Note that NVMe Number of Ranges is a zero based value 2450 */ 2451 req->io_req.br_iovcnt = 0; 2452 req->io_req.br_offset = offset; 2453 req->io_req.br_resid = bytes; 2454 2455 if (nr == 0) { 2456 req->io_req.br_callback = pci_nvme_io_done; 2457 } else { 2458 struct iovec *iov = req->io_req.br_iov; 2459 2460 for (r = 0, dr = 0; r <= nr; r++) { 2461 offset = range[r].starting_lba << sectsz_bits; 2462 bytes = range[r].length << sectsz_bits; 2463 if (bytes == 0) 2464 continue; 2465 2466 if ((nvstore->size - offset) < bytes) { 2467 pci_nvme_status_genc(status, 2468 NVME_SC_LBA_OUT_OF_RANGE); 2469 goto out; 2470 } 2471 iov[dr].iov_base = (void *)offset; 2472 iov[dr].iov_len = bytes; 2473 dr++; 2474 } 2475 req->io_req.br_callback = pci_nvme_dealloc_sm; 2476 2477 /* 2478 * Use prev_gpaddr to track the current entry and 2479 * prev_size to track the number of entries 2480 */ 2481 req->prev_gpaddr = 0; 2482 req->prev_size = dr; 2483 } 2484 2485 err = blockif_delete(nvstore->ctx, &req->io_req); 2486 if (err) 2487 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2488 else 2489 pending = true; 2490 } 2491 out: 2492 free(range); 2493 return (pending); 2494 } 2495 2496 static void 2497 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2498 { 2499 struct nvme_submission_queue *sq; 2500 uint16_t status; 2501 uint16_t sqhead; 2502 2503 /* handle all submissions up to sq->tail index */ 2504 sq = &sc->submit_queues[idx]; 2505 2506 pthread_mutex_lock(&sq->mtx); 2507 2508 sqhead = sq->head; 2509 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2510 idx, sqhead, sq->tail, sq->qbase); 2511 2512 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2513 struct nvme_command *cmd; 2514 struct pci_nvme_ioreq *req; 2515 uint32_t nsid; 2516 bool pending; 2517 2518 pending = false; 2519 req = NULL; 2520 status = 0; 2521 2522 cmd = &sq->qbase[sqhead]; 2523 sqhead = (sqhead + 1) % sq->size; 2524 2525 nsid = le32toh(cmd->nsid); 2526 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2527 pci_nvme_status_genc(&status, 2528 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2529 status |= 2530 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2531 goto complete; 2532 } 2533 2534 req = pci_nvme_get_ioreq(sc); 2535 if (req == NULL) { 2536 pci_nvme_status_genc(&status, 2537 NVME_SC_INTERNAL_DEVICE_ERROR); 2538 WPRINTF("%s: unable to allocate IO req", __func__); 2539 goto complete; 2540 } 2541 req->nvme_sq = sq; 2542 req->sqid = idx; 2543 req->opc = cmd->opc; 2544 req->cid = cmd->cid; 2545 req->nsid = cmd->nsid; 2546 2547 switch (cmd->opc) { 2548 case NVME_OPC_FLUSH: 2549 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2550 req, &status); 2551 break; 2552 case NVME_OPC_WRITE: 2553 case NVME_OPC_READ: 2554 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2555 req, &status); 2556 break; 2557 case NVME_OPC_WRITE_ZEROES: 2558 /* TODO: write zeroes 2559 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2560 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2561 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2562 break; 2563 case NVME_OPC_DATASET_MANAGEMENT: 2564 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2565 req, &status); 2566 break; 2567 default: 2568 WPRINTF("%s unhandled io command 0x%x", 2569 __func__, cmd->opc); 2570 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2571 } 2572 complete: 2573 if (!pending) { 2574 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2575 status); 2576 if (req != NULL) 2577 pci_nvme_release_ioreq(sc, req); 2578 } 2579 } 2580 2581 sq->head = sqhead; 2582 2583 
pthread_mutex_unlock(&sq->mtx); 2584 } 2585 2586 static void 2587 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2588 uint64_t idx, int is_sq, uint64_t value) 2589 { 2590 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2591 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2592 2593 if (is_sq) { 2594 if (idx > sc->num_squeues) { 2595 WPRINTF("%s queue index %lu overflow from " 2596 "guest (max %u)", 2597 __func__, idx, sc->num_squeues); 2598 return; 2599 } 2600 2601 atomic_store_short(&sc->submit_queues[idx].tail, 2602 (uint16_t)value); 2603 2604 if (idx == 0) { 2605 pci_nvme_handle_admin_cmd(sc, value); 2606 } else { 2607 /* submission queue; handle new entries in SQ */ 2608 if (idx > sc->num_squeues) { 2609 WPRINTF("%s SQ index %lu overflow from " 2610 "guest (max %u)", 2611 __func__, idx, sc->num_squeues); 2612 return; 2613 } 2614 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2615 } 2616 } else { 2617 if (idx > sc->num_cqueues) { 2618 WPRINTF("%s queue index %lu overflow from " 2619 "guest (max %u)", 2620 __func__, idx, sc->num_cqueues); 2621 return; 2622 } 2623 2624 atomic_store_short(&sc->compl_queues[idx].head, 2625 (uint16_t)value); 2626 } 2627 } 2628 2629 static void 2630 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2631 { 2632 const char *s = iswrite ? "WRITE" : "READ"; 2633 2634 switch (offset) { 2635 case NVME_CR_CAP_LOW: 2636 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2637 break; 2638 case NVME_CR_CAP_HI: 2639 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2640 break; 2641 case NVME_CR_VS: 2642 DPRINTF("%s %s NVME_CR_VS", func, s); 2643 break; 2644 case NVME_CR_INTMS: 2645 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2646 break; 2647 case NVME_CR_INTMC: 2648 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2649 break; 2650 case NVME_CR_CC: 2651 DPRINTF("%s %s NVME_CR_CC", func, s); 2652 break; 2653 case NVME_CR_CSTS: 2654 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2655 break; 2656 case NVME_CR_NSSR: 2657 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2658 break; 2659 case NVME_CR_AQA: 2660 DPRINTF("%s %s NVME_CR_AQA", func, s); 2661 break; 2662 case NVME_CR_ASQ_LOW: 2663 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2664 break; 2665 case NVME_CR_ASQ_HI: 2666 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2667 break; 2668 case NVME_CR_ACQ_LOW: 2669 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2670 break; 2671 case NVME_CR_ACQ_HI: 2672 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2673 break; 2674 default: 2675 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2676 } 2677 2678 } 2679 2680 static void 2681 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2682 uint64_t offset, int size, uint64_t value) 2683 { 2684 uint32_t ccreg; 2685 2686 if (offset >= NVME_DOORBELL_OFFSET) { 2687 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2688 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2689 int is_sq = (belloffset % 8) < 4; 2690 2691 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2692 WPRINTF("guest attempted an overflow write offset " 2693 "0x%lx, val 0x%lx in %s", 2694 offset, value, __func__); 2695 return; 2696 } 2697 2698 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2699 return; 2700 } 2701 2702 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2703 offset, size, value); 2704 2705 if (size != 4) { 2706 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2707 "val 0x%lx) to bar0 in %s", 2708 size, offset, value, __func__); 2709 /* TODO: shutdown device */ 2710 return; 2711 } 2712 2713 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 
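	/*
	 * Writes below the doorbell area target the emulated controller
	 * registers; serialize them against register reads and the reset
	 * path with the softc mutex.
	 */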
2714 2715 pthread_mutex_lock(&sc->mtx); 2716 2717 switch (offset) { 2718 case NVME_CR_CAP_LOW: 2719 case NVME_CR_CAP_HI: 2720 /* readonly */ 2721 break; 2722 case NVME_CR_VS: 2723 /* readonly */ 2724 break; 2725 case NVME_CR_INTMS: 2726 /* MSI-X, so ignore */ 2727 break; 2728 case NVME_CR_INTMC: 2729 /* MSI-X, so ignore */ 2730 break; 2731 case NVME_CR_CC: 2732 ccreg = (uint32_t)value; 2733 2734 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2735 "iocqes %u", 2736 __func__, 2737 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2738 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2739 NVME_CC_GET_IOCQES(ccreg)); 2740 2741 if (NVME_CC_GET_SHN(ccreg)) { 2742 /* perform shutdown - flush out data to backend */ 2743 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2744 NVME_CSTS_REG_SHST_SHIFT); 2745 sc->regs.csts |= NVME_SHST_COMPLETE << 2746 NVME_CSTS_REG_SHST_SHIFT; 2747 } 2748 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2749 if (NVME_CC_GET_EN(ccreg) == 0) 2750 /* transition 1-> causes controller reset */ 2751 pci_nvme_reset_locked(sc); 2752 else 2753 pci_nvme_init_controller(ctx, sc); 2754 } 2755 2756 /* Insert the iocqes, iosqes and en bits from the write */ 2757 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2758 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2759 if (NVME_CC_GET_EN(ccreg) == 0) { 2760 /* Insert the ams, mps and css bit fields */ 2761 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2762 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2763 sc->regs.csts &= ~NVME_CSTS_RDY; 2764 } else if (sc->pending_ios == 0) { 2765 sc->regs.csts |= NVME_CSTS_RDY; 2766 } 2767 break; 2768 case NVME_CR_CSTS: 2769 break; 2770 case NVME_CR_NSSR: 2771 /* ignore writes; don't support subsystem reset */ 2772 break; 2773 case NVME_CR_AQA: 2774 sc->regs.aqa = (uint32_t)value; 2775 break; 2776 case NVME_CR_ASQ_LOW: 2777 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2778 (0xFFFFF000 & value); 2779 break; 2780 case NVME_CR_ASQ_HI: 2781 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2782 (value << 32); 2783 break; 2784 case NVME_CR_ACQ_LOW: 2785 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2786 (0xFFFFF000 & value); 2787 break; 2788 case NVME_CR_ACQ_HI: 2789 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2790 (value << 32); 2791 break; 2792 default: 2793 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2794 __func__, offset, value, size); 2795 } 2796 pthread_mutex_unlock(&sc->mtx); 2797 } 2798 2799 static void 2800 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2801 int baridx, uint64_t offset, int size, uint64_t value) 2802 { 2803 struct pci_nvme_softc* sc = pi->pi_arg; 2804 2805 if (baridx == pci_msix_table_bar(pi) || 2806 baridx == pci_msix_pba_bar(pi)) { 2807 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2808 " value 0x%lx", baridx, offset, size, value); 2809 2810 pci_emul_msix_twrite(pi, offset, size, value); 2811 return; 2812 } 2813 2814 switch (baridx) { 2815 case 0: 2816 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2817 break; 2818 2819 default: 2820 DPRINTF("%s unknown baridx %d, val 0x%lx", 2821 __func__, baridx, value); 2822 } 2823 } 2824 2825 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2826 uint64_t offset, int size) 2827 { 2828 uint64_t value; 2829 2830 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2831 2832 if (offset < NVME_DOORBELL_OFFSET) { 2833 void *p = &(sc->regs); 2834 pthread_mutex_lock(&sc->mtx); 2835 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2836 
pthread_mutex_unlock(&sc->mtx); 2837 } else { 2838 value = 0; 2839 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2840 } 2841 2842 switch (size) { 2843 case 1: 2844 value &= 0xFF; 2845 break; 2846 case 2: 2847 value &= 0xFFFF; 2848 break; 2849 case 4: 2850 value &= 0xFFFFFFFF; 2851 break; 2852 } 2853 2854 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2855 offset, size, (uint32_t)value); 2856 2857 return (value); 2858 } 2859 2860 2861 2862 static uint64_t 2863 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2864 uint64_t offset, int size) 2865 { 2866 struct pci_nvme_softc* sc = pi->pi_arg; 2867 2868 if (baridx == pci_msix_table_bar(pi) || 2869 baridx == pci_msix_pba_bar(pi)) { 2870 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2871 baridx, offset, size); 2872 2873 return pci_emul_msix_tread(pi, offset, size); 2874 } 2875 2876 switch (baridx) { 2877 case 0: 2878 return pci_nvme_read_bar_0(sc, offset, size); 2879 2880 default: 2881 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2882 } 2883 2884 return (0); 2885 } 2886 2887 static int 2888 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 2889 { 2890 char bident[sizeof("XX:X:X")]; 2891 const char *value; 2892 uint32_t sectsz; 2893 2894 sc->max_queues = NVME_QUEUES; 2895 sc->max_qentries = NVME_MAX_QENTRIES; 2896 sc->ioslots = NVME_IOSLOTS; 2897 sc->num_squeues = sc->max_queues; 2898 sc->num_cqueues = sc->max_queues; 2899 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2900 sectsz = 0; 2901 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2902 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2903 2904 value = get_config_value_node(nvl, "maxq"); 2905 if (value != NULL) 2906 sc->max_queues = atoi(value); 2907 value = get_config_value_node(nvl, "qsz"); 2908 if (value != NULL) { 2909 sc->max_qentries = atoi(value); 2910 if (sc->max_qentries <= 0) { 2911 EPRINTLN("nvme: Invalid qsz option %d", 2912 sc->max_qentries); 2913 return (-1); 2914 } 2915 } 2916 value = get_config_value_node(nvl, "ioslots"); 2917 if (value != NULL) { 2918 sc->ioslots = atoi(value); 2919 if (sc->ioslots <= 0) { 2920 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 2921 return (-1); 2922 } 2923 } 2924 value = get_config_value_node(nvl, "sectsz"); 2925 if (value != NULL) 2926 sectsz = atoi(value); 2927 value = get_config_value_node(nvl, "ser"); 2928 if (value != NULL) { 2929 /* 2930 * This field indicates the Product Serial Number in 2931 * 7-bit ASCII, unused bytes should be space characters. 2932 * Ref: NVMe v1.3c. 
2933 */ 2934 cpywithpad((char *)sc->ctrldata.sn, 2935 sizeof(sc->ctrldata.sn), value, ' '); 2936 } 2937 value = get_config_value_node(nvl, "eui64"); 2938 if (value != NULL) 2939 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 2940 value = get_config_value_node(nvl, "dsm"); 2941 if (value != NULL) { 2942 if (strcmp(value, "auto") == 0) 2943 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2944 else if (strcmp(value, "enable") == 0) 2945 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2946 else if (strcmp(value, "disable") == 0) 2947 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2948 } 2949 2950 value = get_config_value_node(nvl, "ram"); 2951 if (value != NULL) { 2952 uint64_t sz = strtoull(value, NULL, 10); 2953 2954 sc->nvstore.type = NVME_STOR_RAM; 2955 sc->nvstore.size = sz * 1024 * 1024; 2956 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2957 sc->nvstore.sectsz = 4096; 2958 sc->nvstore.sectsz_bits = 12; 2959 if (sc->nvstore.ctx == NULL) { 2960 EPRINTLN("nvme: Unable to allocate RAM"); 2961 return (-1); 2962 } 2963 } else { 2964 snprintf(bident, sizeof(bident), "%d:%d", 2965 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2966 sc->nvstore.ctx = blockif_open(nvl, bident); 2967 if (sc->nvstore.ctx == NULL) { 2968 EPRINTLN("nvme: Could not open backing file: %s", 2969 strerror(errno)); 2970 return (-1); 2971 } 2972 sc->nvstore.type = NVME_STOR_BLOCKIF; 2973 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2974 } 2975 2976 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2977 sc->nvstore.sectsz = sectsz; 2978 else if (sc->nvstore.type != NVME_STOR_RAM) 2979 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2980 for (sc->nvstore.sectsz_bits = 9; 2981 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2982 sc->nvstore.sectsz_bits++); 2983 2984 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 2985 sc->max_queues = NVME_QUEUES; 2986 2987 return (0); 2988 } 2989 2990 static void 2991 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 2992 { 2993 struct pci_nvme_softc *sc; 2994 struct pci_nvme_blockstore *nvstore; 2995 struct nvme_namespace_data *nd; 2996 2997 sc = arg; 2998 nvstore = &sc->nvstore; 2999 nd = &sc->nsdata; 3000 3001 nvstore->size = new_size; 3002 pci_nvme_init_nsdata_size(nvstore, nd); 3003 3004 /* Add changed NSID to list */ 3005 sc->ns_log.ns[0] = 1; 3006 sc->ns_log.ns[1] = 0; 3007 3008 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3009 PCI_NVME_AE_INFO_NS_ATTR_CHANGED); 3010 } 3011 3012 static int 3013 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3014 { 3015 struct pci_nvme_softc *sc; 3016 uint32_t pci_membar_sz; 3017 int error; 3018 3019 error = 0; 3020 3021 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3022 pi->pi_arg = sc; 3023 sc->nsc_pi = pi; 3024 3025 error = pci_nvme_parse_config(sc, nvl); 3026 if (error < 0) 3027 goto done; 3028 else 3029 error = 0; 3030 3031 STAILQ_INIT(&sc->ioreqs_free); 3032 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3033 for (int i = 0; i < sc->ioslots; i++) { 3034 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3035 } 3036 3037 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3038 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3039 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3040 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3041 pci_set_cfgdata8(pi, PCIR_PROGIF, 3042 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3043 3044 /* 3045 * Allocate size of NVMe registers + doorbell space for all queues. 
3046 * 3047 * The specification requires a minimum memory I/O window size of 16K. 3048 * The Windows driver will refuse to start a device with a smaller 3049 * window. 3050 */ 3051 pci_membar_sz = sizeof(struct nvme_registers) + 3052 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3053 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3054 3055 DPRINTF("nvme membar size: %u", pci_membar_sz); 3056 3057 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3058 if (error) { 3059 WPRINTF("%s pci alloc mem bar failed", __func__); 3060 goto done; 3061 } 3062 3063 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3064 if (error) { 3065 WPRINTF("%s pci add msixcap failed", __func__); 3066 goto done; 3067 } 3068 3069 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3070 if (error) { 3071 WPRINTF("%s pci add Express capability failed", __func__); 3072 goto done; 3073 } 3074 3075 pthread_mutex_init(&sc->mtx, NULL); 3076 sem_init(&sc->iosemlock, 0, sc->ioslots); 3077 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3078 3079 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3080 /* 3081 * Controller data depends on Namespace data so initialize Namespace 3082 * data first. 3083 */ 3084 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3085 pci_nvme_init_ctrldata(sc); 3086 pci_nvme_init_logpages(sc); 3087 pci_nvme_init_features(sc); 3088 3089 pci_nvme_aer_init(sc); 3090 pci_nvme_aen_init(sc); 3091 3092 pci_nvme_reset(sc); 3093 3094 pci_lintr_request(pi); 3095 3096 done: 3097 return (error); 3098 } 3099 3100 static int 3101 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3102 { 3103 char *cp, *ram; 3104 3105 if (opts == NULL) 3106 return (0); 3107 3108 if (strncmp(opts, "ram=", 4) == 0) { 3109 cp = strchr(opts, ','); 3110 if (cp == NULL) { 3111 set_config_value_node(nvl, "ram", opts + 4); 3112 return (0); 3113 } 3114 ram = strndup(opts + 4, cp - opts - 4); 3115 set_config_value_node(nvl, "ram", ram); 3116 free(ram); 3117 return (pci_parse_legacy_config(nvl, cp + 1)); 3118 } else 3119 return (blockif_legacy_config(nvl, opts)); 3120 } 3121 3122 struct pci_devemu pci_de_nvme = { 3123 .pe_emu = "nvme", 3124 .pe_init = pci_nvme_init, 3125 .pe_legacy_config = pci_nvme_legacy_config, 3126 .pe_barwrite = pci_nvme_write, 3127 .pe_barread = pci_nvme_read 3128 }; 3129 PCI_EMUL_SET(pci_de_nvme); 3130