1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * bhyve PCIe-NVMe device emulation. 32 * 33 * options: 34 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 35 * 36 * accepted devpath: 37 * /dev/blockdev 38 * /path/to/image 39 * ram=size_in_MiB 40 * 41 * maxq = max number of queues 42 * qsz = max elements in each queue 43 * ioslots = max number of concurrent io requests 44 * sectsz = sector size (defaults to blockif sector size) 45 * ser = serial number (20-chars max) 46 * eui64 = IEEE Extended Unique Identifier (8 byte value) 47 * dsm = DataSet Management support. Option is one of auto, enable,disable 48 * 49 */ 50 51 /* TODO: 52 - create async event for smart and log 53 - intr coalesce 54 */ 55 56 57 #include <sys/errno.h> 58 #include <sys/types.h> 59 #ifdef __FreeBSD__ 60 #include <sys/crc16.h> 61 #else 62 #include "crc16.h" 63 #endif 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* Reported temperature in Kelvin (i.e. room temperature) */ 123 #define NVME_TEMPERATURE 296 124 125 /* helpers */ 126 127 /* Convert a zero-based value into a one-based value */ 128 #define ONE_BASED(zero) ((zero) + 1) 129 /* Convert a one-based value into a zero-based value */ 130 #define ZERO_BASED(one) ((one) - 1) 131 132 /* Encode number of SQ's and CQ's for Set/Get Features */ 133 #define NVME_FEATURE_NUM_QUEUES(sc) \ 134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16 136 137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 138 139 enum nvme_controller_register_offsets { 140 NVME_CR_CAP_LOW = 0x00, 141 NVME_CR_CAP_HI = 0x04, 142 NVME_CR_VS = 0x08, 143 NVME_CR_INTMS = 0x0c, 144 NVME_CR_INTMC = 0x10, 145 NVME_CR_CC = 0x14, 146 NVME_CR_CSTS = 0x1c, 147 NVME_CR_NSSR = 0x20, 148 NVME_CR_AQA = 0x24, 149 NVME_CR_ASQ_LOW = 0x28, 150 NVME_CR_ASQ_HI = 0x2c, 151 NVME_CR_ACQ_LOW = 0x30, 152 NVME_CR_ACQ_HI = 0x34, 153 }; 154 155 enum nvme_cmd_cdw11 { 156 NVME_CMD_CDW11_PC = 0x0001, 157 NVME_CMD_CDW11_IEN = 0x0002, 158 NVME_CMD_CDW11_IV = 0xFFFF0000, 159 }; 160 161 enum nvme_copy_dir { 162 NVME_COPY_TO_PRP, 163 NVME_COPY_FROM_PRP, 164 }; 165 166 #define NVME_CQ_INTEN 0x01 167 #define NVME_CQ_INTCOAL 0x02 168 169 struct nvme_completion_queue { 170 struct nvme_completion *qbase; 171 pthread_mutex_t mtx; 172 uint32_t size; 173 uint16_t tail; /* nvme progress */ 174 uint16_t head; /* guest progress */ 175 uint16_t intr_vec; 176 uint32_t intr_en; 177 }; 178 179 struct nvme_submission_queue { 180 struct nvme_command *qbase; 181 pthread_mutex_t mtx; 182 uint32_t size; 183 uint16_t head; /* nvme progress */ 184 uint16_t tail; /* guest progress */ 185 uint16_t cqid; /* completion queue id */ 186 int qpriority; 187 }; 188 189 enum nvme_storage_type { 190 NVME_STOR_BLOCKIF = 0, 191 NVME_STOR_RAM = 1, 192 }; 193 194 struct pci_nvme_blockstore { 195 enum nvme_storage_type type; 196 void *ctx; 197 uint64_t size; 198 uint32_t sectsz; 199 uint32_t sectsz_bits; 200 uint64_t eui64; 201 uint32_t deallocate:1; 202 }; 203 204 /* 205 * Calculate the number of additional page descriptors for guest IO requests 206 * based on the advertised Max Data Transfer (MDTS) and given the number of 207 * default iovec's in a struct blockif_req. 208 */ 209 #define MDTS_PAD_SIZE \ 210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 212 0 ) 213 214 struct pci_nvme_ioreq { 215 struct pci_nvme_softc *sc; 216 STAILQ_ENTRY(pci_nvme_ioreq) link; 217 struct nvme_submission_queue *nvme_sq; 218 uint16_t sqid; 219 220 /* command information */ 221 uint16_t opc; 222 uint16_t cid; 223 uint32_t nsid; 224 225 uint64_t prev_gpaddr; 226 size_t prev_size; 227 size_t bytes; 228 229 struct blockif_req io_req; 230 231 struct iovec iovpadding[MDTS_PAD_SIZE]; 232 }; 233 234 enum nvme_dsm_type { 235 /* Dataset Management bit in ONCS reflects backing storage capability */ 236 NVME_DATASET_MANAGEMENT_AUTO, 237 /* Unconditionally set Dataset Management bit in ONCS */ 238 NVME_DATASET_MANAGEMENT_ENABLE, 239 /* Unconditionally clear Dataset Management bit in ONCS */ 240 NVME_DATASET_MANAGEMENT_DISABLE, 241 }; 242 243 struct pci_nvme_softc; 244 struct nvme_feature_obj; 245 246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 247 struct nvme_feature_obj *, 248 struct nvme_command *, 249 struct nvme_completion *); 250 251 struct nvme_feature_obj { 252 uint32_t cdw11; 253 nvme_feature_cb set; 254 nvme_feature_cb get; 255 bool namespace_specific; 256 }; 257 258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 259 260 typedef enum { 261 PCI_NVME_AE_TYPE_ERROR = 0, 262 PCI_NVME_AE_TYPE_SMART, 263 PCI_NVME_AE_TYPE_NOTICE, 264 PCI_NVME_AE_TYPE_IO_CMD = 6, 265 PCI_NVME_AE_TYPE_VENDOR = 7, 266 PCI_NVME_AE_TYPE_MAX /* Must be last */ 267 } pci_nvme_async_type; 268 269 /* Asynchronous Event Requests */ 270 struct pci_nvme_aer { 271 STAILQ_ENTRY(pci_nvme_aer) link; 272 uint16_t cid; /* Command ID of the submitted AER */ 273 }; 274 275 /** Asynchronous Event Information - Error */ 276 typedef enum { 277 PCI_NVME_AEI_ERROR_INVALID_DB, 278 PCI_NVME_AEI_ERROR_INVALID_DB_VALUE, 279 PCI_NVME_AEI_ERROR_DIAG_FAILURE, 280 PCI_NVME_AEI_ERROR_PERSISTANT_ERR, 281 PCI_NVME_AEI_ERROR_TRANSIENT_ERR, 282 PCI_NVME_AEI_ERROR_FIRMWARE_LOAD_ERR, 283 PCI_NVME_AEI_ERROR_MAX, 284 } pci_nvme_async_event_info_error; 285 286 /** Asynchronous Event Information - Notice */ 287 typedef enum { 288 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 289 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 290 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 291 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 292 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 293 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 294 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 295 PCI_NVME_AEI_NOTICE_MAX, 296 } pci_nvme_async_event_info_notice; 297 298 #define PCI_NVME_AEI_NOTICE_SHIFT 8 299 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 300 301 /* Asynchronous Event Notifications */ 302 struct pci_nvme_aen { 303 pci_nvme_async_type atype; 304 uint32_t event_data; 305 bool posted; 306 }; 307 308 /* 309 * By default, enable all Asynchrnous Event Notifications: 310 * SMART / Health Critical Warnings 311 * Namespace Attribute Notices 312 */ 313 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 314 315 typedef enum { 316 NVME_CNTRLTYPE_IO = 1, 317 NVME_CNTRLTYPE_DISCOVERY = 2, 318 NVME_CNTRLTYPE_ADMIN = 3, 319 } pci_nvme_cntrl_type; 320 321 struct pci_nvme_softc { 322 struct pci_devinst *nsc_pi; 323 324 pthread_mutex_t mtx; 325 326 struct nvme_registers regs; 327 328 struct nvme_namespace_data nsdata; 329 struct nvme_controller_data ctrldata; 330 struct nvme_error_information_entry err_log; 331 struct nvme_health_information_page health_log; 332 struct nvme_firmware_page fw_log; 333 struct nvme_ns_list ns_log; 334 335 struct pci_nvme_blockstore nvstore; 336 337 uint16_t max_qentries; /* max entries per queue */ 338 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 339 uint32_t num_cqueues; 340 uint32_t num_squeues; 341 bool num_q_is_set; /* Has host set Number of Queues */ 342 343 struct pci_nvme_ioreq *ioreqs; 344 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 345 uint32_t pending_ios; 346 uint32_t ioslots; 347 sem_t iosemlock; 348 349 /* 350 * Memory mapped Submission and Completion queues 351 * Each array includes both Admin and IO queues 352 */ 353 struct nvme_completion_queue *compl_queues; 354 struct nvme_submission_queue *submit_queues; 355 356 struct nvme_feature_obj feat[NVME_FID_MAX]; 357 358 enum nvme_dsm_type dataset_management; 359 360 /* Accounting for SMART data */ 361 __uint128_t read_data_units; 362 __uint128_t write_data_units; 363 __uint128_t read_commands; 364 __uint128_t write_commands; 365 uint32_t read_dunits_remainder; 366 uint32_t write_dunits_remainder; 367 368 STAILQ_HEAD(, pci_nvme_aer) aer_list; 369 pthread_mutex_t aer_mtx; 370 uint32_t aer_count; 371 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 372 pthread_t aen_tid; 373 pthread_mutex_t aen_mtx; 374 pthread_cond_t aen_cond; 375 }; 376 377 378 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 379 struct nvme_completion_queue *cq, 380 uint32_t cdw0, 381 uint16_t cid, 382 uint16_t sqid, 383 uint16_t status); 384 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 385 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 386 static void pci_nvme_io_done(struct blockif_req *, int); 387 388 /* Controller Configuration utils */ 389 #define NVME_CC_GET_EN(cc) \ 390 NVMEV(NVME_CC_REG_EN, cc) 391 #define NVME_CC_GET_CSS(cc) \ 392 NVMEV(NVME_CC_REG_CSS, cc) 393 #define NVME_CC_GET_SHN(cc) \ 394 NVMEV(NVME_CC_REG_SHN, cc) 395 #define NVME_CC_GET_IOSQES(cc) \ 396 NVMEV(NVME_CC_REG_IOSQES, cc) 397 #define NVME_CC_GET_IOCQES(cc) \ 398 NVMEV(NVME_CC_REG_IOCQES, cc) 399 400 #define NVME_CC_WRITE_MASK \ 401 (NVMEM(NVME_CC_REG_EN) | \ 402 NVMEM(NVME_CC_REG_IOSQES) | \ 403 NVMEM(NVME_CC_REG_IOCQES)) 404 405 #define NVME_CC_NEN_WRITE_MASK \ 406 (NVMEM(NVME_CC_REG_CSS) | \ 407 NVMEM(NVME_CC_REG_MPS) | \ 408 NVMEM(NVME_CC_REG_AMS)) 409 410 /* Controller Status utils */ 411 #define NVME_CSTS_GET_RDY(sts) \ 412 NVMEV(NVME_CSTS_REG_RDY, sts) 413 414 #define NVME_CSTS_RDY (NVMEF(NVME_CSTS_REG_RDY, 1)) 415 #define NVME_CSTS_CFS (NVMEF(NVME_CSTS_REG_CFS, 1)) 416 417 /* Completion Queue status word utils */ 418 #define NVME_STATUS_P (NVMEF(NVME_STATUS_P, 1)) 419 #define NVME_STATUS_MASK \ 420 (NVMEM(NVME_STATUS_SCT) | \ 421 NVMEM(NVME_STATUS_SC)) 422 423 #define NVME_ONCS_DSM NVMEM(NVME_CTRLR_DATA_ONCS_DSM) 424 425 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 426 struct nvme_feature_obj *, 427 struct nvme_command *, 428 struct nvme_completion *); 429 static void nvme_feature_temperature(struct pci_nvme_softc *, 430 struct nvme_feature_obj *, 431 struct nvme_command *, 432 struct nvme_completion *); 433 static void nvme_feature_num_queues(struct pci_nvme_softc *, 434 struct nvme_feature_obj *, 435 struct nvme_command *, 436 struct nvme_completion *); 437 static void nvme_feature_iv_config(struct pci_nvme_softc *, 438 struct nvme_feature_obj *, 439 struct nvme_command *, 440 struct nvme_completion *); 441 static void nvme_feature_async_event(struct pci_nvme_softc *, 442 struct nvme_feature_obj *, 443 struct nvme_command *, 444 struct nvme_completion *); 445 446 static void *aen_thr(void *arg); 447 448 static __inline void 449 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 450 { 451 size_t len; 452 453 len = strnlen(src, dst_size); 454 memset(dst, pad, dst_size); 455 memcpy(dst, src, len); 456 } 457 458 static __inline void 459 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 460 { 461 462 *status &= ~NVME_STATUS_MASK; 463 *status |= NVMEF(NVME_STATUS_SCT, type) | NVMEF(NVME_STATUS_SC, code); 464 } 465 466 static __inline void 467 pci_nvme_status_genc(uint16_t *status, uint16_t code) 468 { 469 470 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 471 } 472 473 /* 474 * Initialize the requested number or IO Submission and Completion Queues. 475 * Admin queues are allocated implicitly. 476 */ 477 static void 478 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 479 { 480 uint32_t i; 481 482 /* 483 * Allocate and initialize the Submission Queues 484 */ 485 if (nsq > NVME_QUEUES) { 486 WPRINTF("%s: clamping number of SQ from %u to %u", 487 __func__, nsq, NVME_QUEUES); 488 nsq = NVME_QUEUES; 489 } 490 491 sc->num_squeues = nsq; 492 493 sc->submit_queues = calloc(sc->num_squeues + 1, 494 sizeof(struct nvme_submission_queue)); 495 if (sc->submit_queues == NULL) { 496 WPRINTF("%s: SQ allocation failed", __func__); 497 sc->num_squeues = 0; 498 } else { 499 struct nvme_submission_queue *sq = sc->submit_queues; 500 501 for (i = 0; i < sc->num_squeues + 1; i++) 502 pthread_mutex_init(&sq[i].mtx, NULL); 503 } 504 505 /* 506 * Allocate and initialize the Completion Queues 507 */ 508 if (ncq > NVME_QUEUES) { 509 WPRINTF("%s: clamping number of CQ from %u to %u", 510 __func__, ncq, NVME_QUEUES); 511 ncq = NVME_QUEUES; 512 } 513 514 sc->num_cqueues = ncq; 515 516 sc->compl_queues = calloc(sc->num_cqueues + 1, 517 sizeof(struct nvme_completion_queue)); 518 if (sc->compl_queues == NULL) { 519 WPRINTF("%s: CQ allocation failed", __func__); 520 sc->num_cqueues = 0; 521 } else { 522 struct nvme_completion_queue *cq = sc->compl_queues; 523 524 for (i = 0; i < sc->num_cqueues + 1; i++) 525 pthread_mutex_init(&cq[i].mtx, NULL); 526 } 527 } 528 529 static void 530 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 531 { 532 struct nvme_controller_data *cd = &sc->ctrldata; 533 int ret; 534 535 cd->vid = 0xFB5D; 536 cd->ssvid = 0x0000; 537 538 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 539 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 540 541 /* Num of submission commands that we can handle at a time (2^rab) */ 542 cd->rab = 4; 543 544 /* FreeBSD OUI */ 545 cd->ieee[0] = 0xfc; 546 cd->ieee[1] = 0x9c; 547 cd->ieee[2] = 0x58; 548 549 cd->mic = 0; 550 551 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 552 553 cd->ver = NVME_REV(1,4); 554 555 cd->cntrltype = NVME_CNTRLTYPE_IO; 556 cd->oacs = NVMEF(NVME_CTRLR_DATA_OACS_FORMAT, 1); 557 cd->oaes = NVMEM(NVME_CTRLR_DATA_OAES_NS_ATTR); 558 cd->acl = 2; 559 cd->aerl = 4; 560 561 /* Advertise 1, Read-only firmware slot */ 562 cd->frmw = NVMEM(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 563 NVMEF(NVME_CTRLR_DATA_FRMW_NUM_SLOTS, 1); 564 cd->lpa = 0; /* TODO: support some simple things like SMART */ 565 cd->elpe = 0; /* max error log page entries */ 566 /* 567 * Report a single power state (zero-based value) 568 * power_state[] values are left as zero to indicate "Not reported" 569 */ 570 cd->npss = 0; 571 572 /* Warning Composite Temperature Threshold */ 573 cd->wctemp = 0x0157; 574 cd->cctemp = 0x0157; 575 576 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 577 cd->sanicap = NVMEF(NVME_CTRLR_DATA_SANICAP_NODMMAS, 578 NVME_CTRLR_DATA_SANICAP_NODMMAS_NO); 579 580 cd->sqes = NVMEF(NVME_CTRLR_DATA_SQES_MAX, 6) | 581 NVMEF(NVME_CTRLR_DATA_SQES_MIN, 6); 582 cd->cqes = NVMEF(NVME_CTRLR_DATA_CQES_MAX, 4) | 583 NVMEF(NVME_CTRLR_DATA_CQES_MIN, 4); 584 cd->nn = 1; /* number of namespaces */ 585 586 cd->oncs = 0; 587 switch (sc->dataset_management) { 588 case NVME_DATASET_MANAGEMENT_AUTO: 589 if (sc->nvstore.deallocate) 590 cd->oncs |= NVME_ONCS_DSM; 591 break; 592 case NVME_DATASET_MANAGEMENT_ENABLE: 593 cd->oncs |= NVME_ONCS_DSM; 594 break; 595 default: 596 break; 597 } 598 599 cd->fna = NVMEM(NVME_CTRLR_DATA_FNA_FORMAT_ALL); 600 601 cd->vwc = NVMEF(NVME_CTRLR_DATA_VWC_ALL, NVME_CTRLR_DATA_VWC_ALL_NO); 602 603 #ifdef __FreeBSD__ 604 ret = snprintf(cd->subnqn, sizeof(cd->subnqn), 605 "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u", 606 get_config_value("name"), sc->nsc_pi->pi_bus, 607 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 608 #else 609 ret = snprintf((char *)cd->subnqn, sizeof (cd->subnqn), 610 "nqn.2013-12.org.illumos:bhyve-%s-%u-%u-%u", 611 get_config_value("name"), sc->nsc_pi->pi_bus, 612 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 613 #endif 614 if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn))) 615 EPRINTLN("%s: error setting subnqn (%d)", __func__, ret); 616 } 617 618 static void 619 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 620 struct nvme_namespace_data *nd) 621 { 622 623 /* Get capacity and block size information from backing store */ 624 nd->nsze = nvstore->size / nvstore->sectsz; 625 nd->ncap = nd->nsze; 626 nd->nuse = nd->nsze; 627 } 628 629 static void 630 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 631 struct nvme_namespace_data *nd, uint32_t nsid, 632 struct pci_nvme_blockstore *nvstore) 633 { 634 635 pci_nvme_init_nsdata_size(nvstore, nd); 636 637 if (nvstore->type == NVME_STOR_BLOCKIF) 638 nvstore->deallocate = blockif_candelete(nvstore->ctx); 639 640 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 641 nd->flbas = 0; 642 643 /* Create an EUI-64 if user did not provide one */ 644 if (nvstore->eui64 == 0) { 645 char *data = NULL; 646 uint64_t eui64 = nvstore->eui64; 647 648 asprintf(&data, "%s%u%u%u", get_config_value("name"), 649 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 650 sc->nsc_pi->pi_func); 651 652 if (data != NULL) { 653 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 654 free(data); 655 } 656 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 657 } 658 be64enc(nd->eui64, nvstore->eui64); 659 660 /* LBA data-sz = 2^lbads */ 661 nd->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, nvstore->sectsz_bits); 662 } 663 664 static void 665 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 666 { 667 __uint128_t power_cycles = 1; 668 669 memset(&sc->err_log, 0, sizeof(sc->err_log)); 670 memset(&sc->health_log, 0, sizeof(sc->health_log)); 671 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 672 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 673 674 /* Set read/write remainder to round up according to spec */ 675 sc->read_dunits_remainder = 999; 676 sc->write_dunits_remainder = 999; 677 678 /* Set nominal Health values checked by implementations */ 679 sc->health_log.temperature = NVME_TEMPERATURE; 680 sc->health_log.available_spare = 100; 681 sc->health_log.available_spare_threshold = 10; 682 683 /* Set Active Firmware Info to slot 1 */ 684 sc->fw_log.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); 685 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 686 sizeof(sc->fw_log.revision[0])); 687 688 memcpy(&sc->health_log.power_cycles, &power_cycles, 689 sizeof(sc->health_log.power_cycles)); 690 } 691 692 static void 693 pci_nvme_init_features(struct pci_nvme_softc *sc) 694 { 695 enum nvme_feature fid; 696 697 for (fid = 0; fid < NVME_FID_MAX; fid++) { 698 switch (fid) { 699 case NVME_FEAT_ARBITRATION: 700 case NVME_FEAT_POWER_MANAGEMENT: 701 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 702 case NVME_FEAT_WRITE_ATOMICITY: 703 /* Mandatory but no special handling required */ 704 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 705 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 706 // this returns a data buffer 707 break; 708 case NVME_FEAT_TEMPERATURE_THRESHOLD: 709 sc->feat[fid].set = nvme_feature_temperature; 710 break; 711 case NVME_FEAT_ERROR_RECOVERY: 712 sc->feat[fid].namespace_specific = true; 713 break; 714 case NVME_FEAT_NUMBER_OF_QUEUES: 715 sc->feat[fid].set = nvme_feature_num_queues; 716 break; 717 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 718 sc->feat[fid].set = nvme_feature_iv_config; 719 break; 720 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 721 sc->feat[fid].set = nvme_feature_async_event; 722 /* Enable all AENs by default */ 723 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 724 break; 725 default: 726 sc->feat[fid].set = nvme_feature_invalid_cb; 727 sc->feat[fid].get = nvme_feature_invalid_cb; 728 } 729 } 730 } 731 732 static void 733 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 734 { 735 736 STAILQ_INIT(&sc->aer_list); 737 sc->aer_count = 0; 738 } 739 740 static void 741 pci_nvme_aer_init(struct pci_nvme_softc *sc) 742 { 743 744 pthread_mutex_init(&sc->aer_mtx, NULL); 745 pci_nvme_aer_reset(sc); 746 } 747 748 static void 749 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 750 { 751 struct pci_nvme_aer *aer = NULL; 752 753 pthread_mutex_lock(&sc->aer_mtx); 754 while (!STAILQ_EMPTY(&sc->aer_list)) { 755 aer = STAILQ_FIRST(&sc->aer_list); 756 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 757 free(aer); 758 } 759 pthread_mutex_unlock(&sc->aer_mtx); 760 761 pci_nvme_aer_reset(sc); 762 } 763 764 static bool 765 pci_nvme_aer_available(struct pci_nvme_softc *sc) 766 { 767 768 return (sc->aer_count != 0); 769 } 770 771 static bool 772 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 773 { 774 struct nvme_controller_data *cd = &sc->ctrldata; 775 776 /* AERL is a zero based value while aer_count is one's based */ 777 return (sc->aer_count == (cd->aerl + 1U)); 778 } 779 780 /* 781 * Add an Async Event Request 782 * 783 * Stores an AER to be returned later if the Controller needs to notify the 784 * host of an event. 785 * Note that while the NVMe spec doesn't require Controllers to return AER's 786 * in order, this implementation does preserve the order. 787 */ 788 static int 789 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 790 { 791 struct pci_nvme_aer *aer = NULL; 792 793 aer = calloc(1, sizeof(struct pci_nvme_aer)); 794 if (aer == NULL) 795 return (-1); 796 797 /* Save the Command ID for use in the completion message */ 798 aer->cid = cid; 799 800 pthread_mutex_lock(&sc->aer_mtx); 801 sc->aer_count++; 802 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 803 pthread_mutex_unlock(&sc->aer_mtx); 804 805 return (0); 806 } 807 808 /* 809 * Get an Async Event Request structure 810 * 811 * Returns a pointer to an AER previously submitted by the host or NULL if 812 * no AER's exist. Caller is responsible for freeing the returned struct. 813 */ 814 static struct pci_nvme_aer * 815 pci_nvme_aer_get(struct pci_nvme_softc *sc) 816 { 817 struct pci_nvme_aer *aer = NULL; 818 819 pthread_mutex_lock(&sc->aer_mtx); 820 aer = STAILQ_FIRST(&sc->aer_list); 821 if (aer != NULL) { 822 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 823 sc->aer_count--; 824 } 825 pthread_mutex_unlock(&sc->aer_mtx); 826 827 return (aer); 828 } 829 830 static void 831 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 832 { 833 uint32_t atype; 834 835 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 836 837 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 838 sc->aen[atype].atype = atype; 839 } 840 } 841 842 static void 843 pci_nvme_aen_init(struct pci_nvme_softc *sc) 844 { 845 char nstr[80]; 846 847 pci_nvme_aen_reset(sc); 848 849 pthread_mutex_init(&sc->aen_mtx, NULL); 850 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 851 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 852 sc->nsc_pi->pi_func); 853 pthread_set_name_np(sc->aen_tid, nstr); 854 } 855 856 static void 857 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 858 { 859 860 pci_nvme_aen_reset(sc); 861 } 862 863 /* Notify the AEN thread of pending work */ 864 static void 865 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 866 { 867 868 pthread_cond_signal(&sc->aen_cond); 869 } 870 871 /* 872 * Post an Asynchronous Event Notification 873 */ 874 static int32_t 875 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 876 uint32_t event_data) 877 { 878 struct pci_nvme_aen *aen; 879 880 if (atype >= PCI_NVME_AE_TYPE_MAX) { 881 return(EINVAL); 882 } 883 884 pthread_mutex_lock(&sc->aen_mtx); 885 aen = &sc->aen[atype]; 886 887 /* Has the controller already posted an event of this type? */ 888 if (aen->posted) { 889 pthread_mutex_unlock(&sc->aen_mtx); 890 return(EALREADY); 891 } 892 893 aen->event_data = event_data; 894 aen->posted = true; 895 pthread_mutex_unlock(&sc->aen_mtx); 896 897 pci_nvme_aen_notify(sc); 898 899 return(0); 900 } 901 902 static void 903 pci_nvme_aen_process(struct pci_nvme_softc *sc) 904 { 905 struct pci_nvme_aer *aer; 906 struct pci_nvme_aen *aen; 907 pci_nvme_async_type atype; 908 uint32_t mask; 909 uint16_t status; 910 uint8_t lid; 911 912 #ifndef __FreeBSD__ 913 lid = 0; 914 #endif 915 916 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 917 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 918 aen = &sc->aen[atype]; 919 /* Previous iterations may have depleted the available AER's */ 920 if (!pci_nvme_aer_available(sc)) { 921 DPRINTF("%s: no AER", __func__); 922 break; 923 } 924 925 if (!aen->posted) { 926 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 927 continue; 928 } 929 930 status = NVME_SC_SUCCESS; 931 932 /* Is the event masked? */ 933 mask = 934 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 935 936 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 937 switch (atype) { 938 case PCI_NVME_AE_TYPE_ERROR: 939 lid = NVME_LOG_ERROR; 940 break; 941 case PCI_NVME_AE_TYPE_SMART: 942 mask &= 0xff; 943 if ((mask & aen->event_data) == 0) 944 continue; 945 lid = NVME_LOG_HEALTH_INFORMATION; 946 break; 947 case PCI_NVME_AE_TYPE_NOTICE: 948 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 949 EPRINTLN("%s unknown AEN notice type %u", 950 __func__, aen->event_data); 951 status = NVME_SC_INTERNAL_DEVICE_ERROR; 952 lid = 0; 953 break; 954 } 955 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 956 continue; 957 switch (aen->event_data) { 958 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 959 lid = NVME_LOG_CHANGED_NAMESPACE; 960 break; 961 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 962 lid = NVME_LOG_FIRMWARE_SLOT; 963 break; 964 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 965 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 966 break; 967 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 968 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 969 break; 970 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 971 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 972 break; 973 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 974 lid = NVME_LOG_LBA_STATUS_INFORMATION; 975 break; 976 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 977 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 978 break; 979 default: 980 lid = 0; 981 } 982 break; 983 default: 984 /* bad type?!? */ 985 EPRINTLN("%s unknown AEN type %u", __func__, atype); 986 status = NVME_SC_INTERNAL_DEVICE_ERROR; 987 lid = 0; 988 break; 989 } 990 991 aer = pci_nvme_aer_get(sc); 992 assert(aer != NULL); 993 994 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 995 pci_nvme_cq_update(sc, &sc->compl_queues[0], 996 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 997 aer->cid, 998 0, /* SQID */ 999 status); 1000 1001 aen->event_data = 0; 1002 aen->posted = false; 1003 1004 pci_generate_msix(sc->nsc_pi, 0); 1005 } 1006 } 1007 1008 static void * 1009 aen_thr(void *arg) 1010 { 1011 struct pci_nvme_softc *sc; 1012 1013 sc = arg; 1014 1015 pthread_mutex_lock(&sc->aen_mtx); 1016 for (;;) { 1017 pci_nvme_aen_process(sc); 1018 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1019 } 1020 #ifdef __FreeBSD__ /* Smatch spots unreachable code */ 1021 pthread_mutex_unlock(&sc->aen_mtx); 1022 1023 pthread_exit(NULL); 1024 #endif 1025 return (NULL); 1026 } 1027 1028 static void 1029 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1030 { 1031 uint32_t i; 1032 1033 DPRINTF("%s", __func__); 1034 1035 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1036 NVMEF(NVME_CAP_LO_REG_CQR, 1) | 1037 NVMEF(NVME_CAP_LO_REG_TO, 60); 1038 1039 sc->regs.cap_hi = NVMEF(NVME_CAP_HI_REG_CSS_NVM, 1); 1040 1041 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1042 1043 sc->regs.cc = 0; 1044 1045 assert(sc->submit_queues != NULL); 1046 1047 for (i = 0; i < sc->num_squeues + 1; i++) { 1048 sc->submit_queues[i].qbase = NULL; 1049 sc->submit_queues[i].size = 0; 1050 sc->submit_queues[i].cqid = 0; 1051 sc->submit_queues[i].tail = 0; 1052 sc->submit_queues[i].head = 0; 1053 } 1054 1055 assert(sc->compl_queues != NULL); 1056 1057 for (i = 0; i < sc->num_cqueues + 1; i++) { 1058 sc->compl_queues[i].qbase = NULL; 1059 sc->compl_queues[i].size = 0; 1060 sc->compl_queues[i].tail = 0; 1061 sc->compl_queues[i].head = 0; 1062 } 1063 1064 sc->num_q_is_set = false; 1065 1066 pci_nvme_aer_destroy(sc); 1067 pci_nvme_aen_destroy(sc); 1068 1069 /* 1070 * Clear CSTS.RDY last to prevent the host from enabling Controller 1071 * before cleanup completes 1072 */ 1073 sc->regs.csts = 0; 1074 } 1075 1076 static void 1077 pci_nvme_reset(struct pci_nvme_softc *sc) 1078 { 1079 pthread_mutex_lock(&sc->mtx); 1080 pci_nvme_reset_locked(sc); 1081 pthread_mutex_unlock(&sc->mtx); 1082 } 1083 1084 static int 1085 pci_nvme_init_controller(struct pci_nvme_softc *sc) 1086 { 1087 uint16_t acqs, asqs; 1088 1089 DPRINTF("%s", __func__); 1090 1091 /* 1092 * NVMe 2.0 states that "enabling a controller while this field is 1093 * cleared to 0h produces undefined results" for both ACQS and 1094 * ASQS. If zero, set CFS and do not become ready. 1095 */ 1096 asqs = ONE_BASED(NVMEV(NVME_AQA_REG_ASQS, sc->regs.aqa)); 1097 if (asqs < 2) { 1098 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1099 asqs - 1, sc->regs.aqa); 1100 sc->regs.csts |= NVME_CSTS_CFS; 1101 return (-1); 1102 } 1103 sc->submit_queues[0].size = asqs; 1104 sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1105 sc->regs.asq, sizeof(struct nvme_command) * asqs); 1106 if (sc->submit_queues[0].qbase == NULL) { 1107 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1108 sc->regs.asq); 1109 sc->regs.csts |= NVME_CSTS_CFS; 1110 return (-1); 1111 } 1112 1113 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1114 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1115 1116 acqs = ONE_BASED(NVMEV(NVME_AQA_REG_ACQS, sc->regs.aqa)); 1117 if (acqs < 2) { 1118 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1119 acqs - 1, sc->regs.aqa); 1120 sc->regs.csts |= NVME_CSTS_CFS; 1121 return (-1); 1122 } 1123 sc->compl_queues[0].size = acqs; 1124 sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1125 sc->regs.acq, sizeof(struct nvme_completion) * acqs); 1126 if (sc->compl_queues[0].qbase == NULL) { 1127 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1128 sc->regs.acq); 1129 sc->regs.csts |= NVME_CSTS_CFS; 1130 return (-1); 1131 } 1132 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1133 1134 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1135 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1136 1137 return (0); 1138 } 1139 1140 static int 1141 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1142 size_t len, enum nvme_copy_dir dir) 1143 { 1144 uint8_t *p; 1145 size_t bytes; 1146 1147 if (len > (8 * 1024)) { 1148 return (-1); 1149 } 1150 1151 /* Copy from the start of prp1 to the end of the physical page */ 1152 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1153 bytes = MIN(bytes, len); 1154 1155 p = vm_map_gpa(ctx, prp1, bytes); 1156 if (p == NULL) { 1157 return (-1); 1158 } 1159 1160 if (dir == NVME_COPY_TO_PRP) 1161 memcpy(p, b, bytes); 1162 else 1163 memcpy(b, p, bytes); 1164 1165 b += bytes; 1166 1167 len -= bytes; 1168 if (len == 0) { 1169 return (0); 1170 } 1171 1172 len = MIN(len, PAGE_SIZE); 1173 1174 p = vm_map_gpa(ctx, prp2, len); 1175 if (p == NULL) { 1176 return (-1); 1177 } 1178 1179 if (dir == NVME_COPY_TO_PRP) 1180 memcpy(p, b, len); 1181 else 1182 memcpy(b, p, len); 1183 1184 return (0); 1185 } 1186 1187 /* 1188 * Write a Completion Queue Entry update 1189 * 1190 * Write the completion and update the doorbell value 1191 */ 1192 static void 1193 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1194 struct nvme_completion_queue *cq, 1195 uint32_t cdw0, 1196 uint16_t cid, 1197 uint16_t sqid, 1198 uint16_t status) 1199 { 1200 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1201 struct nvme_completion *cqe; 1202 1203 assert(cq->qbase != NULL); 1204 1205 pthread_mutex_lock(&cq->mtx); 1206 1207 cqe = &cq->qbase[cq->tail]; 1208 1209 /* Flip the phase bit */ 1210 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1211 1212 cqe->cdw0 = cdw0; 1213 cqe->sqhd = sq->head; 1214 cqe->sqid = sqid; 1215 cqe->cid = cid; 1216 cqe->status = status; 1217 1218 cq->tail++; 1219 if (cq->tail >= cq->size) { 1220 cq->tail = 0; 1221 } 1222 1223 pthread_mutex_unlock(&cq->mtx); 1224 } 1225 1226 static int 1227 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1228 struct nvme_completion* compl) 1229 { 1230 uint16_t qid = command->cdw10 & 0xffff; 1231 1232 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1233 if (qid == 0 || qid > sc->num_squeues || 1234 (sc->submit_queues[qid].qbase == NULL)) { 1235 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1236 __func__, qid, sc->num_squeues); 1237 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1238 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1239 return (1); 1240 } 1241 1242 sc->submit_queues[qid].qbase = NULL; 1243 sc->submit_queues[qid].cqid = 0; 1244 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1245 return (1); 1246 } 1247 1248 static int 1249 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1250 struct nvme_completion* compl) 1251 { 1252 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1253 uint16_t qid = command->cdw10 & 0xffff; 1254 struct nvme_submission_queue *nsq; 1255 1256 if ((qid == 0) || (qid > sc->num_squeues) || 1257 (sc->submit_queues[qid].qbase != NULL)) { 1258 WPRINTF("%s queue index %u > num_squeues %u", 1259 __func__, qid, sc->num_squeues); 1260 pci_nvme_status_tc(&compl->status, 1261 NVME_SCT_COMMAND_SPECIFIC, 1262 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1263 return (1); 1264 } 1265 1266 nsq = &sc->submit_queues[qid]; 1267 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1268 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1269 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1270 /* 1271 * Queues must specify at least two entries 1272 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1273 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1274 */ 1275 pci_nvme_status_tc(&compl->status, 1276 NVME_SCT_COMMAND_SPECIFIC, 1277 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1278 return (1); 1279 } 1280 nsq->head = nsq->tail = 0; 1281 1282 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1283 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1284 pci_nvme_status_tc(&compl->status, 1285 NVME_SCT_COMMAND_SPECIFIC, 1286 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1287 return (1); 1288 } 1289 1290 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1291 pci_nvme_status_tc(&compl->status, 1292 NVME_SCT_COMMAND_SPECIFIC, 1293 NVME_SC_COMPLETION_QUEUE_INVALID); 1294 return (1); 1295 } 1296 1297 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1298 1299 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1300 sizeof(struct nvme_command) * (size_t)nsq->size); 1301 1302 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1303 qid, nsq->size, nsq->qbase, nsq->cqid); 1304 1305 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1306 1307 DPRINTF("%s completed creating IOSQ qid %u", 1308 __func__, qid); 1309 } else { 1310 /* 1311 * Guest sent non-cont submission queue request. 1312 * This setting is unsupported by this emulation. 1313 */ 1314 WPRINTF("%s unsupported non-contig (list-based) " 1315 "create i/o submission queue", __func__); 1316 1317 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1318 } 1319 return (1); 1320 } 1321 1322 static int 1323 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1324 struct nvme_completion* compl) 1325 { 1326 uint16_t qid = command->cdw10 & 0xffff; 1327 uint16_t sqid; 1328 1329 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1330 if (qid == 0 || qid > sc->num_cqueues || 1331 (sc->compl_queues[qid].qbase == NULL)) { 1332 WPRINTF("%s queue index %u / num_cqueues %u", 1333 __func__, qid, sc->num_cqueues); 1334 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1335 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1336 return (1); 1337 } 1338 1339 /* Deleting an Active CQ is an error */ 1340 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1341 if (sc->submit_queues[sqid].cqid == qid) { 1342 pci_nvme_status_tc(&compl->status, 1343 NVME_SCT_COMMAND_SPECIFIC, 1344 NVME_SC_INVALID_QUEUE_DELETION); 1345 return (1); 1346 } 1347 1348 sc->compl_queues[qid].qbase = NULL; 1349 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1350 return (1); 1351 } 1352 1353 static int 1354 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1355 struct nvme_completion* compl) 1356 { 1357 struct nvme_completion_queue *ncq; 1358 uint16_t qid = command->cdw10 & 0xffff; 1359 1360 /* Only support Physically Contiguous queues */ 1361 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1362 WPRINTF("%s unsupported non-contig (list-based) " 1363 "create i/o completion queue", 1364 __func__); 1365 1366 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1367 return (1); 1368 } 1369 1370 if ((qid == 0) || (qid > sc->num_cqueues) || 1371 (sc->compl_queues[qid].qbase != NULL)) { 1372 WPRINTF("%s queue index %u > num_cqueues %u", 1373 __func__, qid, sc->num_cqueues); 1374 pci_nvme_status_tc(&compl->status, 1375 NVME_SCT_COMMAND_SPECIFIC, 1376 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1377 return (1); 1378 } 1379 1380 ncq = &sc->compl_queues[qid]; 1381 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1382 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1383 if (ncq->intr_vec > (sc->max_queues + 1)) { 1384 pci_nvme_status_tc(&compl->status, 1385 NVME_SCT_COMMAND_SPECIFIC, 1386 NVME_SC_INVALID_INTERRUPT_VECTOR); 1387 return (1); 1388 } 1389 1390 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1391 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1392 /* 1393 * Queues must specify at least two entries 1394 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1395 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1396 */ 1397 pci_nvme_status_tc(&compl->status, 1398 NVME_SCT_COMMAND_SPECIFIC, 1399 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1400 return (1); 1401 } 1402 ncq->head = ncq->tail = 0; 1403 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1404 command->prp1, 1405 sizeof(struct nvme_command) * (size_t)ncq->size); 1406 1407 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1408 1409 1410 return (1); 1411 } 1412 1413 static int 1414 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1415 struct nvme_completion* compl) 1416 { 1417 uint64_t logoff; 1418 uint32_t logsize; 1419 uint8_t logpage; 1420 1421 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1422 1423 /* 1424 * Command specifies the number of dwords to return in fields NUMDU 1425 * and NUMDL. This is a zero-based value. 1426 */ 1427 logpage = command->cdw10 & 0xFF; 1428 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1429 logsize *= sizeof(uint32_t); 1430 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1431 1432 DPRINTF("%s log page %u offset %lu len %u", __func__, logpage, logoff, logsize); 1433 1434 switch (logpage) { 1435 case NVME_LOG_ERROR: 1436 if (logoff >= sizeof(sc->err_log)) { 1437 pci_nvme_status_genc(&compl->status, 1438 NVME_SC_INVALID_FIELD); 1439 break; 1440 } 1441 1442 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1443 command->prp2, (uint8_t *)&sc->err_log + logoff, 1444 MIN(logsize, sizeof(sc->err_log) - logoff), 1445 NVME_COPY_TO_PRP); 1446 break; 1447 case NVME_LOG_HEALTH_INFORMATION: 1448 if (logoff >= sizeof(sc->health_log)) { 1449 pci_nvme_status_genc(&compl->status, 1450 NVME_SC_INVALID_FIELD); 1451 break; 1452 } 1453 1454 pthread_mutex_lock(&sc->mtx); 1455 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1456 sizeof(sc->health_log.data_units_read)); 1457 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1458 sizeof(sc->health_log.data_units_written)); 1459 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1460 sizeof(sc->health_log.host_read_commands)); 1461 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1462 sizeof(sc->health_log.host_write_commands)); 1463 pthread_mutex_unlock(&sc->mtx); 1464 1465 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1466 command->prp2, (uint8_t *)&sc->health_log + logoff, 1467 MIN(logsize, sizeof(sc->health_log) - logoff), 1468 NVME_COPY_TO_PRP); 1469 break; 1470 case NVME_LOG_FIRMWARE_SLOT: 1471 if (logoff >= sizeof(sc->fw_log)) { 1472 pci_nvme_status_genc(&compl->status, 1473 NVME_SC_INVALID_FIELD); 1474 break; 1475 } 1476 1477 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1478 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1479 MIN(logsize, sizeof(sc->fw_log) - logoff), 1480 NVME_COPY_TO_PRP); 1481 break; 1482 case NVME_LOG_CHANGED_NAMESPACE: 1483 if (logoff >= sizeof(sc->ns_log)) { 1484 pci_nvme_status_genc(&compl->status, 1485 NVME_SC_INVALID_FIELD); 1486 break; 1487 } 1488 1489 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1490 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1491 MIN(logsize, sizeof(sc->ns_log) - logoff), 1492 NVME_COPY_TO_PRP); 1493 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1494 break; 1495 default: 1496 DPRINTF("%s get log page %x command not supported", 1497 __func__, logpage); 1498 1499 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1500 NVME_SC_INVALID_LOG_PAGE); 1501 } 1502 1503 return (1); 1504 } 1505 1506 static int 1507 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1508 struct nvme_completion* compl) 1509 { 1510 void *dest; 1511 uint16_t status; 1512 1513 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1514 command->cdw10 & 0xFF, command->nsid); 1515 1516 status = 0; 1517 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1518 1519 switch (command->cdw10 & 0xFF) { 1520 case 0x00: /* return Identify Namespace data structure */ 1521 /* Global NS only valid with NS Management */ 1522 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1523 pci_nvme_status_genc(&status, 1524 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1525 break; 1526 } 1527 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1528 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1529 NVME_COPY_TO_PRP); 1530 break; 1531 case 0x01: /* return Identify Controller data structure */ 1532 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1533 command->prp2, (uint8_t *)&sc->ctrldata, 1534 sizeof(sc->ctrldata), 1535 NVME_COPY_TO_PRP); 1536 break; 1537 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1538 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1539 sizeof(uint32_t) * 1024); 1540 /* All unused entries shall be zero */ 1541 memset(dest, 0, sizeof(uint32_t) * 1024); 1542 ((uint32_t *)dest)[0] = 1; 1543 break; 1544 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1545 if (command->nsid != 1) { 1546 pci_nvme_status_genc(&status, 1547 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1548 break; 1549 } 1550 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1551 sizeof(uint32_t) * 1024); 1552 /* All bytes after the descriptor shall be zero */ 1553 memset(dest, 0, sizeof(uint32_t) * 1024); 1554 1555 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1556 ((uint8_t *)dest)[0] = 1; 1557 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1558 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1559 break; 1560 case 0x13: 1561 /* 1562 * Controller list is optional but used by UNH tests. Return 1563 * a valid but empty list. 1564 */ 1565 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1566 sizeof(uint16_t) * 2048); 1567 memset(dest, 0, sizeof(uint16_t) * 2048); 1568 break; 1569 default: 1570 DPRINTF("%s unsupported identify command requested 0x%x", 1571 __func__, command->cdw10 & 0xFF); 1572 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1573 break; 1574 } 1575 1576 compl->status = status; 1577 return (1); 1578 } 1579 1580 static const char * 1581 nvme_fid_to_name(uint8_t fid) 1582 { 1583 const char *name; 1584 1585 switch (fid) { 1586 case NVME_FEAT_ARBITRATION: 1587 name = "Arbitration"; 1588 break; 1589 case NVME_FEAT_POWER_MANAGEMENT: 1590 name = "Power Management"; 1591 break; 1592 case NVME_FEAT_LBA_RANGE_TYPE: 1593 name = "LBA Range Type"; 1594 break; 1595 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1596 name = "Temperature Threshold"; 1597 break; 1598 case NVME_FEAT_ERROR_RECOVERY: 1599 name = "Error Recovery"; 1600 break; 1601 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1602 name = "Volatile Write Cache"; 1603 break; 1604 case NVME_FEAT_NUMBER_OF_QUEUES: 1605 name = "Number of Queues"; 1606 break; 1607 case NVME_FEAT_INTERRUPT_COALESCING: 1608 name = "Interrupt Coalescing"; 1609 break; 1610 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1611 name = "Interrupt Vector Configuration"; 1612 break; 1613 case NVME_FEAT_WRITE_ATOMICITY: 1614 name = "Write Atomicity Normal"; 1615 break; 1616 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1617 name = "Asynchronous Event Configuration"; 1618 break; 1619 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1620 name = "Autonomous Power State Transition"; 1621 break; 1622 case NVME_FEAT_HOST_MEMORY_BUFFER: 1623 name = "Host Memory Buffer"; 1624 break; 1625 case NVME_FEAT_TIMESTAMP: 1626 name = "Timestamp"; 1627 break; 1628 case NVME_FEAT_KEEP_ALIVE_TIMER: 1629 name = "Keep Alive Timer"; 1630 break; 1631 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1632 name = "Host Controlled Thermal Management"; 1633 break; 1634 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1635 name = "Non-Operation Power State Config"; 1636 break; 1637 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1638 name = "Read Recovery Level Config"; 1639 break; 1640 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1641 name = "Predictable Latency Mode Config"; 1642 break; 1643 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1644 name = "Predictable Latency Mode Window"; 1645 break; 1646 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1647 name = "LBA Status Information Report Interval"; 1648 break; 1649 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1650 name = "Host Behavior Support"; 1651 break; 1652 case NVME_FEAT_SANITIZE_CONFIG: 1653 name = "Sanitize Config"; 1654 break; 1655 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1656 name = "Endurance Group Event Configuration"; 1657 break; 1658 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1659 name = "Software Progress Marker"; 1660 break; 1661 case NVME_FEAT_HOST_IDENTIFIER: 1662 name = "Host Identifier"; 1663 break; 1664 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1665 name = "Reservation Notification Mask"; 1666 break; 1667 case NVME_FEAT_RESERVATION_PERSISTENCE: 1668 name = "Reservation Persistence"; 1669 break; 1670 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1671 name = "Namespace Write Protection Config"; 1672 break; 1673 default: 1674 name = "Unknown"; 1675 break; 1676 } 1677 1678 return (name); 1679 } 1680 1681 static void 1682 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1683 struct nvme_feature_obj *feat __unused, 1684 struct nvme_command *command __unused, 1685 struct nvme_completion *compl) 1686 { 1687 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1688 } 1689 1690 static void 1691 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1692 struct nvme_feature_obj *feat __unused, 1693 struct nvme_command *command, 1694 struct nvme_completion *compl) 1695 { 1696 uint32_t i; 1697 uint32_t cdw11 = command->cdw11; 1698 uint16_t iv; 1699 bool cd; 1700 1701 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1702 1703 iv = cdw11 & 0xffff; 1704 cd = cdw11 & (1 << 16); 1705 1706 if (iv > (sc->max_queues + 1)) { 1707 return; 1708 } 1709 1710 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1711 if ((iv == 0) && !cd) 1712 return; 1713 1714 /* Requested Interrupt Vector must be used by a CQ */ 1715 for (i = 0; i < sc->num_cqueues + 1; i++) { 1716 if (sc->compl_queues[i].intr_vec == iv) { 1717 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1718 } 1719 } 1720 } 1721 1722 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1723 static void 1724 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1725 struct nvme_feature_obj *feat __unused, 1726 struct nvme_command *command, 1727 struct nvme_completion *compl) 1728 { 1729 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1730 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1731 } 1732 1733 #define NVME_TEMP_THRESH_OVER 0 1734 #define NVME_TEMP_THRESH_UNDER 1 1735 static void 1736 nvme_feature_temperature(struct pci_nvme_softc *sc, 1737 struct nvme_feature_obj *feat __unused, 1738 struct nvme_command *command, 1739 struct nvme_completion *compl) 1740 { 1741 uint16_t tmpth; /* Temperature Threshold */ 1742 uint8_t tmpsel; /* Threshold Temperature Select */ 1743 uint8_t thsel; /* Threshold Type Select */ 1744 bool set_crit = false; 1745 bool report_crit; 1746 1747 tmpth = command->cdw11 & 0xffff; 1748 tmpsel = (command->cdw11 >> 16) & 0xf; 1749 thsel = (command->cdw11 >> 20) & 0x3; 1750 1751 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1752 1753 /* Check for unsupported values */ 1754 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1755 (thsel > NVME_TEMP_THRESH_UNDER)) { 1756 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1757 return; 1758 } 1759 1760 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1761 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1762 set_crit = true; 1763 1764 pthread_mutex_lock(&sc->mtx); 1765 if (set_crit) 1766 sc->health_log.critical_warning |= 1767 NVME_CRIT_WARN_ST_TEMPERATURE; 1768 else 1769 sc->health_log.critical_warning &= 1770 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1771 pthread_mutex_unlock(&sc->mtx); 1772 1773 report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & 1774 NVME_CRIT_WARN_ST_TEMPERATURE; 1775 1776 if (set_crit && report_crit) 1777 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1778 sc->health_log.critical_warning); 1779 1780 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); 1781 } 1782 1783 static void 1784 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1785 struct nvme_feature_obj *feat __unused, 1786 struct nvme_command *command, 1787 struct nvme_completion *compl) 1788 { 1789 uint16_t nqr; /* Number of Queues Requested */ 1790 1791 if (sc->num_q_is_set) { 1792 WPRINTF("%s: Number of Queues already set", __func__); 1793 pci_nvme_status_genc(&compl->status, 1794 NVME_SC_COMMAND_SEQUENCE_ERROR); 1795 return; 1796 } 1797 1798 nqr = command->cdw11 & 0xFFFF; 1799 if (nqr == 0xffff) { 1800 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1801 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1802 return; 1803 } 1804 1805 sc->num_squeues = ONE_BASED(nqr); 1806 if (sc->num_squeues > sc->max_queues) { 1807 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1808 sc->max_queues); 1809 sc->num_squeues = sc->max_queues; 1810 } 1811 1812 nqr = (command->cdw11 >> 16) & 0xFFFF; 1813 if (nqr == 0xffff) { 1814 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1815 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1816 return; 1817 } 1818 1819 sc->num_cqueues = ONE_BASED(nqr); 1820 if (sc->num_cqueues > sc->max_queues) { 1821 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1822 sc->max_queues); 1823 sc->num_cqueues = sc->max_queues; 1824 } 1825 1826 /* Patch the command value which will be saved on callback's return */ 1827 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1828 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1829 1830 sc->num_q_is_set = true; 1831 } 1832 1833 static int 1834 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1835 struct nvme_completion *compl) 1836 { 1837 struct nvme_feature_obj *feat; 1838 uint32_t nsid = command->nsid; 1839 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); 1840 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); 1841 1842 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1843 1844 if (fid >= NVME_FID_MAX) { 1845 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1846 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1847 return (1); 1848 } 1849 1850 if (sv) { 1851 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1852 NVME_SC_FEATURE_NOT_SAVEABLE); 1853 return (1); 1854 } 1855 1856 feat = &sc->feat[fid]; 1857 1858 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1859 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1860 return (1); 1861 } 1862 1863 if (!feat->namespace_specific && 1864 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1865 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1866 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1867 return (1); 1868 } 1869 1870 compl->cdw0 = 0; 1871 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1872 1873 if (feat->set) 1874 feat->set(sc, feat, command, compl); 1875 else { 1876 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1877 NVME_SC_FEATURE_NOT_CHANGEABLE); 1878 return (1); 1879 } 1880 1881 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1882 if (compl->status == NVME_SC_SUCCESS) { 1883 feat->cdw11 = command->cdw11; 1884 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1885 (command->cdw11 != 0)) 1886 pci_nvme_aen_notify(sc); 1887 } 1888 1889 return (0); 1890 } 1891 1892 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1893 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1894 1895 static int 1896 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1897 struct nvme_completion* compl) 1898 { 1899 struct nvme_feature_obj *feat; 1900 uint8_t fid = command->cdw10 & 0xFF; 1901 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1902 1903 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1904 1905 if (fid >= NVME_FID_MAX) { 1906 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1907 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1908 return (1); 1909 } 1910 1911 compl->cdw0 = 0; 1912 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1913 1914 feat = &sc->feat[fid]; 1915 if (feat->get) { 1916 feat->get(sc, feat, command, compl); 1917 } 1918 1919 if (compl->status == NVME_SC_SUCCESS) { 1920 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1921 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1922 else 1923 compl->cdw0 = feat->cdw11; 1924 } 1925 1926 return (0); 1927 } 1928 1929 static int 1930 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1931 struct nvme_completion* compl) 1932 { 1933 uint8_t ses, lbaf, pi; 1934 1935 /* Only supports Secure Erase Setting - User Data Erase */ 1936 ses = (command->cdw10 >> 9) & 0x7; 1937 if (ses > 0x1) { 1938 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1939 return (1); 1940 } 1941 1942 /* Only supports a single LBA Format */ 1943 lbaf = command->cdw10 & 0xf; 1944 if (lbaf != 0) { 1945 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1946 NVME_SC_INVALID_FORMAT); 1947 return (1); 1948 } 1949 1950 /* Doesn't support Protection Information */ 1951 pi = (command->cdw10 >> 5) & 0x7; 1952 if (pi != 0) { 1953 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1954 return (1); 1955 } 1956 1957 if (sc->nvstore.type == NVME_STOR_RAM) { 1958 if (sc->nvstore.ctx) 1959 free(sc->nvstore.ctx); 1960 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1961 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1962 } else { 1963 struct pci_nvme_ioreq *req; 1964 int err; 1965 1966 req = pci_nvme_get_ioreq(sc); 1967 if (req == NULL) { 1968 pci_nvme_status_genc(&compl->status, 1969 NVME_SC_INTERNAL_DEVICE_ERROR); 1970 WPRINTF("%s: unable to allocate IO req", __func__); 1971 return (1); 1972 } 1973 req->nvme_sq = &sc->submit_queues[0]; 1974 req->sqid = 0; 1975 req->opc = command->opc; 1976 req->cid = command->cid; 1977 req->nsid = command->nsid; 1978 1979 req->io_req.br_offset = 0; 1980 req->io_req.br_resid = sc->nvstore.size; 1981 req->io_req.br_callback = pci_nvme_io_done; 1982 1983 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1984 if (err) { 1985 pci_nvme_status_genc(&compl->status, 1986 NVME_SC_INTERNAL_DEVICE_ERROR); 1987 pci_nvme_release_ioreq(sc, req); 1988 } else 1989 compl->status = NVME_NO_STATUS; 1990 } 1991 1992 return (1); 1993 } 1994 1995 static int 1996 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 1997 struct nvme_completion *compl) 1998 { 1999 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 2000 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 2001 2002 /* TODO: search for the command ID and abort it */ 2003 2004 compl->cdw0 = 1; 2005 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2006 return (1); 2007 } 2008 2009 static int 2010 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2011 struct nvme_command* command, struct nvme_completion* compl) 2012 { 2013 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2014 sc->aer_count, sc->ctrldata.aerl, command->cid); 2015 2016 /* Don't exceed the Async Event Request Limit (AERL). */ 2017 if (pci_nvme_aer_limit_reached(sc)) { 2018 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2019 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2020 return (1); 2021 } 2022 2023 if (pci_nvme_aer_add(sc, command->cid)) { 2024 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2025 NVME_SC_INTERNAL_DEVICE_ERROR); 2026 return (1); 2027 } 2028 2029 /* 2030 * Raise events when they happen based on the Set Features cmd. 2031 * These events happen async, so only set completion successful if 2032 * there is an event reflective of the request to get event. 2033 */ 2034 compl->status = NVME_NO_STATUS; 2035 pci_nvme_aen_notify(sc); 2036 2037 return (0); 2038 } 2039 2040 static void 2041 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2042 { 2043 struct nvme_completion compl; 2044 struct nvme_command *cmd; 2045 struct nvme_submission_queue *sq; 2046 struct nvme_completion_queue *cq; 2047 uint16_t sqhead; 2048 2049 DPRINTF("%s index %u", __func__, (uint32_t)value); 2050 2051 sq = &sc->submit_queues[0]; 2052 cq = &sc->compl_queues[0]; 2053 2054 pthread_mutex_lock(&sq->mtx); 2055 2056 sqhead = sq->head; 2057 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2058 2059 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2060 cmd = &(sq->qbase)[sqhead]; 2061 compl.cdw0 = 0; 2062 compl.status = 0; 2063 2064 switch (cmd->opc) { 2065 case NVME_OPC_DELETE_IO_SQ: 2066 DPRINTF("%s command DELETE_IO_SQ", __func__); 2067 nvme_opc_delete_io_sq(sc, cmd, &compl); 2068 break; 2069 case NVME_OPC_CREATE_IO_SQ: 2070 DPRINTF("%s command CREATE_IO_SQ", __func__); 2071 nvme_opc_create_io_sq(sc, cmd, &compl); 2072 break; 2073 case NVME_OPC_DELETE_IO_CQ: 2074 DPRINTF("%s command DELETE_IO_CQ", __func__); 2075 nvme_opc_delete_io_cq(sc, cmd, &compl); 2076 break; 2077 case NVME_OPC_CREATE_IO_CQ: 2078 DPRINTF("%s command CREATE_IO_CQ", __func__); 2079 nvme_opc_create_io_cq(sc, cmd, &compl); 2080 break; 2081 case NVME_OPC_GET_LOG_PAGE: 2082 DPRINTF("%s command GET_LOG_PAGE", __func__); 2083 nvme_opc_get_log_page(sc, cmd, &compl); 2084 break; 2085 case NVME_OPC_IDENTIFY: 2086 DPRINTF("%s command IDENTIFY", __func__); 2087 nvme_opc_identify(sc, cmd, &compl); 2088 break; 2089 case NVME_OPC_ABORT: 2090 DPRINTF("%s command ABORT", __func__); 2091 nvme_opc_abort(sc, cmd, &compl); 2092 break; 2093 case NVME_OPC_SET_FEATURES: 2094 DPRINTF("%s command SET_FEATURES", __func__); 2095 nvme_opc_set_features(sc, cmd, &compl); 2096 break; 2097 case NVME_OPC_GET_FEATURES: 2098 DPRINTF("%s command GET_FEATURES", __func__); 2099 nvme_opc_get_features(sc, cmd, &compl); 2100 break; 2101 case NVME_OPC_FIRMWARE_ACTIVATE: 2102 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2103 pci_nvme_status_tc(&compl.status, 2104 NVME_SCT_COMMAND_SPECIFIC, 2105 NVME_SC_INVALID_FIRMWARE_SLOT); 2106 break; 2107 case NVME_OPC_ASYNC_EVENT_REQUEST: 2108 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2109 nvme_opc_async_event_req(sc, cmd, &compl); 2110 break; 2111 case NVME_OPC_FORMAT_NVM: 2112 DPRINTF("%s command FORMAT_NVM", __func__); 2113 if (NVMEV(NVME_CTRLR_DATA_OACS_FORMAT, 2114 sc->ctrldata.oacs) == 0) { 2115 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2116 break; 2117 } 2118 nvme_opc_format_nvm(sc, cmd, &compl); 2119 break; 2120 case NVME_OPC_SECURITY_SEND: 2121 case NVME_OPC_SECURITY_RECEIVE: 2122 case NVME_OPC_SANITIZE: 2123 case NVME_OPC_GET_LBA_STATUS: 2124 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2125 cmd->opc); 2126 /* Valid but unsupported opcodes */ 2127 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2128 break; 2129 default: 2130 DPRINTF("%s command OPC=%#X (not implemented)", 2131 __func__, 2132 cmd->opc); 2133 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2134 } 2135 sqhead = (sqhead + 1) % sq->size; 2136 2137 if (NVME_COMPLETION_VALID(compl)) { 2138 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2139 compl.cdw0, 2140 cmd->cid, 2141 0, /* SQID */ 2142 compl.status); 2143 } 2144 } 2145 2146 DPRINTF("setting sqhead %u", sqhead); 2147 sq->head = sqhead; 2148 2149 if (cq->head != cq->tail) 2150 pci_generate_msix(sc->nsc_pi, 0); 2151 2152 pthread_mutex_unlock(&sq->mtx); 2153 } 2154 2155 /* 2156 * Update the Write and Read statistics reported in SMART data 2157 * 2158 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2159 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2160 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2161 */ 2162 static void 2163 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2164 size_t bytes, uint16_t status) 2165 { 2166 2167 pthread_mutex_lock(&sc->mtx); 2168 switch (opc) { 2169 case NVME_OPC_WRITE: 2170 sc->write_commands++; 2171 if (status != NVME_SC_SUCCESS) 2172 break; 2173 sc->write_dunits_remainder += (bytes / 512); 2174 while (sc->write_dunits_remainder >= 1000) { 2175 sc->write_data_units++; 2176 sc->write_dunits_remainder -= 1000; 2177 } 2178 break; 2179 case NVME_OPC_READ: 2180 sc->read_commands++; 2181 if (status != NVME_SC_SUCCESS) 2182 break; 2183 sc->read_dunits_remainder += (bytes / 512); 2184 while (sc->read_dunits_remainder >= 1000) { 2185 sc->read_data_units++; 2186 sc->read_dunits_remainder -= 1000; 2187 } 2188 break; 2189 default: 2190 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2191 break; 2192 } 2193 pthread_mutex_unlock(&sc->mtx); 2194 } 2195 2196 /* 2197 * Check if the combination of Starting LBA (slba) and number of blocks 2198 * exceeds the range of the underlying storage. 2199 * 2200 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2201 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2202 * overflow. 2203 */ 2204 static bool 2205 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2206 uint32_t nblocks) 2207 { 2208 size_t offset, bytes; 2209 2210 /* Overflow check of multiplying Starting LBA by the sector size */ 2211 if (slba >> (64 - nvstore->sectsz_bits)) 2212 return (true); 2213 2214 offset = slba << nvstore->sectsz_bits; 2215 bytes = nblocks << nvstore->sectsz_bits; 2216 2217 /* Overflow check of Number of Logical Blocks */ 2218 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2219 return (true); 2220 2221 return (false); 2222 } 2223 2224 static int 2225 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2226 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2227 { 2228 int iovidx; 2229 bool range_is_contiguous; 2230 2231 if (req == NULL) 2232 return (-1); 2233 2234 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2235 return (-1); 2236 } 2237 2238 /* 2239 * Minimize the number of IOVs by concatenating contiguous address 2240 * ranges. If the IOV count is zero, there is no previous range to 2241 * concatenate. 2242 */ 2243 if (req->io_req.br_iovcnt == 0) 2244 range_is_contiguous = false; 2245 else 2246 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2247 2248 if (range_is_contiguous) { 2249 iovidx = req->io_req.br_iovcnt - 1; 2250 2251 req->io_req.br_iov[iovidx].iov_base = 2252 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2253 req->prev_gpaddr, size); 2254 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2255 return (-1); 2256 2257 req->prev_size += size; 2258 req->io_req.br_resid += size; 2259 2260 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2261 } else { 2262 iovidx = req->io_req.br_iovcnt; 2263 if (iovidx == 0) { 2264 req->io_req.br_offset = offset; 2265 req->io_req.br_resid = 0; 2266 req->io_req.br_param = req; 2267 } 2268 2269 req->io_req.br_iov[iovidx].iov_base = 2270 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2271 gpaddr, size); 2272 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2273 return (-1); 2274 2275 req->io_req.br_iov[iovidx].iov_len = size; 2276 2277 req->prev_gpaddr = gpaddr; 2278 req->prev_size = size; 2279 req->io_req.br_resid += size; 2280 2281 req->io_req.br_iovcnt++; 2282 } 2283 2284 return (0); 2285 } 2286 2287 static void 2288 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2289 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2290 { 2291 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2292 2293 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2294 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2295 NVME_STATUS_GET_SC(status)); 2296 2297 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2298 2299 if (cq->head != cq->tail) { 2300 if (cq->intr_en & NVME_CQ_INTEN) { 2301 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2302 } else { 2303 DPRINTF("%s: CQ%u interrupt disabled", 2304 __func__, sq->cqid); 2305 } 2306 } 2307 } 2308 2309 static void 2310 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2311 { 2312 req->sc = NULL; 2313 req->nvme_sq = NULL; 2314 req->sqid = 0; 2315 2316 pthread_mutex_lock(&sc->mtx); 2317 2318 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2319 sc->pending_ios--; 2320 2321 /* when no more IO pending, can set to ready if device reset/enabled */ 2322 if (sc->pending_ios == 0 && 2323 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2324 sc->regs.csts |= NVME_CSTS_RDY; 2325 2326 pthread_mutex_unlock(&sc->mtx); 2327 2328 sem_post(&sc->iosemlock); 2329 } 2330 2331 static struct pci_nvme_ioreq * 2332 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2333 { 2334 struct pci_nvme_ioreq *req = NULL; 2335 2336 sem_wait(&sc->iosemlock); 2337 pthread_mutex_lock(&sc->mtx); 2338 2339 req = STAILQ_FIRST(&sc->ioreqs_free); 2340 assert(req != NULL); 2341 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2342 2343 req->sc = sc; 2344 2345 sc->pending_ios++; 2346 2347 pthread_mutex_unlock(&sc->mtx); 2348 2349 req->io_req.br_iovcnt = 0; 2350 req->io_req.br_offset = 0; 2351 req->io_req.br_resid = 0; 2352 req->io_req.br_param = req; 2353 req->prev_gpaddr = 0; 2354 req->prev_size = 0; 2355 2356 return req; 2357 } 2358 2359 static void 2360 pci_nvme_io_done(struct blockif_req *br, int err) 2361 { 2362 struct pci_nvme_ioreq *req = br->br_param; 2363 struct nvme_submission_queue *sq = req->nvme_sq; 2364 uint16_t code, status; 2365 2366 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2367 2368 /* TODO return correct error */ 2369 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2370 status = 0; 2371 pci_nvme_status_genc(&status, code); 2372 2373 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2374 pci_nvme_stats_write_read_update(req->sc, req->opc, 2375 req->bytes, status); 2376 pci_nvme_release_ioreq(req->sc, req); 2377 } 2378 2379 /* 2380 * Implements the Flush command. The specification states: 2381 * If a volatile write cache is not present, Flush commands complete 2382 * successfully and have no effect 2383 * in the description of the Volatile Write Cache (VWC) field of the Identify 2384 * Controller data. Therefore, set status to Success if the command is 2385 * not supported (i.e. RAM or as indicated by the blockif). 2386 */ 2387 static bool 2388 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2389 struct nvme_command *cmd __unused, 2390 struct pci_nvme_blockstore *nvstore, 2391 struct pci_nvme_ioreq *req, 2392 uint16_t *status) 2393 { 2394 bool pending = false; 2395 2396 if (nvstore->type == NVME_STOR_RAM) { 2397 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2398 } else { 2399 int err; 2400 2401 req->io_req.br_callback = pci_nvme_io_done; 2402 2403 err = blockif_flush(nvstore->ctx, &req->io_req); 2404 switch (err) { 2405 case 0: 2406 pending = true; 2407 break; 2408 case EOPNOTSUPP: 2409 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2410 break; 2411 default: 2412 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2413 } 2414 } 2415 2416 return (pending); 2417 } 2418 2419 static uint16_t 2420 nvme_write_read_ram(struct pci_nvme_softc *sc, 2421 struct pci_nvme_blockstore *nvstore, 2422 uint64_t prp1, uint64_t prp2, 2423 size_t offset, uint64_t bytes, 2424 bool is_write) 2425 { 2426 uint8_t *buf = nvstore->ctx; 2427 enum nvme_copy_dir dir; 2428 uint16_t status; 2429 2430 if (is_write) 2431 dir = NVME_COPY_TO_PRP; 2432 else 2433 dir = NVME_COPY_FROM_PRP; 2434 2435 status = 0; 2436 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2437 buf + offset, bytes, dir)) 2438 pci_nvme_status_genc(&status, 2439 NVME_SC_DATA_TRANSFER_ERROR); 2440 else 2441 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2442 2443 return (status); 2444 } 2445 2446 static uint16_t 2447 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2448 struct pci_nvme_blockstore *nvstore, 2449 struct pci_nvme_ioreq *req, 2450 uint64_t prp1, uint64_t prp2, 2451 size_t offset, uint64_t bytes, 2452 bool is_write) 2453 { 2454 uint64_t size; 2455 int err; 2456 uint16_t status = NVME_NO_STATUS; 2457 2458 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2459 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2460 err = -1; 2461 goto out; 2462 } 2463 2464 offset += size; 2465 bytes -= size; 2466 2467 if (bytes == 0) { 2468 ; 2469 } else if (bytes <= PAGE_SIZE) { 2470 size = bytes; 2471 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2472 err = -1; 2473 goto out; 2474 } 2475 } else { 2476 void *vmctx = sc->nsc_pi->pi_vmctx; 2477 uint64_t *prp_list = &prp2; 2478 uint64_t *last = prp_list; 2479 2480 /* PRP2 is pointer to a physical region page list */ 2481 while (bytes) { 2482 /* Last entry in list points to the next list */ 2483 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2484 uint64_t prp = *prp_list; 2485 2486 prp_list = paddr_guest2host(vmctx, prp, 2487 PAGE_SIZE - (prp % PAGE_SIZE)); 2488 if (prp_list == NULL) { 2489 err = -1; 2490 goto out; 2491 } 2492 last = prp_list + (NVME_PRP2_ITEMS - 1); 2493 } 2494 2495 size = MIN(bytes, PAGE_SIZE); 2496 2497 if (pci_nvme_append_iov_req(sc, req, *prp_list, size, 2498 offset)) { 2499 err = -1; 2500 goto out; 2501 } 2502 2503 offset += size; 2504 bytes -= size; 2505 2506 prp_list++; 2507 } 2508 } 2509 req->io_req.br_callback = pci_nvme_io_done; 2510 if (is_write) 2511 err = blockif_write(nvstore->ctx, &req->io_req); 2512 else 2513 err = blockif_read(nvstore->ctx, &req->io_req); 2514 out: 2515 if (err) 2516 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2517 2518 return (status); 2519 } 2520 2521 static bool 2522 nvme_opc_write_read(struct pci_nvme_softc *sc, 2523 struct nvme_command *cmd, 2524 struct pci_nvme_blockstore *nvstore, 2525 struct pci_nvme_ioreq *req, 2526 uint16_t *status) 2527 { 2528 uint64_t lba, nblocks, bytes; 2529 size_t offset; 2530 bool is_write = cmd->opc == NVME_OPC_WRITE; 2531 bool pending = false; 2532 2533 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2534 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2535 bytes = nblocks << nvstore->sectsz_bits; 2536 if (bytes > NVME_MAX_DATA_SIZE) { 2537 WPRINTF("%s command would exceed MDTS", __func__); 2538 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2539 goto out; 2540 } 2541 2542 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2543 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2544 __func__, lba, nblocks); 2545 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2546 goto out; 2547 } 2548 2549 offset = lba << nvstore->sectsz_bits; 2550 2551 req->bytes = bytes; 2552 req->io_req.br_offset = lba; 2553 2554 /* PRP bits 1:0 must be zero */ 2555 cmd->prp1 &= ~0x3UL; 2556 cmd->prp2 &= ~0x3UL; 2557 2558 if (nvstore->type == NVME_STOR_RAM) { 2559 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2560 cmd->prp2, offset, bytes, is_write); 2561 } else { 2562 *status = nvme_write_read_blockif(sc, nvstore, req, 2563 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2564 2565 if (*status == NVME_NO_STATUS) 2566 pending = true; 2567 } 2568 out: 2569 if (!pending) 2570 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2571 2572 return (pending); 2573 } 2574 2575 static void 2576 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2577 { 2578 struct pci_nvme_ioreq *req = br->br_param; 2579 struct pci_nvme_softc *sc = req->sc; 2580 bool done = true; 2581 uint16_t status; 2582 2583 status = 0; 2584 if (err) { 2585 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2586 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2587 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2588 } else { 2589 struct iovec *iov = req->io_req.br_iov; 2590 2591 req->prev_gpaddr++; 2592 iov += req->prev_gpaddr; 2593 2594 /* The iov_* values already include the sector size */ 2595 req->io_req.br_offset = (off_t)iov->iov_base; 2596 req->io_req.br_resid = iov->iov_len; 2597 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2598 pci_nvme_status_genc(&status, 2599 NVME_SC_INTERNAL_DEVICE_ERROR); 2600 } else 2601 done = false; 2602 } 2603 2604 if (done) { 2605 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2606 status); 2607 pci_nvme_release_ioreq(sc, req); 2608 } 2609 } 2610 2611 static bool 2612 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2613 struct nvme_command *cmd, 2614 struct pci_nvme_blockstore *nvstore, 2615 struct pci_nvme_ioreq *req, 2616 uint16_t *status) 2617 { 2618 struct nvme_dsm_range *range = NULL; 2619 uint32_t nr, r, non_zero, dr; 2620 int err; 2621 bool pending = false; 2622 2623 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2624 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2625 goto out; 2626 } 2627 2628 nr = cmd->cdw10 & 0xff; 2629 2630 /* copy locally because a range entry could straddle PRPs */ 2631 #ifdef __FreeBSD__ 2632 range = calloc(1, NVME_MAX_DSM_TRIM); 2633 #else 2634 _Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0, 2635 "NVME_MAX_DSM_TRIM is not a multiple of struct size"); 2636 range = calloc(NVME_MAX_DSM_TRIM / sizeof (*range), sizeof (*range)); 2637 #endif 2638 if (range == NULL) { 2639 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2640 goto out; 2641 } 2642 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2643 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2644 2645 /* Check for invalid ranges and the number of non-zero lengths */ 2646 non_zero = 0; 2647 for (r = 0; r <= nr; r++) { 2648 if (pci_nvme_out_of_range(nvstore, 2649 range[r].starting_lba, range[r].length)) { 2650 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2651 goto out; 2652 } 2653 if (range[r].length != 0) 2654 non_zero++; 2655 } 2656 2657 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2658 size_t offset, bytes; 2659 int sectsz_bits = sc->nvstore.sectsz_bits; 2660 2661 /* 2662 * DSM calls are advisory only, and compliant controllers 2663 * may choose to take no actions (i.e. return Success). 2664 */ 2665 if (!nvstore->deallocate) { 2666 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2667 goto out; 2668 } 2669 2670 /* If all ranges have a zero length, return Success */ 2671 if (non_zero == 0) { 2672 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2673 goto out; 2674 } 2675 2676 if (req == NULL) { 2677 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2678 goto out; 2679 } 2680 2681 offset = range[0].starting_lba << sectsz_bits; 2682 bytes = range[0].length << sectsz_bits; 2683 2684 /* 2685 * If the request is for more than a single range, store 2686 * the ranges in the br_iov. Optimize for the common case 2687 * of a single range. 2688 * 2689 * Note that NVMe Number of Ranges is a zero based value 2690 */ 2691 req->io_req.br_iovcnt = 0; 2692 req->io_req.br_offset = offset; 2693 req->io_req.br_resid = bytes; 2694 2695 if (nr == 0) { 2696 req->io_req.br_callback = pci_nvme_io_done; 2697 } else { 2698 struct iovec *iov = req->io_req.br_iov; 2699 2700 for (r = 0, dr = 0; r <= nr; r++) { 2701 offset = range[r].starting_lba << sectsz_bits; 2702 bytes = range[r].length << sectsz_bits; 2703 if (bytes == 0) 2704 continue; 2705 2706 if ((nvstore->size - offset) < bytes) { 2707 pci_nvme_status_genc(status, 2708 NVME_SC_LBA_OUT_OF_RANGE); 2709 goto out; 2710 } 2711 iov[dr].iov_base = (void *)offset; 2712 iov[dr].iov_len = bytes; 2713 dr++; 2714 } 2715 req->io_req.br_callback = pci_nvme_dealloc_sm; 2716 2717 /* 2718 * Use prev_gpaddr to track the current entry and 2719 * prev_size to track the number of entries 2720 */ 2721 req->prev_gpaddr = 0; 2722 req->prev_size = dr; 2723 } 2724 2725 err = blockif_delete(nvstore->ctx, &req->io_req); 2726 if (err) 2727 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2728 else 2729 pending = true; 2730 } 2731 out: 2732 free(range); 2733 return (pending); 2734 } 2735 2736 static void 2737 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2738 { 2739 struct nvme_submission_queue *sq; 2740 uint16_t status; 2741 uint16_t sqhead; 2742 2743 /* handle all submissions up to sq->tail index */ 2744 sq = &sc->submit_queues[idx]; 2745 2746 pthread_mutex_lock(&sq->mtx); 2747 2748 sqhead = sq->head; 2749 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2750 idx, sqhead, sq->tail, sq->qbase); 2751 2752 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2753 struct nvme_command *cmd; 2754 struct pci_nvme_ioreq *req; 2755 uint32_t nsid; 2756 bool pending; 2757 2758 pending = false; 2759 req = NULL; 2760 status = 0; 2761 2762 cmd = &sq->qbase[sqhead]; 2763 sqhead = (sqhead + 1) % sq->size; 2764 2765 nsid = le32toh(cmd->nsid); 2766 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2767 pci_nvme_status_genc(&status, 2768 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2769 status |= NVMEM(NVME_STATUS_DNR); 2770 goto complete; 2771 } 2772 2773 req = pci_nvme_get_ioreq(sc); 2774 if (req == NULL) { 2775 pci_nvme_status_genc(&status, 2776 NVME_SC_INTERNAL_DEVICE_ERROR); 2777 WPRINTF("%s: unable to allocate IO req", __func__); 2778 goto complete; 2779 } 2780 req->nvme_sq = sq; 2781 req->sqid = idx; 2782 req->opc = cmd->opc; 2783 req->cid = cmd->cid; 2784 req->nsid = cmd->nsid; 2785 2786 switch (cmd->opc) { 2787 case NVME_OPC_FLUSH: 2788 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2789 req, &status); 2790 break; 2791 case NVME_OPC_WRITE: 2792 case NVME_OPC_READ: 2793 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2794 req, &status); 2795 break; 2796 case NVME_OPC_WRITE_ZEROES: 2797 /* TODO: write zeroes 2798 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2799 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2800 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2801 break; 2802 case NVME_OPC_DATASET_MANAGEMENT: 2803 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2804 req, &status); 2805 break; 2806 default: 2807 WPRINTF("%s unhandled io command 0x%x", 2808 __func__, cmd->opc); 2809 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2810 } 2811 complete: 2812 if (!pending) { 2813 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2814 if (req != NULL) 2815 pci_nvme_release_ioreq(sc, req); 2816 } 2817 } 2818 2819 sq->head = sqhead; 2820 2821 pthread_mutex_unlock(&sq->mtx); 2822 } 2823 2824 /* 2825 * Check for invalid doorbell write values 2826 * See NVM Express Base Specification, revision 2.0 2827 * "Asynchronous Event Information - Error Status" for details 2828 */ 2829 static bool 2830 pci_nvme_sq_doorbell_valid(struct nvme_submission_queue *sq, uint64_t value) 2831 { 2832 uint64_t capacity; 2833 2834 /* 2835 * Queue empty : head == tail 2836 * Queue full : head is one more than tail accounting for wrap 2837 * Therefore, can never have more than (size - 1) entries 2838 */ 2839 if (sq->head == sq->tail) 2840 capacity = sq->size - 1; 2841 else if (sq->head > sq->tail) 2842 capacity = sq->size - (sq->head - sq->tail) - 1; 2843 else 2844 capacity = sq->tail - sq->head - 1; 2845 2846 if ((value == sq->tail) || /* same as previous */ 2847 (value > capacity)) { /* exceeds queue capacity */ 2848 EPRINTLN("%s: SQ size=%u head=%u tail=%u capacity=%lu value=%lu", 2849 __func__, sq->size, sq->head, sq->tail, capacity, value); 2850 return false; 2851 } 2852 2853 return true; 2854 } 2855 2856 static void 2857 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2858 uint64_t idx, int is_sq, uint64_t value) 2859 { 2860 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2861 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2862 2863 if (is_sq) { 2864 if (idx > sc->num_squeues) { 2865 WPRINTF("%s queue index %lu overflow from " 2866 "guest (max %u)", 2867 __func__, idx, sc->num_squeues); 2868 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2869 PCI_NVME_AEI_ERROR_INVALID_DB); 2870 return; 2871 } 2872 2873 if (sc->submit_queues[idx].qbase == NULL) { 2874 WPRINTF("%s write to SQ %lu before created", __func__, 2875 idx); 2876 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2877 PCI_NVME_AEI_ERROR_INVALID_DB); 2878 return; 2879 } 2880 2881 if (!pci_nvme_sq_doorbell_valid(&sc->submit_queues[idx], value)) { 2882 EPRINTLN("%s write to SQ %lu of %lu invalid", __func__, 2883 idx, value); 2884 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2885 PCI_NVME_AEI_ERROR_INVALID_DB_VALUE); 2886 return; 2887 } 2888 2889 atomic_store_short(&sc->submit_queues[idx].tail, 2890 (uint16_t)value); 2891 2892 if (idx == 0) 2893 pci_nvme_handle_admin_cmd(sc, value); 2894 else { 2895 /* submission queue; handle new entries in SQ */ 2896 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2897 } 2898 } else { 2899 if (idx > sc->num_cqueues) { 2900 WPRINTF("%s queue index %lu overflow from " 2901 "guest (max %u)", 2902 __func__, idx, sc->num_cqueues); 2903 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2904 PCI_NVME_AEI_ERROR_INVALID_DB); 2905 return; 2906 } 2907 2908 if (sc->compl_queues[idx].qbase == NULL) { 2909 WPRINTF("%s write to CQ %lu before created", __func__, 2910 idx); 2911 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2912 PCI_NVME_AEI_ERROR_INVALID_DB); 2913 return; 2914 } 2915 2916 atomic_store_short(&sc->compl_queues[idx].head, 2917 (uint16_t)value); 2918 } 2919 } 2920 2921 static void 2922 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2923 { 2924 const char *s = iswrite ? "WRITE" : "READ"; 2925 2926 switch (offset) { 2927 case NVME_CR_CAP_LOW: 2928 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2929 break; 2930 case NVME_CR_CAP_HI: 2931 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2932 break; 2933 case NVME_CR_VS: 2934 DPRINTF("%s %s NVME_CR_VS", func, s); 2935 break; 2936 case NVME_CR_INTMS: 2937 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2938 break; 2939 case NVME_CR_INTMC: 2940 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2941 break; 2942 case NVME_CR_CC: 2943 DPRINTF("%s %s NVME_CR_CC", func, s); 2944 break; 2945 case NVME_CR_CSTS: 2946 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2947 break; 2948 case NVME_CR_NSSR: 2949 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2950 break; 2951 case NVME_CR_AQA: 2952 DPRINTF("%s %s NVME_CR_AQA", func, s); 2953 break; 2954 case NVME_CR_ASQ_LOW: 2955 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2956 break; 2957 case NVME_CR_ASQ_HI: 2958 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2959 break; 2960 case NVME_CR_ACQ_LOW: 2961 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2962 break; 2963 case NVME_CR_ACQ_HI: 2964 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2965 break; 2966 default: 2967 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2968 } 2969 2970 } 2971 2972 static void 2973 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2974 uint64_t value) 2975 { 2976 uint32_t ccreg; 2977 2978 if (offset >= NVME_DOORBELL_OFFSET) { 2979 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2980 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2981 int is_sq = (belloffset % 8) < 4; 2982 2983 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2984 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2985 offset); 2986 return; 2987 } 2988 2989 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2990 WPRINTF("guest attempted an overflow write offset " 2991 "0x%lx, val 0x%lx in %s", 2992 offset, value, __func__); 2993 return; 2994 } 2995 2996 if (is_sq) { 2997 if (sc->submit_queues[idx].qbase == NULL) 2998 return; 2999 } else if (sc->compl_queues[idx].qbase == NULL) 3000 return; 3001 3002 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 3003 return; 3004 } 3005 3006 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 3007 offset, size, value); 3008 3009 if (size != 4) { 3010 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 3011 "val 0x%lx) to bar0 in %s", 3012 size, offset, value, __func__); 3013 /* TODO: shutdown device */ 3014 return; 3015 } 3016 3017 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 3018 3019 pthread_mutex_lock(&sc->mtx); 3020 3021 switch (offset) { 3022 case NVME_CR_CAP_LOW: 3023 case NVME_CR_CAP_HI: 3024 /* readonly */ 3025 break; 3026 case NVME_CR_VS: 3027 /* readonly */ 3028 break; 3029 case NVME_CR_INTMS: 3030 /* MSI-X, so ignore */ 3031 break; 3032 case NVME_CR_INTMC: 3033 /* MSI-X, so ignore */ 3034 break; 3035 case NVME_CR_CC: 3036 ccreg = (uint32_t)value; 3037 3038 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 3039 "iocqes %u", 3040 __func__, 3041 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 3042 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 3043 NVME_CC_GET_IOCQES(ccreg)); 3044 3045 if (NVME_CC_GET_SHN(ccreg)) { 3046 /* perform shutdown - flush out data to backend */ 3047 sc->regs.csts &= ~NVMEM(NVME_CSTS_REG_SHST); 3048 sc->regs.csts |= NVMEF(NVME_CSTS_REG_SHST, 3049 NVME_SHST_COMPLETE); 3050 } 3051 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3052 if (NVME_CC_GET_EN(ccreg) == 0) 3053 /* transition 1-> causes controller reset */ 3054 pci_nvme_reset_locked(sc); 3055 else 3056 pci_nvme_init_controller(sc); 3057 } 3058 3059 /* Insert the iocqes, iosqes and en bits from the write */ 3060 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3061 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3062 if (NVME_CC_GET_EN(ccreg) == 0) { 3063 /* Insert the ams, mps and css bit fields */ 3064 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3065 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3066 sc->regs.csts &= ~NVME_CSTS_RDY; 3067 } else if ((sc->pending_ios == 0) && 3068 !(sc->regs.csts & NVME_CSTS_CFS)) { 3069 sc->regs.csts |= NVME_CSTS_RDY; 3070 } 3071 break; 3072 case NVME_CR_CSTS: 3073 break; 3074 case NVME_CR_NSSR: 3075 /* ignore writes; don't support subsystem reset */ 3076 break; 3077 case NVME_CR_AQA: 3078 sc->regs.aqa = (uint32_t)value; 3079 break; 3080 case NVME_CR_ASQ_LOW: 3081 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3082 (0xFFFFF000 & value); 3083 break; 3084 case NVME_CR_ASQ_HI: 3085 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3086 (value << 32); 3087 break; 3088 case NVME_CR_ACQ_LOW: 3089 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3090 (0xFFFFF000 & value); 3091 break; 3092 case NVME_CR_ACQ_HI: 3093 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3094 (value << 32); 3095 break; 3096 default: 3097 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3098 __func__, offset, value, size); 3099 } 3100 pthread_mutex_unlock(&sc->mtx); 3101 } 3102 3103 static void 3104 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3105 uint64_t value) 3106 { 3107 struct pci_nvme_softc* sc = pi->pi_arg; 3108 3109 if (baridx == pci_msix_table_bar(pi) || 3110 baridx == pci_msix_pba_bar(pi)) { 3111 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3112 " value 0x%lx", baridx, offset, size, value); 3113 3114 pci_emul_msix_twrite(pi, offset, size, value); 3115 return; 3116 } 3117 3118 switch (baridx) { 3119 case 0: 3120 pci_nvme_write_bar_0(sc, offset, size, value); 3121 break; 3122 3123 default: 3124 DPRINTF("%s unknown baridx %d, val 0x%lx", 3125 __func__, baridx, value); 3126 } 3127 } 3128 3129 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3130 uint64_t offset, int size) 3131 { 3132 uint64_t value; 3133 3134 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3135 3136 if (offset < NVME_DOORBELL_OFFSET) { 3137 void *p = &(sc->regs); 3138 pthread_mutex_lock(&sc->mtx); 3139 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3140 pthread_mutex_unlock(&sc->mtx); 3141 } else { 3142 value = 0; 3143 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3144 } 3145 3146 switch (size) { 3147 case 1: 3148 value &= 0xFF; 3149 break; 3150 case 2: 3151 value &= 0xFFFF; 3152 break; 3153 case 4: 3154 value &= 0xFFFFFFFF; 3155 break; 3156 } 3157 3158 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3159 offset, size, (uint32_t)value); 3160 3161 return (value); 3162 } 3163 3164 3165 3166 static uint64_t 3167 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3168 { 3169 struct pci_nvme_softc* sc = pi->pi_arg; 3170 3171 if (baridx == pci_msix_table_bar(pi) || 3172 baridx == pci_msix_pba_bar(pi)) { 3173 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3174 baridx, offset, size); 3175 3176 return pci_emul_msix_tread(pi, offset, size); 3177 } 3178 3179 switch (baridx) { 3180 case 0: 3181 return pci_nvme_read_bar_0(sc, offset, size); 3182 3183 default: 3184 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3185 } 3186 3187 return (0); 3188 } 3189 3190 static int 3191 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3192 { 3193 char bident[sizeof("XXX:XXX")]; 3194 const char *value; 3195 uint32_t sectsz; 3196 3197 sc->max_queues = NVME_QUEUES; 3198 sc->max_qentries = NVME_MAX_QENTRIES; 3199 sc->ioslots = NVME_IOSLOTS; 3200 sc->num_squeues = sc->max_queues; 3201 sc->num_cqueues = sc->max_queues; 3202 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3203 sectsz = 0; 3204 #ifdef __FreeBSD__ 3205 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3206 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3207 #else 3208 snprintf((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3209 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3210 #endif 3211 3212 value = get_config_value_node(nvl, "maxq"); 3213 if (value != NULL) 3214 sc->max_queues = atoi(value); 3215 value = get_config_value_node(nvl, "qsz"); 3216 if (value != NULL) { 3217 sc->max_qentries = atoi(value); 3218 if (sc->max_qentries <= 0) { 3219 EPRINTLN("nvme: Invalid qsz option %d", 3220 sc->max_qentries); 3221 return (-1); 3222 } 3223 } 3224 value = get_config_value_node(nvl, "ioslots"); 3225 if (value != NULL) { 3226 sc->ioslots = atoi(value); 3227 if (sc->ioslots <= 0) { 3228 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3229 return (-1); 3230 } 3231 } 3232 value = get_config_value_node(nvl, "sectsz"); 3233 if (value != NULL) 3234 sectsz = atoi(value); 3235 value = get_config_value_node(nvl, "ser"); 3236 if (value != NULL) { 3237 /* 3238 * This field indicates the Product Serial Number in 3239 * 7-bit ASCII, unused bytes should be space characters. 3240 * Ref: NVMe v1.3c. 3241 */ 3242 cpywithpad((char *)sc->ctrldata.sn, 3243 sizeof(sc->ctrldata.sn), value, ' '); 3244 } 3245 value = get_config_value_node(nvl, "eui64"); 3246 if (value != NULL) 3247 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3248 value = get_config_value_node(nvl, "dsm"); 3249 if (value != NULL) { 3250 if (strcmp(value, "auto") == 0) 3251 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3252 else if (strcmp(value, "enable") == 0) 3253 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3254 else if (strcmp(value, "disable") == 0) 3255 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3256 } 3257 3258 value = get_config_value_node(nvl, "bootindex"); 3259 if (value != NULL) { 3260 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3261 EPRINTLN("Invalid bootindex %d", atoi(value)); 3262 return (-1); 3263 } 3264 } 3265 3266 value = get_config_value_node(nvl, "ram"); 3267 if (value != NULL) { 3268 uint64_t sz = strtoull(value, NULL, 10); 3269 3270 sc->nvstore.type = NVME_STOR_RAM; 3271 sc->nvstore.size = sz * 1024 * 1024; 3272 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3273 sc->nvstore.sectsz = 4096; 3274 sc->nvstore.sectsz_bits = 12; 3275 if (sc->nvstore.ctx == NULL) { 3276 EPRINTLN("nvme: Unable to allocate RAM"); 3277 return (-1); 3278 } 3279 } else { 3280 snprintf(bident, sizeof(bident), "%u:%u", 3281 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3282 sc->nvstore.ctx = blockif_open(nvl, bident); 3283 if (sc->nvstore.ctx == NULL) { 3284 EPRINTLN("nvme: Could not open backing file: %s", 3285 strerror(errno)); 3286 return (-1); 3287 } 3288 sc->nvstore.type = NVME_STOR_BLOCKIF; 3289 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3290 } 3291 3292 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3293 sc->nvstore.sectsz = sectsz; 3294 else if (sc->nvstore.type != NVME_STOR_RAM) 3295 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3296 for (sc->nvstore.sectsz_bits = 9; 3297 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3298 sc->nvstore.sectsz_bits++); 3299 3300 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3301 sc->max_queues = NVME_QUEUES; 3302 3303 return (0); 3304 } 3305 3306 static void 3307 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3308 size_t new_size) 3309 { 3310 struct pci_nvme_softc *sc; 3311 struct pci_nvme_blockstore *nvstore; 3312 struct nvme_namespace_data *nd; 3313 3314 sc = arg; 3315 nvstore = &sc->nvstore; 3316 nd = &sc->nsdata; 3317 3318 nvstore->size = new_size; 3319 pci_nvme_init_nsdata_size(nvstore, nd); 3320 3321 /* Add changed NSID to list */ 3322 sc->ns_log.ns[0] = 1; 3323 sc->ns_log.ns[1] = 0; 3324 3325 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3326 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3327 } 3328 3329 static int 3330 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3331 { 3332 struct pci_nvme_softc *sc; 3333 uint32_t pci_membar_sz; 3334 int error; 3335 3336 error = 0; 3337 3338 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3339 pi->pi_arg = sc; 3340 sc->nsc_pi = pi; 3341 3342 error = pci_nvme_parse_config(sc, nvl); 3343 if (error < 0) 3344 goto done; 3345 else 3346 error = 0; 3347 3348 STAILQ_INIT(&sc->ioreqs_free); 3349 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3350 for (uint32_t i = 0; i < sc->ioslots; i++) { 3351 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3352 } 3353 3354 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3355 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3356 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3357 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3358 pci_set_cfgdata8(pi, PCIR_PROGIF, 3359 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3360 3361 /* 3362 * Allocate size of NVMe registers + doorbell space for all queues. 3363 * 3364 * The specification requires a minimum memory I/O window size of 16K. 3365 * The Windows driver will refuse to start a device with a smaller 3366 * window. 3367 */ 3368 pci_membar_sz = sizeof(struct nvme_registers) + 3369 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3370 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3371 3372 DPRINTF("nvme membar size: %u", pci_membar_sz); 3373 3374 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3375 if (error) { 3376 WPRINTF("%s pci alloc mem bar failed", __func__); 3377 goto done; 3378 } 3379 3380 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3381 if (error) { 3382 WPRINTF("%s pci add msixcap failed", __func__); 3383 goto done; 3384 } 3385 3386 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3387 if (error) { 3388 WPRINTF("%s pci add Express capability failed", __func__); 3389 goto done; 3390 } 3391 3392 pthread_mutex_init(&sc->mtx, NULL); 3393 sem_init(&sc->iosemlock, 0, sc->ioslots); 3394 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3395 3396 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3397 /* 3398 * Controller data depends on Namespace data so initialize Namespace 3399 * data first. 3400 */ 3401 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3402 pci_nvme_init_ctrldata(sc); 3403 pci_nvme_init_logpages(sc); 3404 pci_nvme_init_features(sc); 3405 3406 pci_nvme_aer_init(sc); 3407 pci_nvme_aen_init(sc); 3408 3409 pci_nvme_reset(sc); 3410 done: 3411 return (error); 3412 } 3413 3414 static int 3415 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3416 { 3417 char *cp, *ram; 3418 3419 if (opts == NULL) 3420 return (0); 3421 3422 if (strncmp(opts, "ram=", 4) == 0) { 3423 cp = strchr(opts, ','); 3424 if (cp == NULL) { 3425 set_config_value_node(nvl, "ram", opts + 4); 3426 return (0); 3427 } 3428 ram = strndup(opts + 4, cp - opts - 4); 3429 set_config_value_node(nvl, "ram", ram); 3430 free(ram); 3431 return (pci_parse_legacy_config(nvl, cp + 1)); 3432 } else 3433 return (blockif_legacy_config(nvl, opts)); 3434 } 3435 3436 static const struct pci_devemu pci_de_nvme = { 3437 .pe_emu = "nvme", 3438 .pe_init = pci_nvme_init, 3439 .pe_legacy_config = pci_nvme_legacy_config, 3440 .pe_barwrite = pci_nvme_write, 3441 .pe_barread = pci_nvme_read 3442 }; 3443 PCI_EMUL_SET(pci_de_nvme); 3444