/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/crc16.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

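/*
 * Illustrative invocation (hypothetical values, not taken from this file):
 *
 *   -s 3,nvme,/path/to/image,maxq=4,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001,dsm=auto
 *
 * or, with a RAM-backed namespace:
 *
 *   -s 3,nvme,ram=1024
 *
 * See the option descriptions in the header comment above.
 */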
/* defaults; can be overridden */
#define NVME_MSIX_BAR		4

#define NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define NVME_QUEUES		16
#define NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define NVME_TEMPERATURE 296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
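/*
 * Worked example of the limits above, derived from these defaults: with
 * NVME_MDTS = 9 and a 4KiB MPSMIN page, NVME_MAX_DATA_SIZE is
 * (1 << 9) * 4KiB = 2MiB per command and NVME_MAX_IOVEC is 512 + 1 = 513
 * descriptors (the extra entry covers a first PRP that is not page aligned).
 * MDTS_PAD_SIZE above is then the number of iovec entries such a request
 * needs beyond the BLOCKIF_IOV_MAX entries blockif already provides.
 */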

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool namespace_specific;
};

#define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Error */
typedef enum {
	PCI_NVME_AEI_ERROR_INVALID_DB,
	PCI_NVME_AEI_ERROR_INVALID_DB_VALUE,
	PCI_NVME_AEI_ERROR_DIAG_FAILURE,
	PCI_NVME_AEI_ERROR_PERSISTANT_ERR,
	PCI_NVME_AEI_ERROR_TRANSIENT_ERR,
	PCI_NVME_AEI_ERROR_FIRMWARE_LOAD_ERR,
	PCI_NVME_AEI_ERROR_MAX,
} pci_nvme_async_event_info_error;

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT		8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

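/*
 * Per-controller soft state: PCI glue, register file, Identify data, log
 * pages, backing store, queue arrays, feature settings, SMART accounting,
 * and the AER/AEN bookkeeping declared above.
 */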
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set; /* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	NVMEV(NVME_CC_REG_EN, cc)
#define	NVME_CC_GET_CSS(cc) \
	NVMEV(NVME_CC_REG_CSS, cc)
#define	NVME_CC_GET_SHN(cc) \
	NVMEV(NVME_CC_REG_SHN, cc)
#define	NVME_CC_GET_IOSQES(cc) \
	NVMEV(NVME_CC_REG_IOSQES, cc)
#define	NVME_CC_GET_IOCQES(cc) \
	NVMEV(NVME_CC_REG_IOCQES, cc)

#define	NVME_CC_WRITE_MASK \
	(NVMEM(NVME_CC_REG_EN) | \
	 NVMEM(NVME_CC_REG_IOSQES) | \
	 NVMEM(NVME_CC_REG_IOCQES))

#define	NVME_CC_NEN_WRITE_MASK \
	(NVMEM(NVME_CC_REG_CSS) | \
	 NVMEM(NVME_CC_REG_MPS) | \
	 NVMEM(NVME_CC_REG_AMS))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	NVMEV(NVME_CSTS_REG_RDY, sts)

#define	NVME_CSTS_RDY	(NVMEF(NVME_CSTS_REG_RDY, 1))
#define	NVME_CSTS_CFS	(NVMEF(NVME_CSTS_REG_CFS, 1))

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(NVMEF(NVME_STATUS_P, 1))
#define	NVME_STATUS_MASK \
	(NVMEM(NVME_STATUS_SCT) | \
	 NVMEM(NVME_STATUS_SC))

#define NVME_ONCS_DSM	NVMEM(NVME_CTRLR_DATA_ONCS_DSM)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

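/*
 * Copy src into dst and fill the remainder of dst with the pad character;
 * used to build the space-padded ASCII strings (e.g. model number and
 * firmware revision) in the Identify Controller data.
 */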
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= NVMEF(NVME_STATUS_SCT, type) | NVMEF(NVME_STATUS_SC, code);
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;
	int ret;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0xfc;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0x58;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = NVMEF(NVME_CTRLR_DATA_OACS_FORMAT, 1);
	cd->oaes = NVMEM(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEM(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    NVMEF(NVME_CTRLR_DATA_FRMW_NUM_SLOTS, 1);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = NVMEF(NVME_CTRLR_DATA_SANICAP_NODMMAS,
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_NO);

	cd->sqes = NVMEF(NVME_CTRLR_DATA_SQES_MAX, 6) |
	    NVMEF(NVME_CTRLR_DATA_SQES_MIN, 6);
	cd->cqes = NVMEF(NVME_CTRLR_DATA_CQES_MAX, 4) |
	    NVMEF(NVME_CTRLR_DATA_CQES_MIN, 4);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVMEM(NVME_CTRLR_DATA_FNA_FORMAT_ALL);

	cd->vwc = NVMEF(NVME_CTRLR_DATA_VWC_ALL, NVME_CTRLR_DATA_VWC_ALL_NO);

	ret = snprintf(cd->subnqn, sizeof(cd->subnqn),
	    "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u",
	    get_config_value("name"), sc->nsc_pi->pi_bus,
	    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn)))
		EPRINTLN("%s: error setting subnqn (%d)", __func__, ret);
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

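/*
 * Initialize the Identify Namespace data: capacity from the backing store,
 * a single LBA format, and an EUI-64. When no EUI-64 was configured, one is
 * synthesized from the FreeBSD OUI and a CRC16 of the VM name and PCI
 * address, combined with the namespace ID.
 */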
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, nvstore->sectsz_bits);
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t	atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    NVMEF(NVME_CAP_LO_REG_CQR, 1) |
	    NVMEF(NVME_CAP_LO_REG_TO, 60);

	sc->regs.cap_hi = NVMEF(NVME_CAP_HI_REG_CSS_NVM, 1);

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

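/*
 * Map the Admin Submission and Completion Queues described by the AQA, ASQ,
 * and ACQ registers into the emulation. An undersized queue or an
 * unmappable guest address sets CSTS.CFS and leaves the controller not
 * ready.
 */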
static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(NVMEV(NVME_AQA_REG_ASQS, sc->regs.aqa));
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED(NVMEV(NVME_AQA_REG_ACQS, sc->regs.aqa));
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
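/*
 * Note on the Phase Tag handling above: each new completion entry inverts
 * the P bit found in the slot it overwrites, so the bit alternates on every
 * pass through the queue and the host can detect fresh entries without an
 * extra register read.
 */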

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u offset %lu len %u", __func__, logpage, logoff, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize, sizeof(sc->err_log) - logoff),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize, sizeof(sc->health_log) - logoff),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize, sizeof(sc->fw_log) - logoff),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize, sizeof(sc->ns_log) - logoff),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

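/*
 * Interrupt Vector Configuration (Set Features): succeed only when the
 * requested vector is in range and currently used by a Completion Queue.
 * Coalescing Disable has no effect here because interrupt coalescing is not
 * emulated (see the TODO at the top of the file).
 */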
static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}
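/*
 * Because the reported composite temperature is fixed at NVME_TEMPERATURE
 * (296 K), an over-temperature threshold at or below that value, or an
 * under-temperature threshold at or above it, immediately raises the SMART
 * critical warning set above and, when enabled, posts a SMART/Health AEN.
 */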

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

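/*
 * Get Features: returns the saved CDW11 for the feature, or the
 * namespace-specific capability flag when SEL requests the supported
 * capabilities of a namespace-specific feature.
 */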
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
    struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
	    sc->aer_count, sc->ctrldata.aerl, command->cid);
sc->ctrldata.aerl, command->cid); 1995 1996 /* Don't exceed the Async Event Request Limit (AERL). */ 1997 if (pci_nvme_aer_limit_reached(sc)) { 1998 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1999 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2000 return (1); 2001 } 2002 2003 if (pci_nvme_aer_add(sc, command->cid)) { 2004 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2005 NVME_SC_INTERNAL_DEVICE_ERROR); 2006 return (1); 2007 } 2008 2009 /* 2010 * Raise events when they happen based on the Set Features cmd. 2011 * These events happen async, so only set completion successful if 2012 * there is an event reflective of the request to get event. 2013 */ 2014 compl->status = NVME_NO_STATUS; 2015 pci_nvme_aen_notify(sc); 2016 2017 return (0); 2018 } 2019 2020 static void 2021 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2022 { 2023 struct nvme_completion compl; 2024 struct nvme_command *cmd; 2025 struct nvme_submission_queue *sq; 2026 struct nvme_completion_queue *cq; 2027 uint16_t sqhead; 2028 2029 DPRINTF("%s index %u", __func__, (uint32_t)value); 2030 2031 sq = &sc->submit_queues[0]; 2032 cq = &sc->compl_queues[0]; 2033 2034 pthread_mutex_lock(&sq->mtx); 2035 2036 sqhead = sq->head; 2037 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2038 2039 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2040 cmd = &(sq->qbase)[sqhead]; 2041 compl.cdw0 = 0; 2042 compl.status = 0; 2043 2044 switch (cmd->opc) { 2045 case NVME_OPC_DELETE_IO_SQ: 2046 DPRINTF("%s command DELETE_IO_SQ", __func__); 2047 nvme_opc_delete_io_sq(sc, cmd, &compl); 2048 break; 2049 case NVME_OPC_CREATE_IO_SQ: 2050 DPRINTF("%s command CREATE_IO_SQ", __func__); 2051 nvme_opc_create_io_sq(sc, cmd, &compl); 2052 break; 2053 case NVME_OPC_DELETE_IO_CQ: 2054 DPRINTF("%s command DELETE_IO_CQ", __func__); 2055 nvme_opc_delete_io_cq(sc, cmd, &compl); 2056 break; 2057 case NVME_OPC_CREATE_IO_CQ: 2058 DPRINTF("%s command CREATE_IO_CQ", __func__); 2059 nvme_opc_create_io_cq(sc, cmd, &compl); 2060 break; 2061 case NVME_OPC_GET_LOG_PAGE: 2062 DPRINTF("%s command GET_LOG_PAGE", __func__); 2063 nvme_opc_get_log_page(sc, cmd, &compl); 2064 break; 2065 case NVME_OPC_IDENTIFY: 2066 DPRINTF("%s command IDENTIFY", __func__); 2067 nvme_opc_identify(sc, cmd, &compl); 2068 break; 2069 case NVME_OPC_ABORT: 2070 DPRINTF("%s command ABORT", __func__); 2071 nvme_opc_abort(sc, cmd, &compl); 2072 break; 2073 case NVME_OPC_SET_FEATURES: 2074 DPRINTF("%s command SET_FEATURES", __func__); 2075 nvme_opc_set_features(sc, cmd, &compl); 2076 break; 2077 case NVME_OPC_GET_FEATURES: 2078 DPRINTF("%s command GET_FEATURES", __func__); 2079 nvme_opc_get_features(sc, cmd, &compl); 2080 break; 2081 case NVME_OPC_FIRMWARE_ACTIVATE: 2082 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2083 pci_nvme_status_tc(&compl.status, 2084 NVME_SCT_COMMAND_SPECIFIC, 2085 NVME_SC_INVALID_FIRMWARE_SLOT); 2086 break; 2087 case NVME_OPC_ASYNC_EVENT_REQUEST: 2088 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2089 nvme_opc_async_event_req(sc, cmd, &compl); 2090 break; 2091 case NVME_OPC_FORMAT_NVM: 2092 DPRINTF("%s command FORMAT_NVM", __func__); 2093 if (NVMEV(NVME_CTRLR_DATA_OACS_FORMAT, 2094 sc->ctrldata.oacs) == 0) { 2095 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2096 break; 2097 } 2098 nvme_opc_format_nvm(sc, cmd, &compl); 2099 break; 2100 case NVME_OPC_SECURITY_SEND: 2101 case NVME_OPC_SECURITY_RECEIVE: 2102 case NVME_OPC_SANITIZE: 2103 case NVME_OPC_GET_LBA_STATUS: 2104 DPRINTF("%s command OPC=%#x (unsupported)", 
__func__, 2105 cmd->opc); 2106 /* Valid but unsupported opcodes */ 2107 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2108 break; 2109 default: 2110 DPRINTF("%s command OPC=%#X (not implemented)", 2111 __func__, 2112 cmd->opc); 2113 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2114 } 2115 sqhead = (sqhead + 1) % sq->size; 2116 2117 if (NVME_COMPLETION_VALID(compl)) { 2118 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2119 compl.cdw0, 2120 cmd->cid, 2121 0, /* SQID */ 2122 compl.status); 2123 } 2124 } 2125 2126 DPRINTF("setting sqhead %u", sqhead); 2127 sq->head = sqhead; 2128 2129 if (cq->head != cq->tail) 2130 pci_generate_msix(sc->nsc_pi, 0); 2131 2132 pthread_mutex_unlock(&sq->mtx); 2133 } 2134 2135 /* 2136 * Update the Write and Read statistics reported in SMART data 2137 * 2138 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2139 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2140 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2141 */ 2142 static void 2143 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2144 size_t bytes, uint16_t status) 2145 { 2146 2147 pthread_mutex_lock(&sc->mtx); 2148 switch (opc) { 2149 case NVME_OPC_WRITE: 2150 sc->write_commands++; 2151 if (status != NVME_SC_SUCCESS) 2152 break; 2153 sc->write_dunits_remainder += (bytes / 512); 2154 while (sc->write_dunits_remainder >= 1000) { 2155 sc->write_data_units++; 2156 sc->write_dunits_remainder -= 1000; 2157 } 2158 break; 2159 case NVME_OPC_READ: 2160 sc->read_commands++; 2161 if (status != NVME_SC_SUCCESS) 2162 break; 2163 sc->read_dunits_remainder += (bytes / 512); 2164 while (sc->read_dunits_remainder >= 1000) { 2165 sc->read_data_units++; 2166 sc->read_dunits_remainder -= 1000; 2167 } 2168 break; 2169 default: 2170 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2171 break; 2172 } 2173 pthread_mutex_unlock(&sc->mtx); 2174 } 2175 2176 /* 2177 * Check if the combination of Starting LBA (slba) and number of blocks 2178 * exceeds the range of the underlying storage. 2179 * 2180 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2181 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2182 * overflow. 2183 */ 2184 static bool 2185 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2186 uint32_t nblocks) 2187 { 2188 size_t offset, bytes; 2189 2190 /* Overflow check of multiplying Starting LBA by the sector size */ 2191 if (slba >> (64 - nvstore->sectsz_bits)) 2192 return (true); 2193 2194 offset = slba << nvstore->sectsz_bits; 2195 bytes = nblocks << nvstore->sectsz_bits; 2196 2197 /* Overflow check of Number of Logical Blocks */ 2198 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2199 return (true); 2200 2201 return (false); 2202 } 2203 2204 static int 2205 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2206 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2207 { 2208 int iovidx; 2209 bool range_is_contiguous; 2210 2211 if (req == NULL) 2212 return (-1); 2213 2214 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2215 return (-1); 2216 } 2217 2218 /* 2219 * Minimize the number of IOVs by concatenating contiguous address 2220 * ranges. If the IOV count is zero, there is no previous range to 2221 * concatenate. 
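 * For example, assuming 4 KiB pages, two PRP entries covering guest
 * physical addresses 0x10000 and 0x11000 are folded into a single
 * 8 KiB iovec, while an entry at 0x20000 would start a new iovec.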
2222 */ 2223 if (req->io_req.br_iovcnt == 0) 2224 range_is_contiguous = false; 2225 else 2226 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2227 2228 if (range_is_contiguous) { 2229 iovidx = req->io_req.br_iovcnt - 1; 2230 2231 req->io_req.br_iov[iovidx].iov_base = 2232 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2233 req->prev_gpaddr, size); 2234 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2235 return (-1); 2236 2237 req->prev_size += size; 2238 req->io_req.br_resid += size; 2239 2240 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2241 } else { 2242 iovidx = req->io_req.br_iovcnt; 2243 if (iovidx == 0) { 2244 req->io_req.br_offset = offset; 2245 req->io_req.br_resid = 0; 2246 req->io_req.br_param = req; 2247 } 2248 2249 req->io_req.br_iov[iovidx].iov_base = 2250 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2251 gpaddr, size); 2252 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2253 return (-1); 2254 2255 req->io_req.br_iov[iovidx].iov_len = size; 2256 2257 req->prev_gpaddr = gpaddr; 2258 req->prev_size = size; 2259 req->io_req.br_resid += size; 2260 2261 req->io_req.br_iovcnt++; 2262 } 2263 2264 return (0); 2265 } 2266 2267 static void 2268 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2269 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2270 { 2271 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2272 2273 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2274 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2275 NVME_STATUS_GET_SC(status)); 2276 2277 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2278 2279 if (cq->head != cq->tail) { 2280 if (cq->intr_en & NVME_CQ_INTEN) { 2281 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2282 } else { 2283 DPRINTF("%s: CQ%u interrupt disabled", 2284 __func__, sq->cqid); 2285 } 2286 } 2287 } 2288 2289 static void 2290 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2291 { 2292 req->sc = NULL; 2293 req->nvme_sq = NULL; 2294 req->sqid = 0; 2295 2296 pthread_mutex_lock(&sc->mtx); 2297 2298 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2299 sc->pending_ios--; 2300 2301 /* when no more IO pending, can set to ready if device reset/enabled */ 2302 if (sc->pending_ios == 0 && 2303 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2304 sc->regs.csts |= NVME_CSTS_RDY; 2305 2306 pthread_mutex_unlock(&sc->mtx); 2307 2308 sem_post(&sc->iosemlock); 2309 } 2310 2311 static struct pci_nvme_ioreq * 2312 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2313 { 2314 struct pci_nvme_ioreq *req = NULL; 2315 2316 sem_wait(&sc->iosemlock); 2317 pthread_mutex_lock(&sc->mtx); 2318 2319 req = STAILQ_FIRST(&sc->ioreqs_free); 2320 assert(req != NULL); 2321 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2322 2323 req->sc = sc; 2324 2325 sc->pending_ios++; 2326 2327 pthread_mutex_unlock(&sc->mtx); 2328 2329 req->io_req.br_iovcnt = 0; 2330 req->io_req.br_offset = 0; 2331 req->io_req.br_resid = 0; 2332 req->io_req.br_param = req; 2333 req->prev_gpaddr = 0; 2334 req->prev_size = 0; 2335 2336 return req; 2337 } 2338 2339 static void 2340 pci_nvme_io_done(struct blockif_req *br, int err) 2341 { 2342 struct pci_nvme_ioreq *req = br->br_param; 2343 struct nvme_submission_queue *sq = req->nvme_sq; 2344 uint16_t code, status; 2345 2346 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2347 2348 /* TODO return correct error */ 2349 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2350 status = 0; 2351 pci_nvme_status_genc(&status, code); 2352 2353 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2354 pci_nvme_stats_write_read_update(req->sc, req->opc, 2355 req->bytes, status); 2356 pci_nvme_release_ioreq(req->sc, req); 2357 } 2358 2359 /* 2360 * Implements the Flush command. The specification states: 2361 * If a volatile write cache is not present, Flush commands complete 2362 * successfully and have no effect 2363 * in the description of the Volatile Write Cache (VWC) field of the Identify 2364 * Controller data. Therefore, set status to Success if the command is 2365 * not supported (i.e. RAM or as indicated by the blockif). 2366 */ 2367 static bool 2368 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2369 struct nvme_command *cmd __unused, 2370 struct pci_nvme_blockstore *nvstore, 2371 struct pci_nvme_ioreq *req, 2372 uint16_t *status) 2373 { 2374 bool pending = false; 2375 2376 if (nvstore->type == NVME_STOR_RAM) { 2377 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2378 } else { 2379 int err; 2380 2381 req->io_req.br_callback = pci_nvme_io_done; 2382 2383 err = blockif_flush(nvstore->ctx, &req->io_req); 2384 switch (err) { 2385 case 0: 2386 pending = true; 2387 break; 2388 case EOPNOTSUPP: 2389 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2390 break; 2391 default: 2392 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2393 } 2394 } 2395 2396 return (pending); 2397 } 2398 2399 static uint16_t 2400 nvme_write_read_ram(struct pci_nvme_softc *sc, 2401 struct pci_nvme_blockstore *nvstore, 2402 uint64_t prp1, uint64_t prp2, 2403 size_t offset, uint64_t bytes, 2404 bool is_write) 2405 { 2406 uint8_t *buf = nvstore->ctx; 2407 enum nvme_copy_dir dir; 2408 uint16_t status; 2409 2410 if (is_write) 2411 dir = NVME_COPY_TO_PRP; 2412 else 2413 dir = NVME_COPY_FROM_PRP; 2414 2415 status = 0; 2416 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2417 buf + offset, bytes, dir)) 2418 pci_nvme_status_genc(&status, 2419 NVME_SC_DATA_TRANSFER_ERROR); 2420 else 2421 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2422 2423 return (status); 2424 } 2425 2426 static uint16_t 2427 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2428 struct pci_nvme_blockstore *nvstore, 2429 struct pci_nvme_ioreq *req, 2430 uint64_t prp1, uint64_t prp2, 2431 size_t offset, uint64_t bytes, 2432 bool is_write) 2433 { 2434 uint64_t size; 2435 int err; 2436 uint16_t status = NVME_NO_STATUS; 2437 2438 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2439 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2440 err = -1; 2441 goto out; 2442 } 2443 2444 offset += size; 2445 bytes -= size; 2446 2447 if (bytes == 0) { 2448 ; 2449 } else if (bytes <= PAGE_SIZE) { 2450 size = bytes; 2451 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2452 err = -1; 2453 goto out; 2454 } 2455 } else { 2456 void *vmctx = sc->nsc_pi->pi_vmctx; 2457 uint64_t *prp_list = &prp2; 2458 uint64_t *last = prp_list; 2459 2460 /* PRP2 is pointer to a physical region page list */ 2461 while (bytes) { 2462 /* Last entry in list points to the next list */ 2463 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2464 uint64_t prp = *prp_list; 2465 2466 prp_list = paddr_guest2host(vmctx, prp, 2467 PAGE_SIZE - (prp % PAGE_SIZE)); 2468 if (prp_list == NULL) { 2469 err = -1; 2470 goto out; 2471 } 2472 last = prp_list + (NVME_PRP2_ITEMS - 1); 2473 } 2474 2475 size = MIN(bytes, PAGE_SIZE); 2476 2477 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
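/*
 * Illustrative PRP walk-through, assuming 4 KiB pages and a
 * page-aligned PRP1: a 16 KiB transfer consumes PRP1 for the first
 * page, and PRP2 then points to a PRP list whose first three entries
 * map the remaining pages.  Transfers needing at most one page beyond
 * PRP1 use PRP2 directly as the second data pointer (handled before
 * this loop).
 */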
size, 2478 offset)) { 2479 err = -1; 2480 goto out; 2481 } 2482 2483 offset += size; 2484 bytes -= size; 2485 2486 prp_list++; 2487 } 2488 } 2489 req->io_req.br_callback = pci_nvme_io_done; 2490 if (is_write) 2491 err = blockif_write(nvstore->ctx, &req->io_req); 2492 else 2493 err = blockif_read(nvstore->ctx, &req->io_req); 2494 out: 2495 if (err) 2496 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2497 2498 return (status); 2499 } 2500 2501 static bool 2502 nvme_opc_write_read(struct pci_nvme_softc *sc, 2503 struct nvme_command *cmd, 2504 struct pci_nvme_blockstore *nvstore, 2505 struct pci_nvme_ioreq *req, 2506 uint16_t *status) 2507 { 2508 uint64_t lba, nblocks, bytes; 2509 size_t offset; 2510 bool is_write = cmd->opc == NVME_OPC_WRITE; 2511 bool pending = false; 2512 2513 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2514 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2515 bytes = nblocks << nvstore->sectsz_bits; 2516 if (bytes > NVME_MAX_DATA_SIZE) { 2517 WPRINTF("%s command would exceed MDTS", __func__); 2518 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2519 goto out; 2520 } 2521 2522 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2523 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2524 __func__, lba, nblocks); 2525 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2526 goto out; 2527 } 2528 2529 offset = lba << nvstore->sectsz_bits; 2530 2531 req->bytes = bytes; 2532 req->io_req.br_offset = lba; 2533 2534 /* PRP bits 1:0 must be zero */ 2535 cmd->prp1 &= ~0x3UL; 2536 cmd->prp2 &= ~0x3UL; 2537 2538 if (nvstore->type == NVME_STOR_RAM) { 2539 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2540 cmd->prp2, offset, bytes, is_write); 2541 } else { 2542 *status = nvme_write_read_blockif(sc, nvstore, req, 2543 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2544 2545 if (*status == NVME_NO_STATUS) 2546 pending = true; 2547 } 2548 out: 2549 if (!pending) 2550 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2551 2552 return (pending); 2553 } 2554 2555 static void 2556 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2557 { 2558 struct pci_nvme_ioreq *req = br->br_param; 2559 struct pci_nvme_softc *sc = req->sc; 2560 bool done = true; 2561 uint16_t status; 2562 2563 status = 0; 2564 if (err) { 2565 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2566 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2567 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2568 } else { 2569 struct iovec *iov = req->io_req.br_iov; 2570 2571 req->prev_gpaddr++; 2572 iov += req->prev_gpaddr; 2573 2574 /* The iov_* values already include the sector size */ 2575 req->io_req.br_offset = (off_t)iov->iov_base; 2576 req->io_req.br_resid = iov->iov_len; 2577 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2578 pci_nvme_status_genc(&status, 2579 NVME_SC_INTERNAL_DEVICE_ERROR); 2580 } else 2581 done = false; 2582 } 2583 2584 if (done) { 2585 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2586 status); 2587 pci_nvme_release_ioreq(sc, req); 2588 } 2589 } 2590 2591 static bool 2592 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2593 struct nvme_command *cmd, 2594 struct pci_nvme_blockstore *nvstore, 2595 struct pci_nvme_ioreq *req, 2596 uint16_t *status) 2597 { 2598 struct nvme_dsm_range *range = NULL; 2599 uint32_t nr, r, non_zero, dr; 2600 int err; 2601 bool pending = false; 2602 2603 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2604 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2605 goto out; 2606 } 
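/*
 * NR in cdw10 is zero based: nr == 0 means one range and nr == 0xff
 * means 256 ranges, which is why the loops below iterate while
 * r <= nr.
 */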
2607 2608 nr = cmd->cdw10 & 0xff; 2609 2610 /* copy locally because a range entry could straddle PRPs */ 2611 range = calloc(1, NVME_MAX_DSM_TRIM); 2612 if (range == NULL) { 2613 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2614 goto out; 2615 } 2616 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2617 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2618 2619 /* Check for invalid ranges and the number of non-zero lengths */ 2620 non_zero = 0; 2621 for (r = 0; r <= nr; r++) { 2622 if (pci_nvme_out_of_range(nvstore, 2623 range[r].starting_lba, range[r].length)) { 2624 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2625 goto out; 2626 } 2627 if (range[r].length != 0) 2628 non_zero++; 2629 } 2630 2631 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2632 size_t offset, bytes; 2633 int sectsz_bits = sc->nvstore.sectsz_bits; 2634 2635 /* 2636 * DSM calls are advisory only, and compliant controllers 2637 * may choose to take no actions (i.e. return Success). 2638 */ 2639 if (!nvstore->deallocate) { 2640 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2641 goto out; 2642 } 2643 2644 /* If all ranges have a zero length, return Success */ 2645 if (non_zero == 0) { 2646 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2647 goto out; 2648 } 2649 2650 if (req == NULL) { 2651 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2652 goto out; 2653 } 2654 2655 offset = range[0].starting_lba << sectsz_bits; 2656 bytes = range[0].length << sectsz_bits; 2657 2658 /* 2659 * If the request is for more than a single range, store 2660 * the ranges in the br_iov. Optimize for the common case 2661 * of a single range. 2662 * 2663 * Note that NVMe Number of Ranges is a zero based value 2664 */ 2665 req->io_req.br_iovcnt = 0; 2666 req->io_req.br_offset = offset; 2667 req->io_req.br_resid = bytes; 2668 2669 if (nr == 0) { 2670 req->io_req.br_callback = pci_nvme_io_done; 2671 } else { 2672 struct iovec *iov = req->io_req.br_iov; 2673 2674 for (r = 0, dr = 0; r <= nr; r++) { 2675 offset = range[r].starting_lba << sectsz_bits; 2676 bytes = range[r].length << sectsz_bits; 2677 if (bytes == 0) 2678 continue; 2679 2680 if ((nvstore->size - offset) < bytes) { 2681 pci_nvme_status_genc(status, 2682 NVME_SC_LBA_OUT_OF_RANGE); 2683 goto out; 2684 } 2685 iov[dr].iov_base = (void *)offset; 2686 iov[dr].iov_len = bytes; 2687 dr++; 2688 } 2689 req->io_req.br_callback = pci_nvme_dealloc_sm; 2690 2691 /* 2692 * Use prev_gpaddr to track the current entry and 2693 * prev_size to track the number of entries 2694 */ 2695 req->prev_gpaddr = 0; 2696 req->prev_size = dr; 2697 } 2698 2699 err = blockif_delete(nvstore->ctx, &req->io_req); 2700 if (err) 2701 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2702 else 2703 pending = true; 2704 } 2705 out: 2706 free(range); 2707 return (pending); 2708 } 2709 2710 static void 2711 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2712 { 2713 struct nvme_submission_queue *sq; 2714 uint16_t status; 2715 uint16_t sqhead; 2716 2717 /* handle all submissions up to sq->tail index */ 2718 sq = &sc->submit_queues[idx]; 2719 2720 pthread_mutex_lock(&sq->mtx); 2721 2722 sqhead = sq->head; 2723 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2724 idx, sqhead, sq->tail, sq->qbase); 2725 2726 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2727 struct nvme_command *cmd; 2728 struct pci_nvme_ioreq *req; 2729 uint32_t nsid; 2730 bool pending; 2731 2732 pending = false; 2733 req = NULL; 2734 status = 0; 2735 2736 cmd = 
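/*
 * Each iteration consumes the SQ entry at the local head index; the
 * guest advances sq->tail through its doorbell write (see
 * pci_nvme_handle_doorbell below).  Commands handed off to blockif
 * set pending and are completed later from the blockif callback.
 */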
&sq->qbase[sqhead]; 2737 sqhead = (sqhead + 1) % sq->size; 2738 2739 nsid = le32toh(cmd->nsid); 2740 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2741 pci_nvme_status_genc(&status, 2742 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2743 status |= NVMEM(NVME_STATUS_DNR); 2744 goto complete; 2745 } 2746 2747 req = pci_nvme_get_ioreq(sc); 2748 if (req == NULL) { 2749 pci_nvme_status_genc(&status, 2750 NVME_SC_INTERNAL_DEVICE_ERROR); 2751 WPRINTF("%s: unable to allocate IO req", __func__); 2752 goto complete; 2753 } 2754 req->nvme_sq = sq; 2755 req->sqid = idx; 2756 req->opc = cmd->opc; 2757 req->cid = cmd->cid; 2758 req->nsid = cmd->nsid; 2759 2760 switch (cmd->opc) { 2761 case NVME_OPC_FLUSH: 2762 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2763 req, &status); 2764 break; 2765 case NVME_OPC_WRITE: 2766 case NVME_OPC_READ: 2767 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2768 req, &status); 2769 break; 2770 case NVME_OPC_WRITE_ZEROES: 2771 /* TODO: write zeroes 2772 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2773 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2774 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2775 break; 2776 case NVME_OPC_DATASET_MANAGEMENT: 2777 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2778 req, &status); 2779 break; 2780 default: 2781 WPRINTF("%s unhandled io command 0x%x", 2782 __func__, cmd->opc); 2783 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2784 } 2785 complete: 2786 if (!pending) { 2787 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2788 if (req != NULL) 2789 pci_nvme_release_ioreq(sc, req); 2790 } 2791 } 2792 2793 sq->head = sqhead; 2794 2795 pthread_mutex_unlock(&sq->mtx); 2796 } 2797 2798 /* 2799 * Check for invalid doorbell write values 2800 * See NVM Express Base Specification, revision 2.0 2801 * "Asynchronous Event Information - Error Status" for details 2802 */ 2803 static bool 2804 pci_nvme_sq_doorbell_valid(struct nvme_submission_queue *sq, uint64_t value) 2805 { 2806 uint64_t capacity; 2807 2808 /* 2809 * Queue empty : head == tail 2810 * Queue full : head is one more than tail accounting for wrap 2811 * Therefore, can never have more than (size - 1) entries 2812 */ 2813 if (sq->head == sq->tail) 2814 capacity = sq->size - 1; 2815 else if (sq->head > sq->tail) 2816 capacity = sq->size - (sq->head - sq->tail) - 1; 2817 else 2818 capacity = sq->tail - sq->head - 1; 2819 2820 if ((value == sq->tail) || /* same as previous */ 2821 (value > capacity)) { /* exceeds queue capacity */ 2822 EPRINTLN("%s: SQ size=%u head=%u tail=%u capacity=%lu value=%lu", 2823 __func__, sq->size, sq->head, sq->tail, capacity, value); 2824 return false; 2825 } 2826 2827 return true; 2828 } 2829 2830 static void 2831 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2832 uint64_t idx, int is_sq, uint64_t value) 2833 { 2834 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2835 idx, is_sq ? 
"SQ" : "CQ", value & 0xFFFF); 2836 2837 if (is_sq) { 2838 if (idx > sc->num_squeues) { 2839 WPRINTF("%s queue index %lu overflow from " 2840 "guest (max %u)", 2841 __func__, idx, sc->num_squeues); 2842 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2843 PCI_NVME_AEI_ERROR_INVALID_DB); 2844 return; 2845 } 2846 2847 if (sc->submit_queues[idx].qbase == NULL) { 2848 WPRINTF("%s write to SQ %lu before created", __func__, 2849 idx); 2850 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2851 PCI_NVME_AEI_ERROR_INVALID_DB); 2852 return; 2853 } 2854 2855 if (!pci_nvme_sq_doorbell_valid(&sc->submit_queues[idx], value)) { 2856 EPRINTLN("%s write to SQ %lu of %lu invalid", __func__, 2857 idx, value); 2858 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2859 PCI_NVME_AEI_ERROR_INVALID_DB_VALUE); 2860 return; 2861 } 2862 2863 atomic_store_short(&sc->submit_queues[idx].tail, 2864 (uint16_t)value); 2865 2866 if (idx == 0) 2867 pci_nvme_handle_admin_cmd(sc, value); 2868 else { 2869 /* submission queue; handle new entries in SQ */ 2870 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2871 } 2872 } else { 2873 if (idx > sc->num_cqueues) { 2874 WPRINTF("%s queue index %lu overflow from " 2875 "guest (max %u)", 2876 __func__, idx, sc->num_cqueues); 2877 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2878 PCI_NVME_AEI_ERROR_INVALID_DB); 2879 return; 2880 } 2881 2882 if (sc->compl_queues[idx].qbase == NULL) { 2883 WPRINTF("%s write to CQ %lu before created", __func__, 2884 idx); 2885 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_ERROR, 2886 PCI_NVME_AEI_ERROR_INVALID_DB); 2887 return; 2888 } 2889 2890 atomic_store_short(&sc->compl_queues[idx].head, 2891 (uint16_t)value); 2892 } 2893 } 2894 2895 static void 2896 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2897 { 2898 const char *s = iswrite ? 
"WRITE" : "READ"; 2899 2900 switch (offset) { 2901 case NVME_CR_CAP_LOW: 2902 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2903 break; 2904 case NVME_CR_CAP_HI: 2905 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2906 break; 2907 case NVME_CR_VS: 2908 DPRINTF("%s %s NVME_CR_VS", func, s); 2909 break; 2910 case NVME_CR_INTMS: 2911 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2912 break; 2913 case NVME_CR_INTMC: 2914 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2915 break; 2916 case NVME_CR_CC: 2917 DPRINTF("%s %s NVME_CR_CC", func, s); 2918 break; 2919 case NVME_CR_CSTS: 2920 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2921 break; 2922 case NVME_CR_NSSR: 2923 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2924 break; 2925 case NVME_CR_AQA: 2926 DPRINTF("%s %s NVME_CR_AQA", func, s); 2927 break; 2928 case NVME_CR_ASQ_LOW: 2929 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2930 break; 2931 case NVME_CR_ASQ_HI: 2932 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2933 break; 2934 case NVME_CR_ACQ_LOW: 2935 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2936 break; 2937 case NVME_CR_ACQ_HI: 2938 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2939 break; 2940 default: 2941 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2942 } 2943 2944 } 2945 2946 static void 2947 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2948 uint64_t value) 2949 { 2950 uint32_t ccreg; 2951 2952 if (offset >= NVME_DOORBELL_OFFSET) { 2953 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2954 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2955 int is_sq = (belloffset % 8) < 4; 2956 2957 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2958 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2959 offset); 2960 return; 2961 } 2962 2963 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2964 WPRINTF("guest attempted an overflow write offset " 2965 "0x%lx, val 0x%lx in %s", 2966 offset, value, __func__); 2967 return; 2968 } 2969 2970 if (is_sq) { 2971 if (sc->submit_queues[idx].qbase == NULL) 2972 return; 2973 } else if (sc->compl_queues[idx].qbase == NULL) 2974 return; 2975 2976 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2977 return; 2978 } 2979 2980 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2981 offset, size, value); 2982 2983 if (size != 4) { 2984 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2985 "val 0x%lx) to bar0 in %s", 2986 size, offset, value, __func__); 2987 /* TODO: shutdown device */ 2988 return; 2989 } 2990 2991 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2992 2993 pthread_mutex_lock(&sc->mtx); 2994 2995 switch (offset) { 2996 case NVME_CR_CAP_LOW: 2997 case NVME_CR_CAP_HI: 2998 /* readonly */ 2999 break; 3000 case NVME_CR_VS: 3001 /* readonly */ 3002 break; 3003 case NVME_CR_INTMS: 3004 /* MSI-X, so ignore */ 3005 break; 3006 case NVME_CR_INTMC: 3007 /* MSI-X, so ignore */ 3008 break; 3009 case NVME_CR_CC: 3010 ccreg = (uint32_t)value; 3011 3012 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 3013 "iocqes %u", 3014 __func__, 3015 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 3016 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 3017 NVME_CC_GET_IOCQES(ccreg)); 3018 3019 if (NVME_CC_GET_SHN(ccreg)) { 3020 /* perform shutdown - flush out data to backend */ 3021 sc->regs.csts &= ~NVMEM(NVME_CSTS_REG_SHST); 3022 sc->regs.csts |= NVMEF(NVME_CSTS_REG_SHST, 3023 NVME_SHST_COMPLETE); 3024 } 3025 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3026 if (NVME_CC_GET_EN(ccreg) == 0) 3027 /* transition 1-> causes controller reset */ 3028 pci_nvme_reset_locked(sc); 3029 
else 3030 pci_nvme_init_controller(sc); 3031 } 3032 3033 /* Insert the iocqes, iosqes and en bits from the write */ 3034 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3035 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3036 if (NVME_CC_GET_EN(ccreg) == 0) { 3037 /* Insert the ams, mps and css bit fields */ 3038 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3039 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3040 sc->regs.csts &= ~NVME_CSTS_RDY; 3041 } else if ((sc->pending_ios == 0) && 3042 !(sc->regs.csts & NVME_CSTS_CFS)) { 3043 sc->regs.csts |= NVME_CSTS_RDY; 3044 } 3045 break; 3046 case NVME_CR_CSTS: 3047 break; 3048 case NVME_CR_NSSR: 3049 /* ignore writes; don't support subsystem reset */ 3050 break; 3051 case NVME_CR_AQA: 3052 sc->regs.aqa = (uint32_t)value; 3053 break; 3054 case NVME_CR_ASQ_LOW: 3055 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3056 (0xFFFFF000 & value); 3057 break; 3058 case NVME_CR_ASQ_HI: 3059 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3060 (value << 32); 3061 break; 3062 case NVME_CR_ACQ_LOW: 3063 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3064 (0xFFFFF000 & value); 3065 break; 3066 case NVME_CR_ACQ_HI: 3067 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3068 (value << 32); 3069 break; 3070 default: 3071 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3072 __func__, offset, value, size); 3073 } 3074 pthread_mutex_unlock(&sc->mtx); 3075 } 3076 3077 static void 3078 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3079 uint64_t value) 3080 { 3081 struct pci_nvme_softc* sc = pi->pi_arg; 3082 3083 if (baridx == pci_msix_table_bar(pi) || 3084 baridx == pci_msix_pba_bar(pi)) { 3085 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3086 " value 0x%lx", baridx, offset, size, value); 3087 3088 pci_emul_msix_twrite(pi, offset, size, value); 3089 return; 3090 } 3091 3092 switch (baridx) { 3093 case 0: 3094 pci_nvme_write_bar_0(sc, offset, size, value); 3095 break; 3096 3097 default: 3098 DPRINTF("%s unknown baridx %d, val 0x%lx", 3099 __func__, baridx, value); 3100 } 3101 } 3102 3103 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3104 uint64_t offset, int size) 3105 { 3106 uint64_t value; 3107 3108 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3109 3110 if (offset < NVME_DOORBELL_OFFSET) { 3111 void *p = &(sc->regs); 3112 pthread_mutex_lock(&sc->mtx); 3113 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3114 pthread_mutex_unlock(&sc->mtx); 3115 } else { 3116 value = 0; 3117 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3118 } 3119 3120 switch (size) { 3121 case 1: 3122 value &= 0xFF; 3123 break; 3124 case 2: 3125 value &= 0xFFFF; 3126 break; 3127 case 4: 3128 value &= 0xFFFFFFFF; 3129 break; 3130 } 3131 3132 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3133 offset, size, (uint32_t)value); 3134 3135 return (value); 3136 } 3137 3138 3139 3140 static uint64_t 3141 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3142 { 3143 struct pci_nvme_softc* sc = pi->pi_arg; 3144 3145 if (baridx == pci_msix_table_bar(pi) || 3146 baridx == pci_msix_pba_bar(pi)) { 3147 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3148 baridx, offset, size); 3149 3150 return pci_emul_msix_tread(pi, offset, size); 3151 } 3152 3153 switch (baridx) { 3154 case 0: 3155 return pci_nvme_read_bar_0(sc, offset, size); 3156 3157 default: 3158 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3159 } 3160 3161 return (0); 3162 } 3163 3164 static int 3165 
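/*
 * Configuration values parsed below are validated conservatively:
 * "qsz" and "ioslots" must be positive, "maxq" is clamped to
 * NVME_QUEUES at the end, and a missing "ser" falls back to a serial
 * number derived from the PCI slot and function.
 */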
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3166 { 3167 char bident[sizeof("XXX:XXX")]; 3168 const char *value; 3169 uint32_t sectsz; 3170 3171 sc->max_queues = NVME_QUEUES; 3172 sc->max_qentries = NVME_MAX_QENTRIES; 3173 sc->ioslots = NVME_IOSLOTS; 3174 sc->num_squeues = sc->max_queues; 3175 sc->num_cqueues = sc->max_queues; 3176 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3177 sectsz = 0; 3178 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3179 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3180 3181 value = get_config_value_node(nvl, "maxq"); 3182 if (value != NULL) 3183 sc->max_queues = atoi(value); 3184 value = get_config_value_node(nvl, "qsz"); 3185 if (value != NULL) { 3186 sc->max_qentries = atoi(value); 3187 if (sc->max_qentries <= 0) { 3188 EPRINTLN("nvme: Invalid qsz option %d", 3189 sc->max_qentries); 3190 return (-1); 3191 } 3192 } 3193 value = get_config_value_node(nvl, "ioslots"); 3194 if (value != NULL) { 3195 sc->ioslots = atoi(value); 3196 if (sc->ioslots <= 0) { 3197 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3198 return (-1); 3199 } 3200 } 3201 value = get_config_value_node(nvl, "sectsz"); 3202 if (value != NULL) 3203 sectsz = atoi(value); 3204 value = get_config_value_node(nvl, "ser"); 3205 if (value != NULL) { 3206 /* 3207 * This field indicates the Product Serial Number in 3208 * 7-bit ASCII, unused bytes should be space characters. 3209 * Ref: NVMe v1.3c. 3210 */ 3211 cpywithpad((char *)sc->ctrldata.sn, 3212 sizeof(sc->ctrldata.sn), value, ' '); 3213 } 3214 value = get_config_value_node(nvl, "eui64"); 3215 if (value != NULL) 3216 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3217 value = get_config_value_node(nvl, "dsm"); 3218 if (value != NULL) { 3219 if (strcmp(value, "auto") == 0) 3220 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3221 else if (strcmp(value, "enable") == 0) 3222 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3223 else if (strcmp(value, "disable") == 0) 3224 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3225 } 3226 3227 value = get_config_value_node(nvl, "bootindex"); 3228 if (value != NULL) { 3229 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3230 EPRINTLN("Invalid bootindex %d", atoi(value)); 3231 return (-1); 3232 } 3233 } 3234 3235 value = get_config_value_node(nvl, "ram"); 3236 if (value != NULL) { 3237 uint64_t sz = strtoull(value, NULL, 10); 3238 3239 sc->nvstore.type = NVME_STOR_RAM; 3240 sc->nvstore.size = sz * 1024 * 1024; 3241 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3242 sc->nvstore.sectsz = 4096; 3243 sc->nvstore.sectsz_bits = 12; 3244 if (sc->nvstore.ctx == NULL) { 3245 EPRINTLN("nvme: Unable to allocate RAM"); 3246 return (-1); 3247 } 3248 } else { 3249 snprintf(bident, sizeof(bident), "%u:%u", 3250 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3251 sc->nvstore.ctx = blockif_open(nvl, bident); 3252 if (sc->nvstore.ctx == NULL) { 3253 EPRINTLN("nvme: Could not open backing file: %s", 3254 strerror(errno)); 3255 return (-1); 3256 } 3257 sc->nvstore.type = NVME_STOR_BLOCKIF; 3258 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3259 } 3260 3261 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3262 sc->nvstore.sectsz = sectsz; 3263 else if (sc->nvstore.type != NVME_STOR_RAM) 3264 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3265 for (sc->nvstore.sectsz_bits = 9; 3266 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3267 sc->nvstore.sectsz_bits++); 3268 3269 if (sc->max_queues <= 0 || sc->max_queues > 
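/*
 * The loop above derives sectsz_bits as log2 of the sector size,
 * e.g. 512 -> 9, 4096 -> 12, 8192 -> 13.
 */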
NVME_QUEUES) 3270 sc->max_queues = NVME_QUEUES; 3271 3272 return (0); 3273 } 3274 3275 static void 3276 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3277 size_t new_size) 3278 { 3279 struct pci_nvme_softc *sc; 3280 struct pci_nvme_blockstore *nvstore; 3281 struct nvme_namespace_data *nd; 3282 3283 sc = arg; 3284 nvstore = &sc->nvstore; 3285 nd = &sc->nsdata; 3286 3287 nvstore->size = new_size; 3288 pci_nvme_init_nsdata_size(nvstore, nd); 3289 3290 /* Add changed NSID to list */ 3291 sc->ns_log.ns[0] = 1; 3292 sc->ns_log.ns[1] = 0; 3293 3294 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3295 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3296 } 3297 3298 static int 3299 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3300 { 3301 struct pci_nvme_softc *sc; 3302 uint32_t pci_membar_sz; 3303 int error; 3304 3305 error = 0; 3306 3307 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3308 pi->pi_arg = sc; 3309 sc->nsc_pi = pi; 3310 3311 error = pci_nvme_parse_config(sc, nvl); 3312 if (error < 0) 3313 goto done; 3314 else 3315 error = 0; 3316 3317 STAILQ_INIT(&sc->ioreqs_free); 3318 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3319 for (uint32_t i = 0; i < sc->ioslots; i++) { 3320 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3321 } 3322 3323 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3324 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3325 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3326 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3327 pci_set_cfgdata8(pi, PCIR_PROGIF, 3328 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3329 3330 /* 3331 * Allocate size of NVMe registers + doorbell space for all queues. 3332 * 3333 * The specification requires a minimum memory I/O window size of 16K. 3334 * The Windows driver will refuse to start a device with a smaller 3335 * window. 3336 */ 3337 pci_membar_sz = sizeof(struct nvme_registers) + 3338 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3339 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3340 3341 DPRINTF("nvme membar size: %u", pci_membar_sz); 3342 3343 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3344 if (error) { 3345 WPRINTF("%s pci alloc mem bar failed", __func__); 3346 goto done; 3347 } 3348 3349 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3350 if (error) { 3351 WPRINTF("%s pci add msixcap failed", __func__); 3352 goto done; 3353 } 3354 3355 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3356 if (error) { 3357 WPRINTF("%s pci add Express capability failed", __func__); 3358 goto done; 3359 } 3360 3361 pthread_mutex_init(&sc->mtx, NULL); 3362 sem_init(&sc->iosemlock, 0, sc->ioslots); 3363 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3364 3365 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3366 /* 3367 * Controller data depends on Namespace data so initialize Namespace 3368 * data first. 
3369 */ 3370 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3371 pci_nvme_init_ctrldata(sc); 3372 pci_nvme_init_logpages(sc); 3373 pci_nvme_init_features(sc); 3374 3375 pci_nvme_aer_init(sc); 3376 pci_nvme_aen_init(sc); 3377 3378 pci_nvme_reset(sc); 3379 done: 3380 return (error); 3381 } 3382 3383 static int 3384 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3385 { 3386 char *cp, *ram; 3387 3388 if (opts == NULL) 3389 return (0); 3390 3391 if (strncmp(opts, "ram=", 4) == 0) { 3392 cp = strchr(opts, ','); 3393 if (cp == NULL) { 3394 set_config_value_node(nvl, "ram", opts + 4); 3395 return (0); 3396 } 3397 ram = strndup(opts + 4, cp - opts - 4); 3398 set_config_value_node(nvl, "ram", ram); 3399 free(ram); 3400 return (pci_parse_legacy_config(nvl, cp + 1)); 3401 } else 3402 return (blockif_legacy_config(nvl, opts)); 3403 } 3404 3405 static const struct pci_devemu pci_de_nvme = { 3406 .pe_emu = "nvme", 3407 .pe_init = pci_nvme_init, 3408 .pe_legacy_config = pci_nvme_legacy_config, 3409 .pe_barwrite = pci_nvme_write, 3410 .pe_barread = pci_nvme_read 3411 }; 3412 PCI_EMUL_SET(pci_de_nvme); 3413