/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */
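/*
 * Illustrative example only; the device path and option values below are
 * hypothetical, not defaults taken from this file:
 *
 *  -s 4,nvme,/dev/zvol/tank/nvmedisk,maxq=8,qsz=512,ioslots=16,sectsz=512,ser=NVME0001,dsm=auto
 */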

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/crc16.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
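/*
 * With the values above (MDTS = 9, MPSMIN = 0, i.e. 4 KiB pages), the
 * largest single transfer the controller advertises works out to
 * (1 << 9) * 4096 bytes = 2 MiB, spread over at most NVME_MAX_IOVEC
 * page-sized descriptors.
 */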

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE 296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define	PCI_NVME_AEI_NOTICE_SHIFT	8
#define	PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define	PCI_NVME_AEN_DEFAULT_MASK	0x11f
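/*
 * Decoding the default mask above: bits 4:0 (0x1f) enable the SMART /
 * Health critical warning events, and bit 8 (1 << PCI_NVME_AEI_NOTICE_SHIFT)
 * enables the Namespace Attribute Changed notice, giving 0x11f.
 */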

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	NVMEV(NVME_CC_REG_EN, cc)
#define	NVME_CC_GET_CSS(cc) \
	NVMEV(NVME_CC_REG_CSS, cc)
#define	NVME_CC_GET_SHN(cc) \
	NVMEV(NVME_CC_REG_SHN, cc)
#define	NVME_CC_GET_IOSQES(cc) \
	NVMEV(NVME_CC_REG_IOSQES, cc)
#define	NVME_CC_GET_IOCQES(cc) \
	NVMEV(NVME_CC_REG_IOCQES, cc)

#define	NVME_CC_WRITE_MASK \
	(NVMEM(NVME_CC_REG_EN) | \
	 NVMEM(NVME_CC_REG_IOSQES) | \
	 NVMEM(NVME_CC_REG_IOCQES))

#define	NVME_CC_NEN_WRITE_MASK \
	(NVMEM(NVME_CC_REG_CSS) | \
	 NVMEM(NVME_CC_REG_MPS) | \
	 NVMEM(NVME_CC_REG_AMS))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	NVMEV(NVME_CSTS_REG_RDY, sts)

#define	NVME_CSTS_RDY	(NVMEF(NVME_CSTS_REG_RDY, 1))
#define	NVME_CSTS_CFS	(NVMEF(NVME_CSTS_REG_CFS, 1))

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(NVMEF(NVME_STATUS_P, 1))
#define	NVME_STATUS_MASK \
	(NVMEM(NVME_STATUS_SCT) | \
	 NVMEM(NVME_STATUS_SC))

#define	NVME_ONCS_DSM	NVMEM(NVME_CTRLR_DATA_ONCS_DSM)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
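/*
 * For example, cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ')
 * (as used below in pci_nvme_init_ctrldata) copies the string and then
 * space-fills the remainder of the fixed-width Identify field, matching how
 * ASCII Identify fields are expected to be padded.
 */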

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= NVMEF(NVME_STATUS_SCT, type) | NVMEF(NVME_STATUS_SC, code);
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;
	int ret;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0xfc;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0x58;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = NVMEF(NVME_CTRLR_DATA_OACS_FORMAT, 1);
	cd->oaes = NVMEM(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEM(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    NVMEF(NVME_CTRLR_DATA_FRMW_NUM_SLOTS, 1);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = NVMEF(NVME_CTRLR_DATA_SANICAP_NODMMAS,
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_NO);

	cd->sqes = NVMEF(NVME_CTRLR_DATA_SQES_MAX, 6) |
	    NVMEF(NVME_CTRLR_DATA_SQES_MIN, 6);
	cd->cqes = NVMEF(NVME_CTRLR_DATA_CQES_MAX, 4) |
	    NVMEF(NVME_CTRLR_DATA_CQES_MIN, 4);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVMEM(NVME_CTRLR_DATA_FNA_FORMAT_ALL);

	cd->vwc = NVMEF(NVME_CTRLR_DATA_VWC_ALL, NVME_CTRLR_DATA_VWC_ALL_NO);

	ret = snprintf(cd->subnqn, sizeof(cd->subnqn),
	    "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u",
	    get_config_value("name"), sc->nsc_pi->pi_bus,
	    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn)))
		EPRINTLN("%s: error setting subnqn (%d)", __func__, ret);
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, nvstore->sectsz_bits);
}
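/*
 * Sketch of the generated EUI-64 layout (when the user does not supply one):
 * the FreeBSD NVMe OUI combined with a CRC16 of a per-device string forms
 * the upper bits, shifted left so the low 16 bits carry the namespace ID.
 */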

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}
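/*
 * Summary of the flow above: pci_nvme_aen_post() marks an event of a given
 * type as pending and signals aen_cond; aen_thr() wakes and calls
 * pci_nvme_aen_process(), which pairs each unmasked pending AEN with a
 * host-supplied AER, writes the completion to the Admin CQ (CDW0 carries
 * the log page id, event info, and event type), and raises MSI-X vector 0.
 */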

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    NVMEF(NVME_CAP_LO_REG_CQR, 1) |
	    NVMEF(NVME_CAP_LO_REG_TO, 60);

	sc->regs.cap_hi = NVMEF(NVME_CAP_HI_REG_CSS_NVM, 1);

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(NVMEV(NVME_AQA_REG_ASQS, sc->regs.aqa));
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED(NVMEV(NVME_AQA_REG_ACQS, sc->regs.aqa));
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
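/*
 * Note on the phase handling above: the Phase Tag (P) written into each new
 * completion entry is the inverse of whatever the slot previously held, so
 * a host polling the CQ can distinguish freshly written entries from stale
 * ones without consulting a separate index.
 */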

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
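/*
 * Worked example for the size math above: a request with NUMDL = 0x3ff and
 * NUMDU = 0 asks for (0x3ff + 1) dwords, i.e. 1024 * 4 = 4096 bytes, which
 * is then clamped to the size of the selected log page via MIN().
 */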

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
Window"; 1614 break; 1615 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1616 name = "LBA Status Information Report Interval"; 1617 break; 1618 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1619 name = "Host Behavior Support"; 1620 break; 1621 case NVME_FEAT_SANITIZE_CONFIG: 1622 name = "Sanitize Config"; 1623 break; 1624 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1625 name = "Endurance Group Event Configuration"; 1626 break; 1627 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1628 name = "Software Progress Marker"; 1629 break; 1630 case NVME_FEAT_HOST_IDENTIFIER: 1631 name = "Host Identifier"; 1632 break; 1633 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1634 name = "Reservation Notification Mask"; 1635 break; 1636 case NVME_FEAT_RESERVATION_PERSISTENCE: 1637 name = "Reservation Persistence"; 1638 break; 1639 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1640 name = "Namespace Write Protection Config"; 1641 break; 1642 default: 1643 name = "Unknown"; 1644 break; 1645 } 1646 1647 return (name); 1648 } 1649 1650 static void 1651 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1652 struct nvme_feature_obj *feat __unused, 1653 struct nvme_command *command __unused, 1654 struct nvme_completion *compl) 1655 { 1656 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1657 } 1658 1659 static void 1660 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1661 struct nvme_feature_obj *feat __unused, 1662 struct nvme_command *command, 1663 struct nvme_completion *compl) 1664 { 1665 uint32_t i; 1666 uint32_t cdw11 = command->cdw11; 1667 uint16_t iv; 1668 bool cd; 1669 1670 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1671 1672 iv = cdw11 & 0xffff; 1673 cd = cdw11 & (1 << 16); 1674 1675 if (iv > (sc->max_queues + 1)) { 1676 return; 1677 } 1678 1679 /* No Interrupt Coalescing (i.e. 

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
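/*
 * Worked example of the NVME_FEATURE_NUM_QUEUES() encoding returned above:
 * if the controller settles on 8 submission and 8 completion queues, the
 * zero-based counts are 7 and 7, so CDW0 (and the saved CDW11) become
 * 0x00070007: completion queue count in the high word, submission queue
 * count in the low word.
 */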

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
    struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
sc->ctrldata.aerl, command->cid); 1984 1985 /* Don't exceed the Async Event Request Limit (AERL). */ 1986 if (pci_nvme_aer_limit_reached(sc)) { 1987 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1988 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1989 return (1); 1990 } 1991 1992 if (pci_nvme_aer_add(sc, command->cid)) { 1993 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1994 NVME_SC_INTERNAL_DEVICE_ERROR); 1995 return (1); 1996 } 1997 1998 /* 1999 * Raise events when they happen based on the Set Features cmd. 2000 * These events happen async, so only set completion successful if 2001 * there is an event reflective of the request to get event. 2002 */ 2003 compl->status = NVME_NO_STATUS; 2004 pci_nvme_aen_notify(sc); 2005 2006 return (0); 2007 } 2008 2009 static void 2010 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2011 { 2012 struct nvme_completion compl; 2013 struct nvme_command *cmd; 2014 struct nvme_submission_queue *sq; 2015 struct nvme_completion_queue *cq; 2016 uint16_t sqhead; 2017 2018 DPRINTF("%s index %u", __func__, (uint32_t)value); 2019 2020 sq = &sc->submit_queues[0]; 2021 cq = &sc->compl_queues[0]; 2022 2023 pthread_mutex_lock(&sq->mtx); 2024 2025 sqhead = sq->head; 2026 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2027 2028 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2029 cmd = &(sq->qbase)[sqhead]; 2030 compl.cdw0 = 0; 2031 compl.status = 0; 2032 2033 switch (cmd->opc) { 2034 case NVME_OPC_DELETE_IO_SQ: 2035 DPRINTF("%s command DELETE_IO_SQ", __func__); 2036 nvme_opc_delete_io_sq(sc, cmd, &compl); 2037 break; 2038 case NVME_OPC_CREATE_IO_SQ: 2039 DPRINTF("%s command CREATE_IO_SQ", __func__); 2040 nvme_opc_create_io_sq(sc, cmd, &compl); 2041 break; 2042 case NVME_OPC_DELETE_IO_CQ: 2043 DPRINTF("%s command DELETE_IO_CQ", __func__); 2044 nvme_opc_delete_io_cq(sc, cmd, &compl); 2045 break; 2046 case NVME_OPC_CREATE_IO_CQ: 2047 DPRINTF("%s command CREATE_IO_CQ", __func__); 2048 nvme_opc_create_io_cq(sc, cmd, &compl); 2049 break; 2050 case NVME_OPC_GET_LOG_PAGE: 2051 DPRINTF("%s command GET_LOG_PAGE", __func__); 2052 nvme_opc_get_log_page(sc, cmd, &compl); 2053 break; 2054 case NVME_OPC_IDENTIFY: 2055 DPRINTF("%s command IDENTIFY", __func__); 2056 nvme_opc_identify(sc, cmd, &compl); 2057 break; 2058 case NVME_OPC_ABORT: 2059 DPRINTF("%s command ABORT", __func__); 2060 nvme_opc_abort(sc, cmd, &compl); 2061 break; 2062 case NVME_OPC_SET_FEATURES: 2063 DPRINTF("%s command SET_FEATURES", __func__); 2064 nvme_opc_set_features(sc, cmd, &compl); 2065 break; 2066 case NVME_OPC_GET_FEATURES: 2067 DPRINTF("%s command GET_FEATURES", __func__); 2068 nvme_opc_get_features(sc, cmd, &compl); 2069 break; 2070 case NVME_OPC_FIRMWARE_ACTIVATE: 2071 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2072 pci_nvme_status_tc(&compl.status, 2073 NVME_SCT_COMMAND_SPECIFIC, 2074 NVME_SC_INVALID_FIRMWARE_SLOT); 2075 break; 2076 case NVME_OPC_ASYNC_EVENT_REQUEST: 2077 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2078 nvme_opc_async_event_req(sc, cmd, &compl); 2079 break; 2080 case NVME_OPC_FORMAT_NVM: 2081 DPRINTF("%s command FORMAT_NVM", __func__); 2082 if (NVMEV(NVME_CTRLR_DATA_OACS_FORMAT, 2083 sc->ctrldata.oacs) == 0) { 2084 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2085 break; 2086 } 2087 nvme_opc_format_nvm(sc, cmd, &compl); 2088 break; 2089 case NVME_OPC_SECURITY_SEND: 2090 case NVME_OPC_SECURITY_RECEIVE: 2091 case NVME_OPC_SANITIZE: 2092 case NVME_OPC_GET_LBA_STATUS: 2093 DPRINTF("%s command OPC=%#x (unsupported)", 
__func__, 2094 cmd->opc); 2095 /* Valid but unsupported opcodes */ 2096 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2097 break; 2098 default: 2099 DPRINTF("%s command OPC=%#X (not implemented)", 2100 __func__, 2101 cmd->opc); 2102 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2103 } 2104 sqhead = (sqhead + 1) % sq->size; 2105 2106 if (NVME_COMPLETION_VALID(compl)) { 2107 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2108 compl.cdw0, 2109 cmd->cid, 2110 0, /* SQID */ 2111 compl.status); 2112 } 2113 } 2114 2115 DPRINTF("setting sqhead %u", sqhead); 2116 sq->head = sqhead; 2117 2118 if (cq->head != cq->tail) 2119 pci_generate_msix(sc->nsc_pi, 0); 2120 2121 pthread_mutex_unlock(&sq->mtx); 2122 } 2123 2124 /* 2125 * Update the Write and Read statistics reported in SMART data 2126 * 2127 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2128 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2129 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2130 */ 2131 static void 2132 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2133 size_t bytes, uint16_t status) 2134 { 2135 2136 pthread_mutex_lock(&sc->mtx); 2137 switch (opc) { 2138 case NVME_OPC_WRITE: 2139 sc->write_commands++; 2140 if (status != NVME_SC_SUCCESS) 2141 break; 2142 sc->write_dunits_remainder += (bytes / 512); 2143 while (sc->write_dunits_remainder >= 1000) { 2144 sc->write_data_units++; 2145 sc->write_dunits_remainder -= 1000; 2146 } 2147 break; 2148 case NVME_OPC_READ: 2149 sc->read_commands++; 2150 if (status != NVME_SC_SUCCESS) 2151 break; 2152 sc->read_dunits_remainder += (bytes / 512); 2153 while (sc->read_dunits_remainder >= 1000) { 2154 sc->read_data_units++; 2155 sc->read_dunits_remainder -= 1000; 2156 } 2157 break; 2158 default: 2159 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2160 break; 2161 } 2162 pthread_mutex_unlock(&sc->mtx); 2163 } 2164 2165 /* 2166 * Check if the combination of Starting LBA (slba) and number of blocks 2167 * exceeds the range of the underlying storage. 2168 * 2169 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2170 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2171 * overflow. 2172 */ 2173 static bool 2174 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2175 uint32_t nblocks) 2176 { 2177 size_t offset, bytes; 2178 2179 /* Overflow check of multiplying Starting LBA by the sector size */ 2180 if (slba >> (64 - nvstore->sectsz_bits)) 2181 return (true); 2182 2183 offset = slba << nvstore->sectsz_bits; 2184 bytes = nblocks << nvstore->sectsz_bits; 2185 2186 /* Overflow check of Number of Logical Blocks */ 2187 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2188 return (true); 2189 2190 return (false); 2191 } 2192 2193 static int 2194 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2195 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2196 { 2197 int iovidx; 2198 bool range_is_contiguous; 2199 2200 if (req == NULL) 2201 return (-1); 2202 2203 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2204 return (-1); 2205 } 2206 2207 /* 2208 * Minimize the number of IOVs by concatenating contiguous address 2209 * ranges. If the IOV count is zero, there is no previous range to 2210 * concatenate. 
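 *
 * Illustrative example (hypothetical guest addresses): if the previous
 * descriptor covered guest physical addresses 0x10000-0x10fff
 * (prev_gpaddr = 0x10000, prev_size = 0x1000) and the next PRP entry starts
 * at 0x11000, the ranges are contiguous, so the existing iovec's iov_len is
 * simply grown from 0x1000 to 0x2000 rather than consuming another of the
 * NVME_MAX_IOVEC entries in br_iov[].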
2211 */ 2212 if (req->io_req.br_iovcnt == 0) 2213 range_is_contiguous = false; 2214 else 2215 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2216 2217 if (range_is_contiguous) { 2218 iovidx = req->io_req.br_iovcnt - 1; 2219 2220 req->io_req.br_iov[iovidx].iov_base = 2221 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2222 req->prev_gpaddr, size); 2223 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2224 return (-1); 2225 2226 req->prev_size += size; 2227 req->io_req.br_resid += size; 2228 2229 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2230 } else { 2231 iovidx = req->io_req.br_iovcnt; 2232 if (iovidx == 0) { 2233 req->io_req.br_offset = offset; 2234 req->io_req.br_resid = 0; 2235 req->io_req.br_param = req; 2236 } 2237 2238 req->io_req.br_iov[iovidx].iov_base = 2239 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2240 gpaddr, size); 2241 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2242 return (-1); 2243 2244 req->io_req.br_iov[iovidx].iov_len = size; 2245 2246 req->prev_gpaddr = gpaddr; 2247 req->prev_size = size; 2248 req->io_req.br_resid += size; 2249 2250 req->io_req.br_iovcnt++; 2251 } 2252 2253 return (0); 2254 } 2255 2256 static void 2257 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2258 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2259 { 2260 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2261 2262 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2263 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2264 NVME_STATUS_GET_SC(status)); 2265 2266 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2267 2268 if (cq->head != cq->tail) { 2269 if (cq->intr_en & NVME_CQ_INTEN) { 2270 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2271 } else { 2272 DPRINTF("%s: CQ%u interrupt disabled", 2273 __func__, sq->cqid); 2274 } 2275 } 2276 } 2277 2278 static void 2279 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2280 { 2281 req->sc = NULL; 2282 req->nvme_sq = NULL; 2283 req->sqid = 0; 2284 2285 pthread_mutex_lock(&sc->mtx); 2286 2287 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2288 sc->pending_ios--; 2289 2290 /* when no more IO pending, can set to ready if device reset/enabled */ 2291 if (sc->pending_ios == 0 && 2292 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2293 sc->regs.csts |= NVME_CSTS_RDY; 2294 2295 pthread_mutex_unlock(&sc->mtx); 2296 2297 sem_post(&sc->iosemlock); 2298 } 2299 2300 static struct pci_nvme_ioreq * 2301 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2302 { 2303 struct pci_nvme_ioreq *req = NULL; 2304 2305 sem_wait(&sc->iosemlock); 2306 pthread_mutex_lock(&sc->mtx); 2307 2308 req = STAILQ_FIRST(&sc->ioreqs_free); 2309 assert(req != NULL); 2310 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2311 2312 req->sc = sc; 2313 2314 sc->pending_ios++; 2315 2316 pthread_mutex_unlock(&sc->mtx); 2317 2318 req->io_req.br_iovcnt = 0; 2319 req->io_req.br_offset = 0; 2320 req->io_req.br_resid = 0; 2321 req->io_req.br_param = req; 2322 req->prev_gpaddr = 0; 2323 req->prev_size = 0; 2324 2325 return req; 2326 } 2327 2328 static void 2329 pci_nvme_io_done(struct blockif_req *br, int err) 2330 { 2331 struct pci_nvme_ioreq *req = br->br_param; 2332 struct nvme_submission_queue *sq = req->nvme_sq; 2333 uint16_t code, status; 2334 2335 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2336 2337 /* TODO return correct error */ 2338 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2339 status = 0; 2340 pci_nvme_status_genc(&status, code); 2341 2342 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2343 pci_nvme_stats_write_read_update(req->sc, req->opc, 2344 req->bytes, status); 2345 pci_nvme_release_ioreq(req->sc, req); 2346 } 2347 2348 /* 2349 * Implements the Flush command. The specification states: 2350 * If a volatile write cache is not present, Flush commands complete 2351 * successfully and have no effect 2352 * in the description of the Volatile Write Cache (VWC) field of the Identify 2353 * Controller data. Therefore, set status to Success if the command is 2354 * not supported (i.e. RAM or as indicated by the blockif). 2355 */ 2356 static bool 2357 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2358 struct nvme_command *cmd __unused, 2359 struct pci_nvme_blockstore *nvstore, 2360 struct pci_nvme_ioreq *req, 2361 uint16_t *status) 2362 { 2363 bool pending = false; 2364 2365 if (nvstore->type == NVME_STOR_RAM) { 2366 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2367 } else { 2368 int err; 2369 2370 req->io_req.br_callback = pci_nvme_io_done; 2371 2372 err = blockif_flush(nvstore->ctx, &req->io_req); 2373 switch (err) { 2374 case 0: 2375 pending = true; 2376 break; 2377 case EOPNOTSUPP: 2378 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2379 break; 2380 default: 2381 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2382 } 2383 } 2384 2385 return (pending); 2386 } 2387 2388 static uint16_t 2389 nvme_write_read_ram(struct pci_nvme_softc *sc, 2390 struct pci_nvme_blockstore *nvstore, 2391 uint64_t prp1, uint64_t prp2, 2392 size_t offset, uint64_t bytes, 2393 bool is_write) 2394 { 2395 uint8_t *buf = nvstore->ctx; 2396 enum nvme_copy_dir dir; 2397 uint16_t status; 2398 2399 if (is_write) 2400 dir = NVME_COPY_TO_PRP; 2401 else 2402 dir = NVME_COPY_FROM_PRP; 2403 2404 status = 0; 2405 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2406 buf + offset, bytes, dir)) 2407 pci_nvme_status_genc(&status, 2408 NVME_SC_DATA_TRANSFER_ERROR); 2409 else 2410 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2411 2412 return (status); 2413 } 2414 2415 static uint16_t 2416 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2417 struct pci_nvme_blockstore *nvstore, 2418 struct pci_nvme_ioreq *req, 2419 uint64_t prp1, uint64_t prp2, 2420 size_t offset, uint64_t bytes, 2421 bool is_write) 2422 { 2423 uint64_t size; 2424 int err; 2425 uint16_t status = NVME_NO_STATUS; 2426 2427 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2428 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2429 err = -1; 2430 goto out; 2431 } 2432 2433 offset += size; 2434 bytes -= size; 2435 2436 if (bytes == 0) { 2437 ; 2438 } else if (bytes <= PAGE_SIZE) { 2439 size = bytes; 2440 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2441 err = -1; 2442 goto out; 2443 } 2444 } else { 2445 void *vmctx = sc->nsc_pi->pi_vmctx; 2446 uint64_t *prp_list = &prp2; 2447 uint64_t *last = prp_list; 2448 2449 /* PRP2 is pointer to a physical region page list */ 2450 while (bytes) { 2451 /* Last entry in list points to the next list */ 2452 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2453 uint64_t prp = *prp_list; 2454 2455 prp_list = paddr_guest2host(vmctx, prp, 2456 PAGE_SIZE - (prp % PAGE_SIZE)); 2457 if (prp_list == NULL) { 2458 err = -1; 2459 goto out; 2460 } 2461 last = prp_list + (NVME_PRP2_ITEMS - 1); 2462 } 2463 2464 size = MIN(bytes, PAGE_SIZE); 2465 2466 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2467 offset)) { 2468 err = -1; 2469 goto out; 2470 } 2471 2472 offset += size; 2473 bytes -= size; 2474 2475 prp_list++; 2476 } 2477 } 2478 req->io_req.br_callback = pci_nvme_io_done; 2479 if (is_write) 2480 err = blockif_write(nvstore->ctx, &req->io_req); 2481 else 2482 err = blockif_read(nvstore->ctx, &req->io_req); 2483 out: 2484 if (err) 2485 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2486 2487 return (status); 2488 } 2489 2490 static bool 2491 nvme_opc_write_read(struct pci_nvme_softc *sc, 2492 struct nvme_command *cmd, 2493 struct pci_nvme_blockstore *nvstore, 2494 struct pci_nvme_ioreq *req, 2495 uint16_t *status) 2496 { 2497 uint64_t lba, nblocks, bytes; 2498 size_t offset; 2499 bool is_write = cmd->opc == NVME_OPC_WRITE; 2500 bool pending = false; 2501 2502 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2503 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2504 bytes = nblocks << nvstore->sectsz_bits; 2505 if (bytes > NVME_MAX_DATA_SIZE) { 2506 WPRINTF("%s command would exceed MDTS", __func__); 2507 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2508 goto out; 2509 } 2510 2511 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2512 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2513 __func__, lba, nblocks); 2514 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2515 goto out; 2516 } 2517 2518 offset = lba << nvstore->sectsz_bits; 2519 2520 req->bytes = bytes; 2521 req->io_req.br_offset = lba; 2522 2523 /* PRP bits 1:0 must be zero */ 2524 cmd->prp1 &= ~0x3UL; 2525 cmd->prp2 &= ~0x3UL; 2526 2527 if (nvstore->type == NVME_STOR_RAM) { 2528 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2529 cmd->prp2, offset, bytes, is_write); 2530 } else { 2531 *status = nvme_write_read_blockif(sc, nvstore, req, 2532 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2533 2534 if (*status == NVME_NO_STATUS) 2535 pending = true; 2536 } 2537 out: 2538 if (!pending) 2539 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2540 2541 return (pending); 2542 } 2543 2544 static void 2545 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2546 { 2547 struct pci_nvme_ioreq *req = br->br_param; 2548 struct pci_nvme_softc *sc = req->sc; 2549 bool done = true; 2550 uint16_t status; 2551 2552 status = 0; 2553 if (err) { 2554 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2555 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2556 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2557 } else { 2558 struct iovec *iov = req->io_req.br_iov; 2559 2560 req->prev_gpaddr++; 2561 iov += req->prev_gpaddr; 2562 2563 /* The iov_* values already include the sector size */ 2564 req->io_req.br_offset = (off_t)iov->iov_base; 2565 req->io_req.br_resid = iov->iov_len; 2566 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2567 pci_nvme_status_genc(&status, 2568 NVME_SC_INTERNAL_DEVICE_ERROR); 2569 } else 2570 done = false; 2571 } 2572 2573 if (done) { 2574 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2575 status); 2576 pci_nvme_release_ioreq(sc, req); 2577 } 2578 } 2579 2580 static bool 2581 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2582 struct nvme_command *cmd, 2583 struct pci_nvme_blockstore *nvstore, 2584 struct pci_nvme_ioreq *req, 2585 uint16_t *status) 2586 { 2587 struct nvme_dsm_range *range = NULL; 2588 uint32_t nr, r, non_zero, dr; 2589 int err; 2590 bool pending = false; 2591 2592 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2593 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2594 goto out; 2595 } 
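	/*
	 * Illustrative example (hypothetical values): a guest trimming 2048
	 * blocks starting at LBA 4096 typically submits one
	 * struct nvme_dsm_range with starting_lba = 4096 and length = 2048,
	 * sets the Deallocate attribute (NVME_DSM_ATTR_DEALLOCATE) in cdw11,
	 * and encodes the zero-based Number of Ranges in cdw10 as 0.
	 */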
2596 2597 nr = cmd->cdw10 & 0xff; 2598 2599 /* copy locally because a range entry could straddle PRPs */ 2600 range = calloc(1, NVME_MAX_DSM_TRIM); 2601 if (range == NULL) { 2602 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2603 goto out; 2604 } 2605 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2606 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2607 2608 /* Check for invalid ranges and the number of non-zero lengths */ 2609 non_zero = 0; 2610 for (r = 0; r <= nr; r++) { 2611 if (pci_nvme_out_of_range(nvstore, 2612 range[r].starting_lba, range[r].length)) { 2613 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2614 goto out; 2615 } 2616 if (range[r].length != 0) 2617 non_zero++; 2618 } 2619 2620 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2621 size_t offset, bytes; 2622 int sectsz_bits = sc->nvstore.sectsz_bits; 2623 2624 /* 2625 * DSM calls are advisory only, and compliant controllers 2626 * may choose to take no actions (i.e. return Success). 2627 */ 2628 if (!nvstore->deallocate) { 2629 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2630 goto out; 2631 } 2632 2633 /* If all ranges have a zero length, return Success */ 2634 if (non_zero == 0) { 2635 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2636 goto out; 2637 } 2638 2639 if (req == NULL) { 2640 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2641 goto out; 2642 } 2643 2644 offset = range[0].starting_lba << sectsz_bits; 2645 bytes = range[0].length << sectsz_bits; 2646 2647 /* 2648 * If the request is for more than a single range, store 2649 * the ranges in the br_iov. Optimize for the common case 2650 * of a single range. 2651 * 2652 * Note that NVMe Number of Ranges is a zero based value 2653 */ 2654 req->io_req.br_iovcnt = 0; 2655 req->io_req.br_offset = offset; 2656 req->io_req.br_resid = bytes; 2657 2658 if (nr == 0) { 2659 req->io_req.br_callback = pci_nvme_io_done; 2660 } else { 2661 struct iovec *iov = req->io_req.br_iov; 2662 2663 for (r = 0, dr = 0; r <= nr; r++) { 2664 offset = range[r].starting_lba << sectsz_bits; 2665 bytes = range[r].length << sectsz_bits; 2666 if (bytes == 0) 2667 continue; 2668 2669 if ((nvstore->size - offset) < bytes) { 2670 pci_nvme_status_genc(status, 2671 NVME_SC_LBA_OUT_OF_RANGE); 2672 goto out; 2673 } 2674 iov[dr].iov_base = (void *)offset; 2675 iov[dr].iov_len = bytes; 2676 dr++; 2677 } 2678 req->io_req.br_callback = pci_nvme_dealloc_sm; 2679 2680 /* 2681 * Use prev_gpaddr to track the current entry and 2682 * prev_size to track the number of entries 2683 */ 2684 req->prev_gpaddr = 0; 2685 req->prev_size = dr; 2686 } 2687 2688 err = blockif_delete(nvstore->ctx, &req->io_req); 2689 if (err) 2690 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2691 else 2692 pending = true; 2693 } 2694 out: 2695 free(range); 2696 return (pending); 2697 } 2698 2699 static void 2700 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2701 { 2702 struct nvme_submission_queue *sq; 2703 uint16_t status; 2704 uint16_t sqhead; 2705 2706 /* handle all submissions up to sq->tail index */ 2707 sq = &sc->submit_queues[idx]; 2708 2709 pthread_mutex_lock(&sq->mtx); 2710 2711 sqhead = sq->head; 2712 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2713 idx, sqhead, sq->tail, sq->qbase); 2714 2715 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2716 struct nvme_command *cmd; 2717 struct pci_nvme_ioreq *req; 2718 uint32_t nsid; 2719 bool pending; 2720 2721 pending = false; 2722 req = NULL; 2723 status = 0; 2724 2725 cmd = 
&sq->qbase[sqhead]; 2726 sqhead = (sqhead + 1) % sq->size; 2727 2728 nsid = le32toh(cmd->nsid); 2729 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2730 pci_nvme_status_genc(&status, 2731 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2732 status |= NVMEM(NVME_STATUS_DNR); 2733 goto complete; 2734 } 2735 2736 req = pci_nvme_get_ioreq(sc); 2737 if (req == NULL) { 2738 pci_nvme_status_genc(&status, 2739 NVME_SC_INTERNAL_DEVICE_ERROR); 2740 WPRINTF("%s: unable to allocate IO req", __func__); 2741 goto complete; 2742 } 2743 req->nvme_sq = sq; 2744 req->sqid = idx; 2745 req->opc = cmd->opc; 2746 req->cid = cmd->cid; 2747 req->nsid = cmd->nsid; 2748 2749 switch (cmd->opc) { 2750 case NVME_OPC_FLUSH: 2751 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2752 req, &status); 2753 break; 2754 case NVME_OPC_WRITE: 2755 case NVME_OPC_READ: 2756 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2757 req, &status); 2758 break; 2759 case NVME_OPC_WRITE_ZEROES: 2760 /* TODO: write zeroes 2761 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2762 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2763 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2764 break; 2765 case NVME_OPC_DATASET_MANAGEMENT: 2766 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2767 req, &status); 2768 break; 2769 default: 2770 WPRINTF("%s unhandled io command 0x%x", 2771 __func__, cmd->opc); 2772 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2773 } 2774 complete: 2775 if (!pending) { 2776 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2777 if (req != NULL) 2778 pci_nvme_release_ioreq(sc, req); 2779 } 2780 } 2781 2782 sq->head = sqhead; 2783 2784 pthread_mutex_unlock(&sq->mtx); 2785 } 2786 2787 static void 2788 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2789 uint64_t idx, int is_sq, uint64_t value) 2790 { 2791 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2792 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2793 2794 if (is_sq) { 2795 if (idx > sc->num_squeues) { 2796 WPRINTF("%s queue index %lu overflow from " 2797 "guest (max %u)", 2798 __func__, idx, sc->num_squeues); 2799 return; 2800 } 2801 2802 atomic_store_short(&sc->submit_queues[idx].tail, 2803 (uint16_t)value); 2804 2805 if (idx == 0) { 2806 pci_nvme_handle_admin_cmd(sc, value); 2807 } else { 2808 /* submission queue; handle new entries in SQ */ 2809 if (idx > sc->num_squeues) { 2810 WPRINTF("%s SQ index %lu overflow from " 2811 "guest (max %u)", 2812 __func__, idx, sc->num_squeues); 2813 return; 2814 } 2815 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2816 } 2817 } else { 2818 if (idx > sc->num_cqueues) { 2819 WPRINTF("%s queue index %lu overflow from " 2820 "guest (max %u)", 2821 __func__, idx, sc->num_cqueues); 2822 return; 2823 } 2824 2825 atomic_store_short(&sc->compl_queues[idx].head, 2826 (uint16_t)value); 2827 } 2828 } 2829 2830 static void 2831 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2832 { 2833 const char *s = iswrite ? 
"WRITE" : "READ"; 2834 2835 switch (offset) { 2836 case NVME_CR_CAP_LOW: 2837 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2838 break; 2839 case NVME_CR_CAP_HI: 2840 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2841 break; 2842 case NVME_CR_VS: 2843 DPRINTF("%s %s NVME_CR_VS", func, s); 2844 break; 2845 case NVME_CR_INTMS: 2846 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2847 break; 2848 case NVME_CR_INTMC: 2849 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2850 break; 2851 case NVME_CR_CC: 2852 DPRINTF("%s %s NVME_CR_CC", func, s); 2853 break; 2854 case NVME_CR_CSTS: 2855 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2856 break; 2857 case NVME_CR_NSSR: 2858 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2859 break; 2860 case NVME_CR_AQA: 2861 DPRINTF("%s %s NVME_CR_AQA", func, s); 2862 break; 2863 case NVME_CR_ASQ_LOW: 2864 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2865 break; 2866 case NVME_CR_ASQ_HI: 2867 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2868 break; 2869 case NVME_CR_ACQ_LOW: 2870 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2871 break; 2872 case NVME_CR_ACQ_HI: 2873 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2874 break; 2875 default: 2876 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2877 } 2878 2879 } 2880 2881 static void 2882 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2883 uint64_t value) 2884 { 2885 uint32_t ccreg; 2886 2887 if (offset >= NVME_DOORBELL_OFFSET) { 2888 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2889 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2890 int is_sq = (belloffset % 8) < 4; 2891 2892 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2893 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2894 offset); 2895 return; 2896 } 2897 2898 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2899 WPRINTF("guest attempted an overflow write offset " 2900 "0x%lx, val 0x%lx in %s", 2901 offset, value, __func__); 2902 return; 2903 } 2904 2905 if (is_sq) { 2906 if (sc->submit_queues[idx].qbase == NULL) 2907 return; 2908 } else if (sc->compl_queues[idx].qbase == NULL) 2909 return; 2910 2911 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2912 return; 2913 } 2914 2915 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2916 offset, size, value); 2917 2918 if (size != 4) { 2919 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2920 "val 0x%lx) to bar0 in %s", 2921 size, offset, value, __func__); 2922 /* TODO: shutdown device */ 2923 return; 2924 } 2925 2926 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2927 2928 pthread_mutex_lock(&sc->mtx); 2929 2930 switch (offset) { 2931 case NVME_CR_CAP_LOW: 2932 case NVME_CR_CAP_HI: 2933 /* readonly */ 2934 break; 2935 case NVME_CR_VS: 2936 /* readonly */ 2937 break; 2938 case NVME_CR_INTMS: 2939 /* MSI-X, so ignore */ 2940 break; 2941 case NVME_CR_INTMC: 2942 /* MSI-X, so ignore */ 2943 break; 2944 case NVME_CR_CC: 2945 ccreg = (uint32_t)value; 2946 2947 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2948 "iocqes %u", 2949 __func__, 2950 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2951 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2952 NVME_CC_GET_IOCQES(ccreg)); 2953 2954 if (NVME_CC_GET_SHN(ccreg)) { 2955 /* perform shutdown - flush out data to backend */ 2956 sc->regs.csts &= ~NVMEM(NVME_CSTS_REG_SHST); 2957 sc->regs.csts |= NVMEF(NVME_CSTS_REG_SHST, 2958 NVME_SHST_COMPLETE); 2959 } 2960 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2961 if (NVME_CC_GET_EN(ccreg) == 0) 2962 /* transition 1-> causes controller reset */ 2963 pci_nvme_reset_locked(sc); 2964 
else 2965 pci_nvme_init_controller(sc); 2966 } 2967 2968 /* Insert the iocqes, iosqes and en bits from the write */ 2969 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2970 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2971 if (NVME_CC_GET_EN(ccreg) == 0) { 2972 /* Insert the ams, mps and css bit fields */ 2973 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2974 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2975 sc->regs.csts &= ~NVME_CSTS_RDY; 2976 } else if ((sc->pending_ios == 0) && 2977 !(sc->regs.csts & NVME_CSTS_CFS)) { 2978 sc->regs.csts |= NVME_CSTS_RDY; 2979 } 2980 break; 2981 case NVME_CR_CSTS: 2982 break; 2983 case NVME_CR_NSSR: 2984 /* ignore writes; don't support subsystem reset */ 2985 break; 2986 case NVME_CR_AQA: 2987 sc->regs.aqa = (uint32_t)value; 2988 break; 2989 case NVME_CR_ASQ_LOW: 2990 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2991 (0xFFFFF000 & value); 2992 break; 2993 case NVME_CR_ASQ_HI: 2994 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2995 (value << 32); 2996 break; 2997 case NVME_CR_ACQ_LOW: 2998 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2999 (0xFFFFF000 & value); 3000 break; 3001 case NVME_CR_ACQ_HI: 3002 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3003 (value << 32); 3004 break; 3005 default: 3006 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3007 __func__, offset, value, size); 3008 } 3009 pthread_mutex_unlock(&sc->mtx); 3010 } 3011 3012 static void 3013 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3014 uint64_t value) 3015 { 3016 struct pci_nvme_softc* sc = pi->pi_arg; 3017 3018 if (baridx == pci_msix_table_bar(pi) || 3019 baridx == pci_msix_pba_bar(pi)) { 3020 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3021 " value 0x%lx", baridx, offset, size, value); 3022 3023 pci_emul_msix_twrite(pi, offset, size, value); 3024 return; 3025 } 3026 3027 switch (baridx) { 3028 case 0: 3029 pci_nvme_write_bar_0(sc, offset, size, value); 3030 break; 3031 3032 default: 3033 DPRINTF("%s unknown baridx %d, val 0x%lx", 3034 __func__, baridx, value); 3035 } 3036 } 3037 3038 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3039 uint64_t offset, int size) 3040 { 3041 uint64_t value; 3042 3043 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3044 3045 if (offset < NVME_DOORBELL_OFFSET) { 3046 void *p = &(sc->regs); 3047 pthread_mutex_lock(&sc->mtx); 3048 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3049 pthread_mutex_unlock(&sc->mtx); 3050 } else { 3051 value = 0; 3052 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3053 } 3054 3055 switch (size) { 3056 case 1: 3057 value &= 0xFF; 3058 break; 3059 case 2: 3060 value &= 0xFFFF; 3061 break; 3062 case 4: 3063 value &= 0xFFFFFFFF; 3064 break; 3065 } 3066 3067 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3068 offset, size, (uint32_t)value); 3069 3070 return (value); 3071 } 3072 3073 3074 3075 static uint64_t 3076 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3077 { 3078 struct pci_nvme_softc* sc = pi->pi_arg; 3079 3080 if (baridx == pci_msix_table_bar(pi) || 3081 baridx == pci_msix_pba_bar(pi)) { 3082 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3083 baridx, offset, size); 3084 3085 return pci_emul_msix_tread(pi, offset, size); 3086 } 3087 3088 switch (baridx) { 3089 case 0: 3090 return pci_nvme_read_bar_0(sc, offset, size); 3091 3092 default: 3093 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3094 } 3095 3096 return (0); 3097 } 3098 3099 static int 3100 
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3101 { 3102 char bident[sizeof("XXX:XXX")]; 3103 const char *value; 3104 uint32_t sectsz; 3105 3106 sc->max_queues = NVME_QUEUES; 3107 sc->max_qentries = NVME_MAX_QENTRIES; 3108 sc->ioslots = NVME_IOSLOTS; 3109 sc->num_squeues = sc->max_queues; 3110 sc->num_cqueues = sc->max_queues; 3111 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3112 sectsz = 0; 3113 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3114 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3115 3116 value = get_config_value_node(nvl, "maxq"); 3117 if (value != NULL) 3118 sc->max_queues = atoi(value); 3119 value = get_config_value_node(nvl, "qsz"); 3120 if (value != NULL) { 3121 sc->max_qentries = atoi(value); 3122 if (sc->max_qentries <= 0) { 3123 EPRINTLN("nvme: Invalid qsz option %d", 3124 sc->max_qentries); 3125 return (-1); 3126 } 3127 } 3128 value = get_config_value_node(nvl, "ioslots"); 3129 if (value != NULL) { 3130 sc->ioslots = atoi(value); 3131 if (sc->ioslots <= 0) { 3132 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3133 return (-1); 3134 } 3135 } 3136 value = get_config_value_node(nvl, "sectsz"); 3137 if (value != NULL) 3138 sectsz = atoi(value); 3139 value = get_config_value_node(nvl, "ser"); 3140 if (value != NULL) { 3141 /* 3142 * This field indicates the Product Serial Number in 3143 * 7-bit ASCII, unused bytes should be space characters. 3144 * Ref: NVMe v1.3c. 3145 */ 3146 cpywithpad((char *)sc->ctrldata.sn, 3147 sizeof(sc->ctrldata.sn), value, ' '); 3148 } 3149 value = get_config_value_node(nvl, "eui64"); 3150 if (value != NULL) 3151 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3152 value = get_config_value_node(nvl, "dsm"); 3153 if (value != NULL) { 3154 if (strcmp(value, "auto") == 0) 3155 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3156 else if (strcmp(value, "enable") == 0) 3157 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3158 else if (strcmp(value, "disable") == 0) 3159 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3160 } 3161 3162 value = get_config_value_node(nvl, "bootindex"); 3163 if (value != NULL) { 3164 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3165 EPRINTLN("Invalid bootindex %d", atoi(value)); 3166 return (-1); 3167 } 3168 } 3169 3170 value = get_config_value_node(nvl, "ram"); 3171 if (value != NULL) { 3172 uint64_t sz = strtoull(value, NULL, 10); 3173 3174 sc->nvstore.type = NVME_STOR_RAM; 3175 sc->nvstore.size = sz * 1024 * 1024; 3176 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3177 sc->nvstore.sectsz = 4096; 3178 sc->nvstore.sectsz_bits = 12; 3179 if (sc->nvstore.ctx == NULL) { 3180 EPRINTLN("nvme: Unable to allocate RAM"); 3181 return (-1); 3182 } 3183 } else { 3184 snprintf(bident, sizeof(bident), "%u:%u", 3185 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3186 sc->nvstore.ctx = blockif_open(nvl, bident); 3187 if (sc->nvstore.ctx == NULL) { 3188 EPRINTLN("nvme: Could not open backing file: %s", 3189 strerror(errno)); 3190 return (-1); 3191 } 3192 sc->nvstore.type = NVME_STOR_BLOCKIF; 3193 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3194 } 3195 3196 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3197 sc->nvstore.sectsz = sectsz; 3198 else if (sc->nvstore.type != NVME_STOR_RAM) 3199 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3200 for (sc->nvstore.sectsz_bits = 9; 3201 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3202 sc->nvstore.sectsz_bits++); 3203 3204 if (sc->max_queues <= 0 || sc->max_queues > 
NVME_QUEUES) 3205 sc->max_queues = NVME_QUEUES; 3206 3207 return (0); 3208 } 3209 3210 static void 3211 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3212 size_t new_size) 3213 { 3214 struct pci_nvme_softc *sc; 3215 struct pci_nvme_blockstore *nvstore; 3216 struct nvme_namespace_data *nd; 3217 3218 sc = arg; 3219 nvstore = &sc->nvstore; 3220 nd = &sc->nsdata; 3221 3222 nvstore->size = new_size; 3223 pci_nvme_init_nsdata_size(nvstore, nd); 3224 3225 /* Add changed NSID to list */ 3226 sc->ns_log.ns[0] = 1; 3227 sc->ns_log.ns[1] = 0; 3228 3229 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3230 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3231 } 3232 3233 static int 3234 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3235 { 3236 struct pci_nvme_softc *sc; 3237 uint32_t pci_membar_sz; 3238 int error; 3239 3240 error = 0; 3241 3242 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3243 pi->pi_arg = sc; 3244 sc->nsc_pi = pi; 3245 3246 error = pci_nvme_parse_config(sc, nvl); 3247 if (error < 0) 3248 goto done; 3249 else 3250 error = 0; 3251 3252 STAILQ_INIT(&sc->ioreqs_free); 3253 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3254 for (uint32_t i = 0; i < sc->ioslots; i++) { 3255 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3256 } 3257 3258 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3259 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3260 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3261 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3262 pci_set_cfgdata8(pi, PCIR_PROGIF, 3263 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3264 3265 /* 3266 * Allocate size of NVMe registers + doorbell space for all queues. 3267 * 3268 * The specification requires a minimum memory I/O window size of 16K. 3269 * The Windows driver will refuse to start a device with a smaller 3270 * window. 3271 */ 3272 pci_membar_sz = sizeof(struct nvme_registers) + 3273 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3274 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3275 3276 DPRINTF("nvme membar size: %u", pci_membar_sz); 3277 3278 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3279 if (error) { 3280 WPRINTF("%s pci alloc mem bar failed", __func__); 3281 goto done; 3282 } 3283 3284 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3285 if (error) { 3286 WPRINTF("%s pci add msixcap failed", __func__); 3287 goto done; 3288 } 3289 3290 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3291 if (error) { 3292 WPRINTF("%s pci add Express capability failed", __func__); 3293 goto done; 3294 } 3295 3296 pthread_mutex_init(&sc->mtx, NULL); 3297 sem_init(&sc->iosemlock, 0, sc->ioslots); 3298 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3299 3300 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3301 /* 3302 * Controller data depends on Namespace data so initialize Namespace 3303 * data first. 
3304 */ 3305 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3306 pci_nvme_init_ctrldata(sc); 3307 pci_nvme_init_logpages(sc); 3308 pci_nvme_init_features(sc); 3309 3310 pci_nvme_aer_init(sc); 3311 pci_nvme_aen_init(sc); 3312 3313 pci_nvme_reset(sc); 3314 done: 3315 return (error); 3316 } 3317 3318 static int 3319 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3320 { 3321 char *cp, *ram; 3322 3323 if (opts == NULL) 3324 return (0); 3325 3326 if (strncmp(opts, "ram=", 4) == 0) { 3327 cp = strchr(opts, ','); 3328 if (cp == NULL) { 3329 set_config_value_node(nvl, "ram", opts + 4); 3330 return (0); 3331 } 3332 ram = strndup(opts + 4, cp - opts - 4); 3333 set_config_value_node(nvl, "ram", ram); 3334 free(ram); 3335 return (pci_parse_legacy_config(nvl, cp + 1)); 3336 } else 3337 return (blockif_legacy_config(nvl, opts)); 3338 } 3339 3340 static const struct pci_devemu pci_de_nvme = { 3341 .pe_emu = "nvme", 3342 .pe_init = pci_nvme_init, 3343 .pe_legacy_config = pci_nvme_legacy_config, 3344 .pe_barwrite = pci_nvme_write, 3345 .pe_barread = pci_nvme_read 3346 }; 3347 PCI_EMUL_SET(pci_de_nvme); 3348
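
/*
 * Example of the legacy option handling above (hypothetical values): an opts
 * string of "ram=1024,ser=NVME0001" is split by pci_nvme_legacy_config() into
 * a "ram" config node with value "1024", and the remainder ("ser=NVME0001")
 * is passed to pci_parse_legacy_config(). Any other opts string, such as a
 * backing device path, is handed unchanged to blockif_legacy_config().
 */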