1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * bhyve PCIe-NVMe device emulation. 32 * 33 * options: 34 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 35 * 36 * accepted devpath: 37 * /dev/blockdev 38 * /path/to/image 39 * ram=size_in_MiB 40 * 41 * maxq = max number of queues 42 * qsz = max elements in each queue 43 * ioslots = max number of concurrent io requests 44 * sectsz = sector size (defaults to blockif sector size) 45 * ser = serial number (20-chars max) 46 * eui64 = IEEE Extended Unique Identifier (8 byte value) 47 * dsm = DataSet Management support. Option is one of auto, enable,disable 48 * 49 */ 50 51 /* TODO: 52 - create async event for smart and log 53 - intr coalesce 54 */ 55 56 57 #include <sys/errno.h> 58 #include <sys/types.h> 59 #ifdef __FreeBSD__ 60 #include <sys/crc16.h> 61 #else 62 #include "crc16.h" 63 #endif 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* Reported temperature in Kelvin (i.e. room temperature) */ 123 #define NVME_TEMPERATURE 296 124 125 /* helpers */ 126 127 /* Convert a zero-based value into a one-based value */ 128 #define ONE_BASED(zero) ((zero) + 1) 129 /* Convert a one-based value into a zero-based value */ 130 #define ZERO_BASED(one) ((one) - 1) 131 132 /* Encode number of SQ's and CQ's for Set/Get Features */ 133 #define NVME_FEATURE_NUM_QUEUES(sc) \ 134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16 136 137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 138 139 enum nvme_controller_register_offsets { 140 NVME_CR_CAP_LOW = 0x00, 141 NVME_CR_CAP_HI = 0x04, 142 NVME_CR_VS = 0x08, 143 NVME_CR_INTMS = 0x0c, 144 NVME_CR_INTMC = 0x10, 145 NVME_CR_CC = 0x14, 146 NVME_CR_CSTS = 0x1c, 147 NVME_CR_NSSR = 0x20, 148 NVME_CR_AQA = 0x24, 149 NVME_CR_ASQ_LOW = 0x28, 150 NVME_CR_ASQ_HI = 0x2c, 151 NVME_CR_ACQ_LOW = 0x30, 152 NVME_CR_ACQ_HI = 0x34, 153 }; 154 155 enum nvme_cmd_cdw11 { 156 NVME_CMD_CDW11_PC = 0x0001, 157 NVME_CMD_CDW11_IEN = 0x0002, 158 NVME_CMD_CDW11_IV = 0xFFFF0000, 159 }; 160 161 enum nvme_copy_dir { 162 NVME_COPY_TO_PRP, 163 NVME_COPY_FROM_PRP, 164 }; 165 166 #define NVME_CQ_INTEN 0x01 167 #define NVME_CQ_INTCOAL 0x02 168 169 struct nvme_completion_queue { 170 struct nvme_completion *qbase; 171 pthread_mutex_t mtx; 172 uint32_t size; 173 uint16_t tail; /* nvme progress */ 174 uint16_t head; /* guest progress */ 175 uint16_t intr_vec; 176 uint32_t intr_en; 177 }; 178 179 struct nvme_submission_queue { 180 struct nvme_command *qbase; 181 pthread_mutex_t mtx; 182 uint32_t size; 183 uint16_t head; /* nvme progress */ 184 uint16_t tail; /* guest progress */ 185 uint16_t cqid; /* completion queue id */ 186 int qpriority; 187 }; 188 189 enum nvme_storage_type { 190 NVME_STOR_BLOCKIF = 0, 191 NVME_STOR_RAM = 1, 192 }; 193 194 struct pci_nvme_blockstore { 195 enum nvme_storage_type type; 196 void *ctx; 197 uint64_t size; 198 uint32_t sectsz; 199 uint32_t sectsz_bits; 200 uint64_t eui64; 201 uint32_t deallocate:1; 202 }; 203 204 /* 205 * Calculate the number of additional page descriptors for guest IO requests 206 * based on the advertised Max Data Transfer (MDTS) and given the number of 207 * default iovec's in a struct blockif_req. 208 */ 209 #define MDTS_PAD_SIZE \ 210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 212 0 ) 213 214 struct pci_nvme_ioreq { 215 struct pci_nvme_softc *sc; 216 STAILQ_ENTRY(pci_nvme_ioreq) link; 217 struct nvme_submission_queue *nvme_sq; 218 uint16_t sqid; 219 220 /* command information */ 221 uint16_t opc; 222 uint16_t cid; 223 uint32_t nsid; 224 225 uint64_t prev_gpaddr; 226 size_t prev_size; 227 size_t bytes; 228 229 struct blockif_req io_req; 230 231 struct iovec iovpadding[MDTS_PAD_SIZE]; 232 }; 233 234 enum nvme_dsm_type { 235 /* Dataset Management bit in ONCS reflects backing storage capability */ 236 NVME_DATASET_MANAGEMENT_AUTO, 237 /* Unconditionally set Dataset Management bit in ONCS */ 238 NVME_DATASET_MANAGEMENT_ENABLE, 239 /* Unconditionally clear Dataset Management bit in ONCS */ 240 NVME_DATASET_MANAGEMENT_DISABLE, 241 }; 242 243 struct pci_nvme_softc; 244 struct nvme_feature_obj; 245 246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 247 struct nvme_feature_obj *, 248 struct nvme_command *, 249 struct nvme_completion *); 250 251 struct nvme_feature_obj { 252 uint32_t cdw11; 253 nvme_feature_cb set; 254 nvme_feature_cb get; 255 bool namespace_specific; 256 }; 257 258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 259 260 typedef enum { 261 PCI_NVME_AE_TYPE_ERROR = 0, 262 PCI_NVME_AE_TYPE_SMART, 263 PCI_NVME_AE_TYPE_NOTICE, 264 PCI_NVME_AE_TYPE_IO_CMD = 6, 265 PCI_NVME_AE_TYPE_VENDOR = 7, 266 PCI_NVME_AE_TYPE_MAX /* Must be last */ 267 } pci_nvme_async_type; 268 269 /* Asynchronous Event Requests */ 270 struct pci_nvme_aer { 271 STAILQ_ENTRY(pci_nvme_aer) link; 272 uint16_t cid; /* Command ID of the submitted AER */ 273 }; 274 275 /** Asynchronous Event Information - Notice */ 276 typedef enum { 277 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 278 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 279 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 280 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 281 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 282 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 283 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 284 PCI_NVME_AEI_NOTICE_MAX, 285 } pci_nvme_async_event_info_notice; 286 287 #define PCI_NVME_AEI_NOTICE_SHIFT 8 288 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 289 290 /* Asynchronous Event Notifications */ 291 struct pci_nvme_aen { 292 pci_nvme_async_type atype; 293 uint32_t event_data; 294 bool posted; 295 }; 296 297 /* 298 * By default, enable all Asynchrnous Event Notifications: 299 * SMART / Health Critical Warnings 300 * Namespace Attribute Notices 301 */ 302 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 303 304 typedef enum { 305 NVME_CNTRLTYPE_IO = 1, 306 NVME_CNTRLTYPE_DISCOVERY = 2, 307 NVME_CNTRLTYPE_ADMIN = 3, 308 } pci_nvme_cntrl_type; 309 310 struct pci_nvme_softc { 311 struct pci_devinst *nsc_pi; 312 313 pthread_mutex_t mtx; 314 315 struct nvme_registers regs; 316 317 struct nvme_namespace_data nsdata; 318 struct nvme_controller_data ctrldata; 319 struct nvme_error_information_entry err_log; 320 struct nvme_health_information_page health_log; 321 struct nvme_firmware_page fw_log; 322 struct nvme_ns_list ns_log; 323 324 struct pci_nvme_blockstore nvstore; 325 326 uint16_t max_qentries; /* max entries per queue */ 327 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 328 uint32_t num_cqueues; 329 uint32_t num_squeues; 330 bool num_q_is_set; /* Has host set Number of Queues */ 331 332 struct pci_nvme_ioreq *ioreqs; 333 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 334 uint32_t pending_ios; 335 uint32_t ioslots; 336 sem_t 
iosemlock; 337 338 /* 339 * Memory mapped Submission and Completion queues 340 * Each array includes both Admin and IO queues 341 */ 342 struct nvme_completion_queue *compl_queues; 343 struct nvme_submission_queue *submit_queues; 344 345 struct nvme_feature_obj feat[NVME_FID_MAX]; 346 347 enum nvme_dsm_type dataset_management; 348 349 /* Accounting for SMART data */ 350 __uint128_t read_data_units; 351 __uint128_t write_data_units; 352 __uint128_t read_commands; 353 __uint128_t write_commands; 354 uint32_t read_dunits_remainder; 355 uint32_t write_dunits_remainder; 356 357 STAILQ_HEAD(, pci_nvme_aer) aer_list; 358 pthread_mutex_t aer_mtx; 359 uint32_t aer_count; 360 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 361 pthread_t aen_tid; 362 pthread_mutex_t aen_mtx; 363 pthread_cond_t aen_cond; 364 }; 365 366 367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 368 struct nvme_completion_queue *cq, 369 uint32_t cdw0, 370 uint16_t cid, 371 uint16_t sqid, 372 uint16_t status); 373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 375 static void pci_nvme_io_done(struct blockif_req *, int); 376 377 /* Controller Configuration utils */ 378 #define NVME_CC_GET_EN(cc) \ 379 NVMEV(NVME_CC_REG_EN, cc) 380 #define NVME_CC_GET_CSS(cc) \ 381 NVMEV(NVME_CC_REG_CSS, cc) 382 #define NVME_CC_GET_SHN(cc) \ 383 NVMEV(NVME_CC_REG_SHN, cc) 384 #define NVME_CC_GET_IOSQES(cc) \ 385 NVMEV(NVME_CC_REG_IOSQES, cc) 386 #define NVME_CC_GET_IOCQES(cc) \ 387 NVMEV(NVME_CC_REG_IOCQES, cc) 388 389 #define NVME_CC_WRITE_MASK \ 390 (NVMEM(NVME_CC_REG_EN) | \ 391 NVMEM(NVME_CC_REG_IOSQES) | \ 392 NVMEM(NVME_CC_REG_IOCQES)) 393 394 #define NVME_CC_NEN_WRITE_MASK \ 395 (NVMEM(NVME_CC_REG_CSS) | \ 396 NVMEM(NVME_CC_REG_MPS) | \ 397 NVMEM(NVME_CC_REG_AMS)) 398 399 /* Controller Status utils */ 400 #define NVME_CSTS_GET_RDY(sts) \ 401 NVMEV(NVME_CSTS_REG_RDY, sts) 402 403 #define NVME_CSTS_RDY (NVMEF(NVME_CSTS_REG_RDY, 1)) 404 #define NVME_CSTS_CFS (NVMEF(NVME_CSTS_REG_CFS, 1)) 405 406 /* Completion Queue status word utils */ 407 #define NVME_STATUS_P (NVMEF(NVME_STATUS_P, 1)) 408 #define NVME_STATUS_MASK \ 409 (NVMEM(NVME_STATUS_SCT) | \ 410 NVMEM(NVME_STATUS_SC)) 411 412 #define NVME_ONCS_DSM NVMEM(NVME_CTRLR_DATA_ONCS_DSM) 413 414 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 415 struct nvme_feature_obj *, 416 struct nvme_command *, 417 struct nvme_completion *); 418 static void nvme_feature_temperature(struct pci_nvme_softc *, 419 struct nvme_feature_obj *, 420 struct nvme_command *, 421 struct nvme_completion *); 422 static void nvme_feature_num_queues(struct pci_nvme_softc *, 423 struct nvme_feature_obj *, 424 struct nvme_command *, 425 struct nvme_completion *); 426 static void nvme_feature_iv_config(struct pci_nvme_softc *, 427 struct nvme_feature_obj *, 428 struct nvme_command *, 429 struct nvme_completion *); 430 static void nvme_feature_async_event(struct pci_nvme_softc *, 431 struct nvme_feature_obj *, 432 struct nvme_command *, 433 struct nvme_completion *); 434 435 static void *aen_thr(void *arg); 436 437 static __inline void 438 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 439 { 440 size_t len; 441 442 len = strnlen(src, dst_size); 443 memset(dst, pad, dst_size); 444 memcpy(dst, src, len); 445 } 446 447 static __inline void 448 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 449 { 450 451 *status &= ~NVME_STATUS_MASK; 452 *status |= 
NVMEF(NVME_STATUS_SCT, type) | NVMEF(NVME_STATUS_SC, code); 453 } 454 455 static __inline void 456 pci_nvme_status_genc(uint16_t *status, uint16_t code) 457 { 458 459 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 460 } 461 462 /* 463 * Initialize the requested number or IO Submission and Completion Queues. 464 * Admin queues are allocated implicitly. 465 */ 466 static void 467 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 468 { 469 uint32_t i; 470 471 /* 472 * Allocate and initialize the Submission Queues 473 */ 474 if (nsq > NVME_QUEUES) { 475 WPRINTF("%s: clamping number of SQ from %u to %u", 476 __func__, nsq, NVME_QUEUES); 477 nsq = NVME_QUEUES; 478 } 479 480 sc->num_squeues = nsq; 481 482 sc->submit_queues = calloc(sc->num_squeues + 1, 483 sizeof(struct nvme_submission_queue)); 484 if (sc->submit_queues == NULL) { 485 WPRINTF("%s: SQ allocation failed", __func__); 486 sc->num_squeues = 0; 487 } else { 488 struct nvme_submission_queue *sq = sc->submit_queues; 489 490 for (i = 0; i < sc->num_squeues + 1; i++) 491 pthread_mutex_init(&sq[i].mtx, NULL); 492 } 493 494 /* 495 * Allocate and initialize the Completion Queues 496 */ 497 if (ncq > NVME_QUEUES) { 498 WPRINTF("%s: clamping number of CQ from %u to %u", 499 __func__, ncq, NVME_QUEUES); 500 ncq = NVME_QUEUES; 501 } 502 503 sc->num_cqueues = ncq; 504 505 sc->compl_queues = calloc(sc->num_cqueues + 1, 506 sizeof(struct nvme_completion_queue)); 507 if (sc->compl_queues == NULL) { 508 WPRINTF("%s: CQ allocation failed", __func__); 509 sc->num_cqueues = 0; 510 } else { 511 struct nvme_completion_queue *cq = sc->compl_queues; 512 513 for (i = 0; i < sc->num_cqueues + 1; i++) 514 pthread_mutex_init(&cq[i].mtx, NULL); 515 } 516 } 517 518 static void 519 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 520 { 521 struct nvme_controller_data *cd = &sc->ctrldata; 522 int ret; 523 524 cd->vid = 0xFB5D; 525 cd->ssvid = 0x0000; 526 527 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 528 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 529 530 /* Num of submission commands that we can handle at a time (2^rab) */ 531 cd->rab = 4; 532 533 /* FreeBSD OUI */ 534 cd->ieee[0] = 0xfc; 535 cd->ieee[1] = 0x9c; 536 cd->ieee[2] = 0x58; 537 538 cd->mic = 0; 539 540 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 541 542 cd->ver = NVME_REV(1,4); 543 544 cd->cntrltype = NVME_CNTRLTYPE_IO; 545 cd->oacs = NVMEF(NVME_CTRLR_DATA_OACS_FORMAT, 1); 546 cd->oaes = NVMEM(NVME_CTRLR_DATA_OAES_NS_ATTR); 547 cd->acl = 2; 548 cd->aerl = 4; 549 550 /* Advertise 1, Read-only firmware slot */ 551 cd->frmw = NVMEM(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 552 NVMEF(NVME_CTRLR_DATA_FRMW_NUM_SLOTS, 1); 553 cd->lpa = 0; /* TODO: support some simple things like SMART */ 554 cd->elpe = 0; /* max error log page entries */ 555 /* 556 * Report a single power state (zero-based value) 557 * power_state[] values are left as zero to indicate "Not reported" 558 */ 559 cd->npss = 0; 560 561 /* Warning Composite Temperature Threshold */ 562 cd->wctemp = 0x0157; 563 cd->cctemp = 0x0157; 564 565 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 566 cd->sanicap = NVMEF(NVME_CTRLR_DATA_SANICAP_NODMMAS, 567 NVME_CTRLR_DATA_SANICAP_NODMMAS_NO); 568 569 cd->sqes = NVMEF(NVME_CTRLR_DATA_SQES_MAX, 6) | 570 NVMEF(NVME_CTRLR_DATA_SQES_MIN, 6); 571 cd->cqes = NVMEF(NVME_CTRLR_DATA_CQES_MAX, 4) | 572 NVMEF(NVME_CTRLR_DATA_CQES_MIN, 4); 573 cd->nn = 1; /* number of namespaces */ 574 575 cd->oncs = 0; 576 switch 
(sc->dataset_management) { 577 case NVME_DATASET_MANAGEMENT_AUTO: 578 if (sc->nvstore.deallocate) 579 cd->oncs |= NVME_ONCS_DSM; 580 break; 581 case NVME_DATASET_MANAGEMENT_ENABLE: 582 cd->oncs |= NVME_ONCS_DSM; 583 break; 584 default: 585 break; 586 } 587 588 cd->fna = NVMEM(NVME_CTRLR_DATA_FNA_FORMAT_ALL); 589 590 cd->vwc = NVMEF(NVME_CTRLR_DATA_VWC_ALL, NVME_CTRLR_DATA_VWC_ALL_NO); 591 592 #ifdef __FreeBSD__ 593 ret = snprintf(cd->subnqn, sizeof(cd->subnqn), 594 "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u", 595 get_config_value("name"), sc->nsc_pi->pi_bus, 596 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 597 #else 598 ret = snprintf((char *)cd->subnqn, sizeof (cd->subnqn), 599 "nqn.2013-12.org.illumos:bhyve-%s-%u-%u-%u", 600 get_config_value("name"), sc->nsc_pi->pi_bus, 601 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 602 #endif 603 if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn))) 604 EPRINTLN("%s: error setting subnqn (%d)", __func__, ret); 605 } 606 607 static void 608 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 609 struct nvme_namespace_data *nd) 610 { 611 612 /* Get capacity and block size information from backing store */ 613 nd->nsze = nvstore->size / nvstore->sectsz; 614 nd->ncap = nd->nsze; 615 nd->nuse = nd->nsze; 616 } 617 618 static void 619 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 620 struct nvme_namespace_data *nd, uint32_t nsid, 621 struct pci_nvme_blockstore *nvstore) 622 { 623 624 pci_nvme_init_nsdata_size(nvstore, nd); 625 626 if (nvstore->type == NVME_STOR_BLOCKIF) 627 nvstore->deallocate = blockif_candelete(nvstore->ctx); 628 629 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 630 nd->flbas = 0; 631 632 /* Create an EUI-64 if user did not provide one */ 633 if (nvstore->eui64 == 0) { 634 char *data = NULL; 635 uint64_t eui64 = nvstore->eui64; 636 637 asprintf(&data, "%s%u%u%u", get_config_value("name"), 638 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 639 sc->nsc_pi->pi_func); 640 641 if (data != NULL) { 642 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 643 free(data); 644 } 645 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 646 } 647 be64enc(nd->eui64, nvstore->eui64); 648 649 /* LBA data-sz = 2^lbads */ 650 nd->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS, nvstore->sectsz_bits); 651 } 652 653 static void 654 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 655 { 656 __uint128_t power_cycles = 1; 657 658 memset(&sc->err_log, 0, sizeof(sc->err_log)); 659 memset(&sc->health_log, 0, sizeof(sc->health_log)); 660 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 661 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 662 663 /* Set read/write remainder to round up according to spec */ 664 sc->read_dunits_remainder = 999; 665 sc->write_dunits_remainder = 999; 666 667 /* Set nominal Health values checked by implementations */ 668 sc->health_log.temperature = NVME_TEMPERATURE; 669 sc->health_log.available_spare = 100; 670 sc->health_log.available_spare_threshold = 10; 671 672 /* Set Active Firmware Info to slot 1 */ 673 sc->fw_log.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); 674 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 675 sizeof(sc->fw_log.revision[0])); 676 677 memcpy(&sc->health_log.power_cycles, &power_cycles, 678 sizeof(sc->health_log.power_cycles)); 679 } 680 681 static void 682 pci_nvme_init_features(struct pci_nvme_softc *sc) 683 { 684 enum nvme_feature fid; 685 686 for (fid = 0; fid < NVME_FID_MAX; fid++) { 687 switch (fid) { 688 case NVME_FEAT_ARBITRATION: 689 case NVME_FEAT_POWER_MANAGEMENT: 690 case 
NVME_FEAT_INTERRUPT_COALESCING: //XXX 691 case NVME_FEAT_WRITE_ATOMICITY: 692 /* Mandatory but no special handling required */ 693 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 694 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 695 // this returns a data buffer 696 break; 697 case NVME_FEAT_TEMPERATURE_THRESHOLD: 698 sc->feat[fid].set = nvme_feature_temperature; 699 break; 700 case NVME_FEAT_ERROR_RECOVERY: 701 sc->feat[fid].namespace_specific = true; 702 break; 703 case NVME_FEAT_NUMBER_OF_QUEUES: 704 sc->feat[fid].set = nvme_feature_num_queues; 705 break; 706 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 707 sc->feat[fid].set = nvme_feature_iv_config; 708 break; 709 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 710 sc->feat[fid].set = nvme_feature_async_event; 711 /* Enable all AENs by default */ 712 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 713 break; 714 default: 715 sc->feat[fid].set = nvme_feature_invalid_cb; 716 sc->feat[fid].get = nvme_feature_invalid_cb; 717 } 718 } 719 } 720 721 static void 722 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 723 { 724 725 STAILQ_INIT(&sc->aer_list); 726 sc->aer_count = 0; 727 } 728 729 static void 730 pci_nvme_aer_init(struct pci_nvme_softc *sc) 731 { 732 733 pthread_mutex_init(&sc->aer_mtx, NULL); 734 pci_nvme_aer_reset(sc); 735 } 736 737 static void 738 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 739 { 740 struct pci_nvme_aer *aer = NULL; 741 742 pthread_mutex_lock(&sc->aer_mtx); 743 while (!STAILQ_EMPTY(&sc->aer_list)) { 744 aer = STAILQ_FIRST(&sc->aer_list); 745 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 746 free(aer); 747 } 748 pthread_mutex_unlock(&sc->aer_mtx); 749 750 pci_nvme_aer_reset(sc); 751 } 752 753 static bool 754 pci_nvme_aer_available(struct pci_nvme_softc *sc) 755 { 756 757 return (sc->aer_count != 0); 758 } 759 760 static bool 761 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 762 { 763 struct nvme_controller_data *cd = &sc->ctrldata; 764 765 /* AERL is a zero based value while aer_count is one's based */ 766 return (sc->aer_count == (cd->aerl + 1U)); 767 } 768 769 /* 770 * Add an Async Event Request 771 * 772 * Stores an AER to be returned later if the Controller needs to notify the 773 * host of an event. 774 * Note that while the NVMe spec doesn't require Controllers to return AER's 775 * in order, this implementation does preserve the order. 776 */ 777 static int 778 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 779 { 780 struct pci_nvme_aer *aer = NULL; 781 782 aer = calloc(1, sizeof(struct pci_nvme_aer)); 783 if (aer == NULL) 784 return (-1); 785 786 /* Save the Command ID for use in the completion message */ 787 aer->cid = cid; 788 789 pthread_mutex_lock(&sc->aer_mtx); 790 sc->aer_count++; 791 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 792 pthread_mutex_unlock(&sc->aer_mtx); 793 794 return (0); 795 } 796 797 /* 798 * Get an Async Event Request structure 799 * 800 * Returns a pointer to an AER previously submitted by the host or NULL if 801 * no AER's exist. Caller is responsible for freeing the returned struct. 
802 */ 803 static struct pci_nvme_aer * 804 pci_nvme_aer_get(struct pci_nvme_softc *sc) 805 { 806 struct pci_nvme_aer *aer = NULL; 807 808 pthread_mutex_lock(&sc->aer_mtx); 809 aer = STAILQ_FIRST(&sc->aer_list); 810 if (aer != NULL) { 811 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 812 sc->aer_count--; 813 } 814 pthread_mutex_unlock(&sc->aer_mtx); 815 816 return (aer); 817 } 818 819 static void 820 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 821 { 822 uint32_t atype; 823 824 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 825 826 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 827 sc->aen[atype].atype = atype; 828 } 829 } 830 831 static void 832 pci_nvme_aen_init(struct pci_nvme_softc *sc) 833 { 834 char nstr[80]; 835 836 pci_nvme_aen_reset(sc); 837 838 pthread_mutex_init(&sc->aen_mtx, NULL); 839 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 840 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 841 sc->nsc_pi->pi_func); 842 pthread_set_name_np(sc->aen_tid, nstr); 843 } 844 845 static void 846 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 847 { 848 849 pci_nvme_aen_reset(sc); 850 } 851 852 /* Notify the AEN thread of pending work */ 853 static void 854 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 855 { 856 857 pthread_cond_signal(&sc->aen_cond); 858 } 859 860 /* 861 * Post an Asynchronous Event Notification 862 */ 863 static int32_t 864 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 865 uint32_t event_data) 866 { 867 struct pci_nvme_aen *aen; 868 869 if (atype >= PCI_NVME_AE_TYPE_MAX) { 870 return(EINVAL); 871 } 872 873 pthread_mutex_lock(&sc->aen_mtx); 874 aen = &sc->aen[atype]; 875 876 /* Has the controller already posted an event of this type? */ 877 if (aen->posted) { 878 pthread_mutex_unlock(&sc->aen_mtx); 879 return(EALREADY); 880 } 881 882 aen->event_data = event_data; 883 aen->posted = true; 884 pthread_mutex_unlock(&sc->aen_mtx); 885 886 pci_nvme_aen_notify(sc); 887 888 return(0); 889 } 890 891 static void 892 pci_nvme_aen_process(struct pci_nvme_softc *sc) 893 { 894 struct pci_nvme_aer *aer; 895 struct pci_nvme_aen *aen; 896 pci_nvme_async_type atype; 897 uint32_t mask; 898 uint16_t status; 899 uint8_t lid; 900 901 #ifndef __FreeBSD__ 902 lid = 0; 903 #endif 904 905 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 906 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 907 aen = &sc->aen[atype]; 908 /* Previous iterations may have depleted the available AER's */ 909 if (!pci_nvme_aer_available(sc)) { 910 DPRINTF("%s: no AER", __func__); 911 break; 912 } 913 914 if (!aen->posted) { 915 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 916 continue; 917 } 918 919 status = NVME_SC_SUCCESS; 920 921 /* Is the event masked? 
*/ 922 mask = 923 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 924 925 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 926 switch (atype) { 927 case PCI_NVME_AE_TYPE_ERROR: 928 lid = NVME_LOG_ERROR; 929 break; 930 case PCI_NVME_AE_TYPE_SMART: 931 mask &= 0xff; 932 if ((mask & aen->event_data) == 0) 933 continue; 934 lid = NVME_LOG_HEALTH_INFORMATION; 935 break; 936 case PCI_NVME_AE_TYPE_NOTICE: 937 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 938 EPRINTLN("%s unknown AEN notice type %u", 939 __func__, aen->event_data); 940 status = NVME_SC_INTERNAL_DEVICE_ERROR; 941 lid = 0; 942 break; 943 } 944 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 945 continue; 946 switch (aen->event_data) { 947 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 948 lid = NVME_LOG_CHANGED_NAMESPACE; 949 break; 950 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 951 lid = NVME_LOG_FIRMWARE_SLOT; 952 break; 953 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 954 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 955 break; 956 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 957 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 958 break; 959 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 960 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 961 break; 962 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 963 lid = NVME_LOG_LBA_STATUS_INFORMATION; 964 break; 965 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 966 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 967 break; 968 default: 969 lid = 0; 970 } 971 break; 972 default: 973 /* bad type?!? */ 974 EPRINTLN("%s unknown AEN type %u", __func__, atype); 975 status = NVME_SC_INTERNAL_DEVICE_ERROR; 976 lid = 0; 977 break; 978 } 979 980 aer = pci_nvme_aer_get(sc); 981 assert(aer != NULL); 982 983 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 984 pci_nvme_cq_update(sc, &sc->compl_queues[0], 985 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 986 aer->cid, 987 0, /* SQID */ 988 status); 989 990 aen->event_data = 0; 991 aen->posted = false; 992 993 pci_generate_msix(sc->nsc_pi, 0); 994 } 995 } 996 997 static void * 998 aen_thr(void *arg) 999 { 1000 struct pci_nvme_softc *sc; 1001 1002 sc = arg; 1003 1004 pthread_mutex_lock(&sc->aen_mtx); 1005 for (;;) { 1006 pci_nvme_aen_process(sc); 1007 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1008 } 1009 #ifdef __FreeBSD__ /* Smatch spots unreachable code */ 1010 pthread_mutex_unlock(&sc->aen_mtx); 1011 1012 pthread_exit(NULL); 1013 #endif 1014 return (NULL); 1015 } 1016 1017 static void 1018 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1019 { 1020 uint32_t i; 1021 1022 DPRINTF("%s", __func__); 1023 1024 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1025 NVMEF(NVME_CAP_LO_REG_CQR, 1) | 1026 NVMEF(NVME_CAP_LO_REG_TO, 60); 1027 1028 sc->regs.cap_hi = NVMEF(NVME_CAP_HI_REG_CSS_NVM, 1); 1029 1030 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1031 1032 sc->regs.cc = 0; 1033 1034 assert(sc->submit_queues != NULL); 1035 1036 for (i = 0; i < sc->num_squeues + 1; i++) { 1037 sc->submit_queues[i].qbase = NULL; 1038 sc->submit_queues[i].size = 0; 1039 sc->submit_queues[i].cqid = 0; 1040 sc->submit_queues[i].tail = 0; 1041 sc->submit_queues[i].head = 0; 1042 } 1043 1044 assert(sc->compl_queues != NULL); 1045 1046 for (i = 0; i < sc->num_cqueues + 1; i++) { 1047 sc->compl_queues[i].qbase = NULL; 1048 sc->compl_queues[i].size = 0; 1049 sc->compl_queues[i].tail = 0; 1050 sc->compl_queues[i].head = 0; 1051 } 1052 1053 
sc->num_q_is_set = false; 1054 1055 pci_nvme_aer_destroy(sc); 1056 pci_nvme_aen_destroy(sc); 1057 1058 /* 1059 * Clear CSTS.RDY last to prevent the host from enabling Controller 1060 * before cleanup completes 1061 */ 1062 sc->regs.csts = 0; 1063 } 1064 1065 static void 1066 pci_nvme_reset(struct pci_nvme_softc *sc) 1067 { 1068 pthread_mutex_lock(&sc->mtx); 1069 pci_nvme_reset_locked(sc); 1070 pthread_mutex_unlock(&sc->mtx); 1071 } 1072 1073 static int 1074 pci_nvme_init_controller(struct pci_nvme_softc *sc) 1075 { 1076 uint16_t acqs, asqs; 1077 1078 DPRINTF("%s", __func__); 1079 1080 /* 1081 * NVMe 2.0 states that "enabling a controller while this field is 1082 * cleared to 0h produces undefined results" for both ACQS and 1083 * ASQS. If zero, set CFS and do not become ready. 1084 */ 1085 asqs = ONE_BASED(NVMEV(NVME_AQA_REG_ASQS, sc->regs.aqa)); 1086 if (asqs < 2) { 1087 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1088 asqs - 1, sc->regs.aqa); 1089 sc->regs.csts |= NVME_CSTS_CFS; 1090 return (-1); 1091 } 1092 sc->submit_queues[0].size = asqs; 1093 sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1094 sc->regs.asq, sizeof(struct nvme_command) * asqs); 1095 if (sc->submit_queues[0].qbase == NULL) { 1096 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1097 sc->regs.asq); 1098 sc->regs.csts |= NVME_CSTS_CFS; 1099 return (-1); 1100 } 1101 1102 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1103 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1104 1105 acqs = ONE_BASED(NVMEV(NVME_AQA_REG_ACQS, sc->regs.aqa)); 1106 if (acqs < 2) { 1107 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1108 acqs - 1, sc->regs.aqa); 1109 sc->regs.csts |= NVME_CSTS_CFS; 1110 return (-1); 1111 } 1112 sc->compl_queues[0].size = acqs; 1113 sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1114 sc->regs.acq, sizeof(struct nvme_completion) * acqs); 1115 if (sc->compl_queues[0].qbase == NULL) { 1116 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1117 sc->regs.acq); 1118 sc->regs.csts |= NVME_CSTS_CFS; 1119 return (-1); 1120 } 1121 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1122 1123 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1124 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1125 1126 return (0); 1127 } 1128 1129 static int 1130 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1131 size_t len, enum nvme_copy_dir dir) 1132 { 1133 uint8_t *p; 1134 size_t bytes; 1135 1136 if (len > (8 * 1024)) { 1137 return (-1); 1138 } 1139 1140 /* Copy from the start of prp1 to the end of the physical page */ 1141 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1142 bytes = MIN(bytes, len); 1143 1144 p = vm_map_gpa(ctx, prp1, bytes); 1145 if (p == NULL) { 1146 return (-1); 1147 } 1148 1149 if (dir == NVME_COPY_TO_PRP) 1150 memcpy(p, b, bytes); 1151 else 1152 memcpy(b, p, bytes); 1153 1154 b += bytes; 1155 1156 len -= bytes; 1157 if (len == 0) { 1158 return (0); 1159 } 1160 1161 len = MIN(len, PAGE_SIZE); 1162 1163 p = vm_map_gpa(ctx, prp2, len); 1164 if (p == NULL) { 1165 return (-1); 1166 } 1167 1168 if (dir == NVME_COPY_TO_PRP) 1169 memcpy(p, b, len); 1170 else 1171 memcpy(b, p, len); 1172 1173 return (0); 1174 } 1175 1176 /* 1177 * Write a Completion Queue Entry update 1178 * 1179 * Write the completion and update the doorbell value 1180 */ 1181 static void 1182 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1183 struct nvme_completion_queue *cq, 1184 uint32_t cdw0, 1185 uint16_t cid, 1186 uint16_t sqid, 1187 uint16_t status) 1188 
{ 1189 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1190 struct nvme_completion *cqe; 1191 1192 assert(cq->qbase != NULL); 1193 1194 pthread_mutex_lock(&cq->mtx); 1195 1196 cqe = &cq->qbase[cq->tail]; 1197 1198 /* Flip the phase bit */ 1199 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1200 1201 cqe->cdw0 = cdw0; 1202 cqe->sqhd = sq->head; 1203 cqe->sqid = sqid; 1204 cqe->cid = cid; 1205 cqe->status = status; 1206 1207 cq->tail++; 1208 if (cq->tail >= cq->size) { 1209 cq->tail = 0; 1210 } 1211 1212 pthread_mutex_unlock(&cq->mtx); 1213 } 1214 1215 static int 1216 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1217 struct nvme_completion* compl) 1218 { 1219 uint16_t qid = command->cdw10 & 0xffff; 1220 1221 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1222 if (qid == 0 || qid > sc->num_squeues || 1223 (sc->submit_queues[qid].qbase == NULL)) { 1224 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1225 __func__, qid, sc->num_squeues); 1226 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1227 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1228 return (1); 1229 } 1230 1231 sc->submit_queues[qid].qbase = NULL; 1232 sc->submit_queues[qid].cqid = 0; 1233 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1234 return (1); 1235 } 1236 1237 static int 1238 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1239 struct nvme_completion* compl) 1240 { 1241 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1242 uint16_t qid = command->cdw10 & 0xffff; 1243 struct nvme_submission_queue *nsq; 1244 1245 if ((qid == 0) || (qid > sc->num_squeues) || 1246 (sc->submit_queues[qid].qbase != NULL)) { 1247 WPRINTF("%s queue index %u > num_squeues %u", 1248 __func__, qid, sc->num_squeues); 1249 pci_nvme_status_tc(&compl->status, 1250 NVME_SCT_COMMAND_SPECIFIC, 1251 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1252 return (1); 1253 } 1254 1255 nsq = &sc->submit_queues[qid]; 1256 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1257 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1258 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1259 /* 1260 * Queues must specify at least two entries 1261 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1262 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1263 */ 1264 pci_nvme_status_tc(&compl->status, 1265 NVME_SCT_COMMAND_SPECIFIC, 1266 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1267 return (1); 1268 } 1269 nsq->head = nsq->tail = 0; 1270 1271 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1272 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1273 pci_nvme_status_tc(&compl->status, 1274 NVME_SCT_COMMAND_SPECIFIC, 1275 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1276 return (1); 1277 } 1278 1279 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1280 pci_nvme_status_tc(&compl->status, 1281 NVME_SCT_COMMAND_SPECIFIC, 1282 NVME_SC_COMPLETION_QUEUE_INVALID); 1283 return (1); 1284 } 1285 1286 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1287 1288 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1289 sizeof(struct nvme_command) * (size_t)nsq->size); 1290 1291 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1292 qid, nsq->size, nsq->qbase, nsq->cqid); 1293 1294 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1295 1296 DPRINTF("%s completed creating IOSQ qid %u", 1297 __func__, qid); 1298 } else { 1299 /* 1300 * Guest sent non-cont submission queue request. 1301 * This setting is unsupported by this emulation. 
1302 */ 1303 WPRINTF("%s unsupported non-contig (list-based) " 1304 "create i/o submission queue", __func__); 1305 1306 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1307 } 1308 return (1); 1309 } 1310 1311 static int 1312 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1313 struct nvme_completion* compl) 1314 { 1315 uint16_t qid = command->cdw10 & 0xffff; 1316 uint16_t sqid; 1317 1318 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1319 if (qid == 0 || qid > sc->num_cqueues || 1320 (sc->compl_queues[qid].qbase == NULL)) { 1321 WPRINTF("%s queue index %u / num_cqueues %u", 1322 __func__, qid, sc->num_cqueues); 1323 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1324 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1325 return (1); 1326 } 1327 1328 /* Deleting an Active CQ is an error */ 1329 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1330 if (sc->submit_queues[sqid].cqid == qid) { 1331 pci_nvme_status_tc(&compl->status, 1332 NVME_SCT_COMMAND_SPECIFIC, 1333 NVME_SC_INVALID_QUEUE_DELETION); 1334 return (1); 1335 } 1336 1337 sc->compl_queues[qid].qbase = NULL; 1338 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1339 return (1); 1340 } 1341 1342 static int 1343 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1344 struct nvme_completion* compl) 1345 { 1346 struct nvme_completion_queue *ncq; 1347 uint16_t qid = command->cdw10 & 0xffff; 1348 1349 /* Only support Physically Contiguous queues */ 1350 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1351 WPRINTF("%s unsupported non-contig (list-based) " 1352 "create i/o completion queue", 1353 __func__); 1354 1355 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1356 return (1); 1357 } 1358 1359 if ((qid == 0) || (qid > sc->num_cqueues) || 1360 (sc->compl_queues[qid].qbase != NULL)) { 1361 WPRINTF("%s queue index %u > num_cqueues %u", 1362 __func__, qid, sc->num_cqueues); 1363 pci_nvme_status_tc(&compl->status, 1364 NVME_SCT_COMMAND_SPECIFIC, 1365 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1366 return (1); 1367 } 1368 1369 ncq = &sc->compl_queues[qid]; 1370 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1371 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1372 if (ncq->intr_vec > (sc->max_queues + 1)) { 1373 pci_nvme_status_tc(&compl->status, 1374 NVME_SCT_COMMAND_SPECIFIC, 1375 NVME_SC_INVALID_INTERRUPT_VECTOR); 1376 return (1); 1377 } 1378 1379 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1380 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1381 /* 1382 * Queues must specify at least two entries 1383 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1384 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1385 */ 1386 pci_nvme_status_tc(&compl->status, 1387 NVME_SCT_COMMAND_SPECIFIC, 1388 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1389 return (1); 1390 } 1391 ncq->head = ncq->tail = 0; 1392 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1393 command->prp1, 1394 sizeof(struct nvme_command) * (size_t)ncq->size); 1395 1396 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1397 1398 1399 return (1); 1400 } 1401 1402 static int 1403 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1404 struct nvme_completion* compl) 1405 { 1406 uint64_t logoff; 1407 uint32_t logsize; 1408 uint8_t logpage; 1409 1410 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1411 1412 /* 1413 * Command specifies the number of dwords to return in fields NUMDU 1414 * and NUMDL. This is a zero-based value. 
1415 */ 1416 logpage = command->cdw10 & 0xFF; 1417 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1418 logsize *= sizeof(uint32_t); 1419 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1420 1421 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1422 1423 switch (logpage) { 1424 case NVME_LOG_ERROR: 1425 if (logoff >= sizeof(sc->err_log)) { 1426 pci_nvme_status_genc(&compl->status, 1427 NVME_SC_INVALID_FIELD); 1428 break; 1429 } 1430 1431 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1432 command->prp2, (uint8_t *)&sc->err_log + logoff, 1433 MIN(logsize - logoff, sizeof(sc->err_log)), 1434 NVME_COPY_TO_PRP); 1435 break; 1436 case NVME_LOG_HEALTH_INFORMATION: 1437 if (logoff >= sizeof(sc->health_log)) { 1438 pci_nvme_status_genc(&compl->status, 1439 NVME_SC_INVALID_FIELD); 1440 break; 1441 } 1442 1443 pthread_mutex_lock(&sc->mtx); 1444 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1445 sizeof(sc->health_log.data_units_read)); 1446 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1447 sizeof(sc->health_log.data_units_written)); 1448 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1449 sizeof(sc->health_log.host_read_commands)); 1450 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1451 sizeof(sc->health_log.host_write_commands)); 1452 pthread_mutex_unlock(&sc->mtx); 1453 1454 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1455 command->prp2, (uint8_t *)&sc->health_log + logoff, 1456 MIN(logsize - logoff, sizeof(sc->health_log)), 1457 NVME_COPY_TO_PRP); 1458 break; 1459 case NVME_LOG_FIRMWARE_SLOT: 1460 if (logoff >= sizeof(sc->fw_log)) { 1461 pci_nvme_status_genc(&compl->status, 1462 NVME_SC_INVALID_FIELD); 1463 break; 1464 } 1465 1466 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1467 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1468 MIN(logsize - logoff, sizeof(sc->fw_log)), 1469 NVME_COPY_TO_PRP); 1470 break; 1471 case NVME_LOG_CHANGED_NAMESPACE: 1472 if (logoff >= sizeof(sc->ns_log)) { 1473 pci_nvme_status_genc(&compl->status, 1474 NVME_SC_INVALID_FIELD); 1475 break; 1476 } 1477 1478 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1479 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1480 MIN(logsize - logoff, sizeof(sc->ns_log)), 1481 NVME_COPY_TO_PRP); 1482 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1483 break; 1484 default: 1485 DPRINTF("%s get log page %x command not supported", 1486 __func__, logpage); 1487 1488 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1489 NVME_SC_INVALID_LOG_PAGE); 1490 } 1491 1492 return (1); 1493 } 1494 1495 static int 1496 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1497 struct nvme_completion* compl) 1498 { 1499 void *dest; 1500 uint16_t status; 1501 1502 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1503 command->cdw10 & 0xFF, command->nsid); 1504 1505 status = 0; 1506 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1507 1508 switch (command->cdw10 & 0xFF) { 1509 case 0x00: /* return Identify Namespace data structure */ 1510 /* Global NS only valid with NS Management */ 1511 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1512 pci_nvme_status_genc(&status, 1513 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1514 break; 1515 } 1516 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1517 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1518 NVME_COPY_TO_PRP); 1519 break; 1520 case 0x01: /* return Identify Controller data structure */ 1521 
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1522 command->prp2, (uint8_t *)&sc->ctrldata, 1523 sizeof(sc->ctrldata), 1524 NVME_COPY_TO_PRP); 1525 break; 1526 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1527 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1528 sizeof(uint32_t) * 1024); 1529 /* All unused entries shall be zero */ 1530 memset(dest, 0, sizeof(uint32_t) * 1024); 1531 ((uint32_t *)dest)[0] = 1; 1532 break; 1533 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1534 if (command->nsid != 1) { 1535 pci_nvme_status_genc(&status, 1536 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1537 break; 1538 } 1539 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1540 sizeof(uint32_t) * 1024); 1541 /* All bytes after the descriptor shall be zero */ 1542 memset(dest, 0, sizeof(uint32_t) * 1024); 1543 1544 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1545 ((uint8_t *)dest)[0] = 1; 1546 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1547 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1548 break; 1549 case 0x13: 1550 /* 1551 * Controller list is optional but used by UNH tests. Return 1552 * a valid but empty list. 1553 */ 1554 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1555 sizeof(uint16_t) * 2048); 1556 memset(dest, 0, sizeof(uint16_t) * 2048); 1557 break; 1558 default: 1559 DPRINTF("%s unsupported identify command requested 0x%x", 1560 __func__, command->cdw10 & 0xFF); 1561 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1562 break; 1563 } 1564 1565 compl->status = status; 1566 return (1); 1567 } 1568 1569 static const char * 1570 nvme_fid_to_name(uint8_t fid) 1571 { 1572 const char *name; 1573 1574 switch (fid) { 1575 case NVME_FEAT_ARBITRATION: 1576 name = "Arbitration"; 1577 break; 1578 case NVME_FEAT_POWER_MANAGEMENT: 1579 name = "Power Management"; 1580 break; 1581 case NVME_FEAT_LBA_RANGE_TYPE: 1582 name = "LBA Range Type"; 1583 break; 1584 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1585 name = "Temperature Threshold"; 1586 break; 1587 case NVME_FEAT_ERROR_RECOVERY: 1588 name = "Error Recovery"; 1589 break; 1590 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1591 name = "Volatile Write Cache"; 1592 break; 1593 case NVME_FEAT_NUMBER_OF_QUEUES: 1594 name = "Number of Queues"; 1595 break; 1596 case NVME_FEAT_INTERRUPT_COALESCING: 1597 name = "Interrupt Coalescing"; 1598 break; 1599 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1600 name = "Interrupt Vector Configuration"; 1601 break; 1602 case NVME_FEAT_WRITE_ATOMICITY: 1603 name = "Write Atomicity Normal"; 1604 break; 1605 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1606 name = "Asynchronous Event Configuration"; 1607 break; 1608 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1609 name = "Autonomous Power State Transition"; 1610 break; 1611 case NVME_FEAT_HOST_MEMORY_BUFFER: 1612 name = "Host Memory Buffer"; 1613 break; 1614 case NVME_FEAT_TIMESTAMP: 1615 name = "Timestamp"; 1616 break; 1617 case NVME_FEAT_KEEP_ALIVE_TIMER: 1618 name = "Keep Alive Timer"; 1619 break; 1620 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1621 name = "Host Controlled Thermal Management"; 1622 break; 1623 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1624 name = "Non-Operation Power State Config"; 1625 break; 1626 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1627 name = "Read Recovery Level Config"; 1628 break; 1629 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1630 name = "Predictable Latency Mode Config"; 1631 break; 1632 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1633 name = "Predictable Latency Mode 
Window"; 1634 break; 1635 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1636 name = "LBA Status Information Report Interval"; 1637 break; 1638 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1639 name = "Host Behavior Support"; 1640 break; 1641 case NVME_FEAT_SANITIZE_CONFIG: 1642 name = "Sanitize Config"; 1643 break; 1644 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1645 name = "Endurance Group Event Configuration"; 1646 break; 1647 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1648 name = "Software Progress Marker"; 1649 break; 1650 case NVME_FEAT_HOST_IDENTIFIER: 1651 name = "Host Identifier"; 1652 break; 1653 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1654 name = "Reservation Notification Mask"; 1655 break; 1656 case NVME_FEAT_RESERVATION_PERSISTENCE: 1657 name = "Reservation Persistence"; 1658 break; 1659 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1660 name = "Namespace Write Protection Config"; 1661 break; 1662 default: 1663 name = "Unknown"; 1664 break; 1665 } 1666 1667 return (name); 1668 } 1669 1670 static void 1671 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1672 struct nvme_feature_obj *feat __unused, 1673 struct nvme_command *command __unused, 1674 struct nvme_completion *compl) 1675 { 1676 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1677 } 1678 1679 static void 1680 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1681 struct nvme_feature_obj *feat __unused, 1682 struct nvme_command *command, 1683 struct nvme_completion *compl) 1684 { 1685 uint32_t i; 1686 uint32_t cdw11 = command->cdw11; 1687 uint16_t iv; 1688 bool cd; 1689 1690 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1691 1692 iv = cdw11 & 0xffff; 1693 cd = cdw11 & (1 << 16); 1694 1695 if (iv > (sc->max_queues + 1)) { 1696 return; 1697 } 1698 1699 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1700 if ((iv == 0) && !cd) 1701 return; 1702 1703 /* Requested Interrupt Vector must be used by a CQ */ 1704 for (i = 0; i < sc->num_cqueues + 1; i++) { 1705 if (sc->compl_queues[i].intr_vec == iv) { 1706 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1707 } 1708 } 1709 } 1710 1711 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1712 static void 1713 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1714 struct nvme_feature_obj *feat __unused, 1715 struct nvme_command *command, 1716 struct nvme_completion *compl) 1717 { 1718 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1719 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1720 } 1721 1722 #define NVME_TEMP_THRESH_OVER 0 1723 #define NVME_TEMP_THRESH_UNDER 1 1724 static void 1725 nvme_feature_temperature(struct pci_nvme_softc *sc, 1726 struct nvme_feature_obj *feat __unused, 1727 struct nvme_command *command, 1728 struct nvme_completion *compl) 1729 { 1730 uint16_t tmpth; /* Temperature Threshold */ 1731 uint8_t tmpsel; /* Threshold Temperature Select */ 1732 uint8_t thsel; /* Threshold Type Select */ 1733 bool set_crit = false; 1734 bool report_crit; 1735 1736 tmpth = command->cdw11 & 0xffff; 1737 tmpsel = (command->cdw11 >> 16) & 0xf; 1738 thsel = (command->cdw11 >> 20) & 0x3; 1739 1740 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1741 1742 /* Check for unsupported values */ 1743 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1744 (thsel > NVME_TEMP_THRESH_UNDER)) { 1745 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1746 return; 1747 } 1748 1749 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1750 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1751 set_crit = true; 1752 1753 pthread_mutex_lock(&sc->mtx); 1754 if (set_crit) 1755 sc->health_log.critical_warning |= 1756 NVME_CRIT_WARN_ST_TEMPERATURE; 1757 else 1758 sc->health_log.critical_warning &= 1759 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1760 pthread_mutex_unlock(&sc->mtx); 1761 1762 report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & 1763 NVME_CRIT_WARN_ST_TEMPERATURE; 1764 1765 if (set_crit && report_crit) 1766 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1767 sc->health_log.critical_warning); 1768 1769 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1770 } 1771 1772 static void 1773 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1774 struct nvme_feature_obj *feat __unused, 1775 struct nvme_command *command, 1776 struct nvme_completion *compl) 1777 { 1778 uint16_t nqr; /* Number of Queues Requested */ 1779 1780 if (sc->num_q_is_set) { 1781 WPRINTF("%s: Number of Queues already set", __func__); 1782 pci_nvme_status_genc(&compl->status, 1783 NVME_SC_COMMAND_SEQUENCE_ERROR); 1784 return; 1785 } 1786 1787 nqr = command->cdw11 & 0xFFFF; 1788 if (nqr == 0xffff) { 1789 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1790 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1791 return; 1792 } 1793 1794 sc->num_squeues = ONE_BASED(nqr); 1795 if (sc->num_squeues > sc->max_queues) { 1796 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1797 sc->max_queues); 1798 sc->num_squeues = sc->max_queues; 1799 } 1800 1801 nqr = (command->cdw11 >> 16) & 0xFFFF; 1802 if (nqr == 0xffff) { 1803 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1804 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1805 return; 1806 } 1807 1808 sc->num_cqueues = ONE_BASED(nqr); 1809 if (sc->num_cqueues > sc->max_queues) { 1810 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1811 sc->max_queues); 1812 sc->num_cqueues = sc->max_queues; 1813 } 1814 1815 /* Patch the command value which will be saved on callback's return */ 1816 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1817 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1818 1819 sc->num_q_is_set = true; 1820 } 1821 1822 static int 1823 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1824 struct nvme_completion *compl) 1825 { 1826 struct nvme_feature_obj *feat; 1827 uint32_t nsid = command->nsid; 1828 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); 1829 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); 1830 1831 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1832 1833 if (fid >= NVME_FID_MAX) { 1834 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1835 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1836 return (1); 1837 } 1838 1839 if (sv) { 1840 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1841 NVME_SC_FEATURE_NOT_SAVEABLE); 1842 return (1); 1843 } 1844 1845 feat = &sc->feat[fid]; 1846 1847 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1848 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1849 return (1); 1850 } 1851 1852 if (!feat->namespace_specific && 1853 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1854 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1855 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1856 return (1); 1857 } 1858 1859 compl->cdw0 = 0; 1860 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1861 1862 if (feat->set) 1863 feat->set(sc, feat, command, compl); 1864 else { 1865 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1866 NVME_SC_FEATURE_NOT_CHANGEABLE); 1867 return (1); 1868 } 1869 1870 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1871 if (compl->status == NVME_SC_SUCCESS) { 1872 feat->cdw11 = command->cdw11; 1873 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1874 (command->cdw11 != 0)) 1875 pci_nvme_aen_notify(sc); 1876 } 1877 1878 return (0); 1879 } 1880 1881 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1882 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1883 1884 static int 1885 
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1886 struct nvme_completion* compl) 1887 { 1888 struct nvme_feature_obj *feat; 1889 uint8_t fid = command->cdw10 & 0xFF; 1890 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1891 1892 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1893 1894 if (fid >= NVME_FID_MAX) { 1895 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1896 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1897 return (1); 1898 } 1899 1900 compl->cdw0 = 0; 1901 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1902 1903 feat = &sc->feat[fid]; 1904 if (feat->get) { 1905 feat->get(sc, feat, command, compl); 1906 } 1907 1908 if (compl->status == NVME_SC_SUCCESS) { 1909 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1910 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1911 else 1912 compl->cdw0 = feat->cdw11; 1913 } 1914 1915 return (0); 1916 } 1917 1918 static int 1919 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1920 struct nvme_completion* compl) 1921 { 1922 uint8_t ses, lbaf, pi; 1923 1924 /* Only supports Secure Erase Setting - User Data Erase */ 1925 ses = (command->cdw10 >> 9) & 0x7; 1926 if (ses > 0x1) { 1927 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1928 return (1); 1929 } 1930 1931 /* Only supports a single LBA Format */ 1932 lbaf = command->cdw10 & 0xf; 1933 if (lbaf != 0) { 1934 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1935 NVME_SC_INVALID_FORMAT); 1936 return (1); 1937 } 1938 1939 /* Doesn't support Protection Information */ 1940 pi = (command->cdw10 >> 5) & 0x7; 1941 if (pi != 0) { 1942 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1943 return (1); 1944 } 1945 1946 if (sc->nvstore.type == NVME_STOR_RAM) { 1947 if (sc->nvstore.ctx) 1948 free(sc->nvstore.ctx); 1949 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1950 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1951 } else { 1952 struct pci_nvme_ioreq *req; 1953 int err; 1954 1955 req = pci_nvme_get_ioreq(sc); 1956 if (req == NULL) { 1957 pci_nvme_status_genc(&compl->status, 1958 NVME_SC_INTERNAL_DEVICE_ERROR); 1959 WPRINTF("%s: unable to allocate IO req", __func__); 1960 return (1); 1961 } 1962 req->nvme_sq = &sc->submit_queues[0]; 1963 req->sqid = 0; 1964 req->opc = command->opc; 1965 req->cid = command->cid; 1966 req->nsid = command->nsid; 1967 1968 req->io_req.br_offset = 0; 1969 req->io_req.br_resid = sc->nvstore.size; 1970 req->io_req.br_callback = pci_nvme_io_done; 1971 1972 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1973 if (err) { 1974 pci_nvme_status_genc(&compl->status, 1975 NVME_SC_INTERNAL_DEVICE_ERROR); 1976 pci_nvme_release_ioreq(sc, req); 1977 } else 1978 compl->status = NVME_NO_STATUS; 1979 } 1980 1981 return (1); 1982 } 1983 1984 static int 1985 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 1986 struct nvme_completion *compl) 1987 { 1988 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1989 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1990 1991 /* TODO: search for the command ID and abort it */ 1992 1993 compl->cdw0 = 1; 1994 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1995 return (1); 1996 } 1997 1998 static int 1999 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2000 struct nvme_command* command, struct nvme_completion* compl) 2001 { 2002 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2003 sc->aer_count, 
sc->ctrldata.aerl, command->cid); 2004 2005 /* Don't exceed the Async Event Request Limit (AERL). */ 2006 if (pci_nvme_aer_limit_reached(sc)) { 2007 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2008 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2009 return (1); 2010 } 2011 2012 if (pci_nvme_aer_add(sc, command->cid)) { 2013 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2014 NVME_SC_INTERNAL_DEVICE_ERROR); 2015 return (1); 2016 } 2017 2018 /* 2019 * Raise events when they happen based on the Set Features cmd. 2020 * These events happen async, so only set completion successful if 2021 * there is an event reflective of the request to get event. 2022 */ 2023 compl->status = NVME_NO_STATUS; 2024 pci_nvme_aen_notify(sc); 2025 2026 return (0); 2027 } 2028 2029 static void 2030 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2031 { 2032 struct nvme_completion compl; 2033 struct nvme_command *cmd; 2034 struct nvme_submission_queue *sq; 2035 struct nvme_completion_queue *cq; 2036 uint16_t sqhead; 2037 2038 DPRINTF("%s index %u", __func__, (uint32_t)value); 2039 2040 sq = &sc->submit_queues[0]; 2041 cq = &sc->compl_queues[0]; 2042 2043 pthread_mutex_lock(&sq->mtx); 2044 2045 sqhead = sq->head; 2046 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2047 2048 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2049 cmd = &(sq->qbase)[sqhead]; 2050 compl.cdw0 = 0; 2051 compl.status = 0; 2052 2053 switch (cmd->opc) { 2054 case NVME_OPC_DELETE_IO_SQ: 2055 DPRINTF("%s command DELETE_IO_SQ", __func__); 2056 nvme_opc_delete_io_sq(sc, cmd, &compl); 2057 break; 2058 case NVME_OPC_CREATE_IO_SQ: 2059 DPRINTF("%s command CREATE_IO_SQ", __func__); 2060 nvme_opc_create_io_sq(sc, cmd, &compl); 2061 break; 2062 case NVME_OPC_DELETE_IO_CQ: 2063 DPRINTF("%s command DELETE_IO_CQ", __func__); 2064 nvme_opc_delete_io_cq(sc, cmd, &compl); 2065 break; 2066 case NVME_OPC_CREATE_IO_CQ: 2067 DPRINTF("%s command CREATE_IO_CQ", __func__); 2068 nvme_opc_create_io_cq(sc, cmd, &compl); 2069 break; 2070 case NVME_OPC_GET_LOG_PAGE: 2071 DPRINTF("%s command GET_LOG_PAGE", __func__); 2072 nvme_opc_get_log_page(sc, cmd, &compl); 2073 break; 2074 case NVME_OPC_IDENTIFY: 2075 DPRINTF("%s command IDENTIFY", __func__); 2076 nvme_opc_identify(sc, cmd, &compl); 2077 break; 2078 case NVME_OPC_ABORT: 2079 DPRINTF("%s command ABORT", __func__); 2080 nvme_opc_abort(sc, cmd, &compl); 2081 break; 2082 case NVME_OPC_SET_FEATURES: 2083 DPRINTF("%s command SET_FEATURES", __func__); 2084 nvme_opc_set_features(sc, cmd, &compl); 2085 break; 2086 case NVME_OPC_GET_FEATURES: 2087 DPRINTF("%s command GET_FEATURES", __func__); 2088 nvme_opc_get_features(sc, cmd, &compl); 2089 break; 2090 case NVME_OPC_FIRMWARE_ACTIVATE: 2091 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2092 pci_nvme_status_tc(&compl.status, 2093 NVME_SCT_COMMAND_SPECIFIC, 2094 NVME_SC_INVALID_FIRMWARE_SLOT); 2095 break; 2096 case NVME_OPC_ASYNC_EVENT_REQUEST: 2097 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2098 nvme_opc_async_event_req(sc, cmd, &compl); 2099 break; 2100 case NVME_OPC_FORMAT_NVM: 2101 DPRINTF("%s command FORMAT_NVM", __func__); 2102 if (NVMEV(NVME_CTRLR_DATA_OACS_FORMAT, 2103 sc->ctrldata.oacs) == 0) { 2104 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2105 break; 2106 } 2107 nvme_opc_format_nvm(sc, cmd, &compl); 2108 break; 2109 case NVME_OPC_SECURITY_SEND: 2110 case NVME_OPC_SECURITY_RECEIVE: 2111 case NVME_OPC_SANITIZE: 2112 case NVME_OPC_GET_LBA_STATUS: 2113 DPRINTF("%s command OPC=%#x (unsupported)", 
__func__, 2114 cmd->opc); 2115 /* Valid but unsupported opcodes */ 2116 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2117 break; 2118 default: 2119 DPRINTF("%s command OPC=%#X (not implemented)", 2120 __func__, 2121 cmd->opc); 2122 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2123 } 2124 sqhead = (sqhead + 1) % sq->size; 2125 2126 if (NVME_COMPLETION_VALID(compl)) { 2127 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2128 compl.cdw0, 2129 cmd->cid, 2130 0, /* SQID */ 2131 compl.status); 2132 } 2133 } 2134 2135 DPRINTF("setting sqhead %u", sqhead); 2136 sq->head = sqhead; 2137 2138 if (cq->head != cq->tail) 2139 pci_generate_msix(sc->nsc_pi, 0); 2140 2141 pthread_mutex_unlock(&sq->mtx); 2142 } 2143 2144 /* 2145 * Update the Write and Read statistics reported in SMART data 2146 * 2147 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2148 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2149 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2150 */ 2151 static void 2152 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2153 size_t bytes, uint16_t status) 2154 { 2155 2156 pthread_mutex_lock(&sc->mtx); 2157 switch (opc) { 2158 case NVME_OPC_WRITE: 2159 sc->write_commands++; 2160 if (status != NVME_SC_SUCCESS) 2161 break; 2162 sc->write_dunits_remainder += (bytes / 512); 2163 while (sc->write_dunits_remainder >= 1000) { 2164 sc->write_data_units++; 2165 sc->write_dunits_remainder -= 1000; 2166 } 2167 break; 2168 case NVME_OPC_READ: 2169 sc->read_commands++; 2170 if (status != NVME_SC_SUCCESS) 2171 break; 2172 sc->read_dunits_remainder += (bytes / 512); 2173 while (sc->read_dunits_remainder >= 1000) { 2174 sc->read_data_units++; 2175 sc->read_dunits_remainder -= 1000; 2176 } 2177 break; 2178 default: 2179 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2180 break; 2181 } 2182 pthread_mutex_unlock(&sc->mtx); 2183 } 2184 2185 /* 2186 * Check if the combination of Starting LBA (slba) and number of blocks 2187 * exceeds the range of the underlying storage. 2188 * 2189 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2190 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2191 * overflow. 2192 */ 2193 static bool 2194 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2195 uint32_t nblocks) 2196 { 2197 size_t offset, bytes; 2198 2199 /* Overflow check of multiplying Starting LBA by the sector size */ 2200 if (slba >> (64 - nvstore->sectsz_bits)) 2201 return (true); 2202 2203 offset = slba << nvstore->sectsz_bits; 2204 bytes = nblocks << nvstore->sectsz_bits; 2205 2206 /* Overflow check of Number of Logical Blocks */ 2207 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2208 return (true); 2209 2210 return (false); 2211 } 2212 2213 static int 2214 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2215 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2216 { 2217 int iovidx; 2218 bool range_is_contiguous; 2219 2220 if (req == NULL) 2221 return (-1); 2222 2223 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2224 return (-1); 2225 } 2226 2227 /* 2228 * Minimize the number of IOVs by concatenating contiguous address 2229 * ranges. If the IOV count is zero, there is no previous range to 2230 * concatenate. 
2231 */ 2232 if (req->io_req.br_iovcnt == 0) 2233 range_is_contiguous = false; 2234 else 2235 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2236 2237 if (range_is_contiguous) { 2238 iovidx = req->io_req.br_iovcnt - 1; 2239 2240 req->io_req.br_iov[iovidx].iov_base = 2241 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2242 req->prev_gpaddr, size); 2243 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2244 return (-1); 2245 2246 req->prev_size += size; 2247 req->io_req.br_resid += size; 2248 2249 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2250 } else { 2251 iovidx = req->io_req.br_iovcnt; 2252 if (iovidx == 0) { 2253 req->io_req.br_offset = offset; 2254 req->io_req.br_resid = 0; 2255 req->io_req.br_param = req; 2256 } 2257 2258 req->io_req.br_iov[iovidx].iov_base = 2259 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2260 gpaddr, size); 2261 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2262 return (-1); 2263 2264 req->io_req.br_iov[iovidx].iov_len = size; 2265 2266 req->prev_gpaddr = gpaddr; 2267 req->prev_size = size; 2268 req->io_req.br_resid += size; 2269 2270 req->io_req.br_iovcnt++; 2271 } 2272 2273 return (0); 2274 } 2275 2276 static void 2277 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2278 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2279 { 2280 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2281 2282 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2283 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2284 NVME_STATUS_GET_SC(status)); 2285 2286 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2287 2288 if (cq->head != cq->tail) { 2289 if (cq->intr_en & NVME_CQ_INTEN) { 2290 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2291 } else { 2292 DPRINTF("%s: CQ%u interrupt disabled", 2293 __func__, sq->cqid); 2294 } 2295 } 2296 } 2297 2298 static void 2299 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2300 { 2301 req->sc = NULL; 2302 req->nvme_sq = NULL; 2303 req->sqid = 0; 2304 2305 pthread_mutex_lock(&sc->mtx); 2306 2307 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2308 sc->pending_ios--; 2309 2310 /* when no more IO pending, can set to ready if device reset/enabled */ 2311 if (sc->pending_ios == 0 && 2312 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2313 sc->regs.csts |= NVME_CSTS_RDY; 2314 2315 pthread_mutex_unlock(&sc->mtx); 2316 2317 sem_post(&sc->iosemlock); 2318 } 2319 2320 static struct pci_nvme_ioreq * 2321 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2322 { 2323 struct pci_nvme_ioreq *req = NULL; 2324 2325 sem_wait(&sc->iosemlock); 2326 pthread_mutex_lock(&sc->mtx); 2327 2328 req = STAILQ_FIRST(&sc->ioreqs_free); 2329 assert(req != NULL); 2330 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2331 2332 req->sc = sc; 2333 2334 sc->pending_ios++; 2335 2336 pthread_mutex_unlock(&sc->mtx); 2337 2338 req->io_req.br_iovcnt = 0; 2339 req->io_req.br_offset = 0; 2340 req->io_req.br_resid = 0; 2341 req->io_req.br_param = req; 2342 req->prev_gpaddr = 0; 2343 req->prev_size = 0; 2344 2345 return req; 2346 } 2347 2348 static void 2349 pci_nvme_io_done(struct blockif_req *br, int err) 2350 { 2351 struct pci_nvme_ioreq *req = br->br_param; 2352 struct nvme_submission_queue *sq = req->nvme_sq; 2353 uint16_t code, status; 2354 2355 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2356 2357 /* TODO return correct error */ 2358 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	status = 0;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc __unused,
    struct nvme_command *cmd __unused,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	/*
	 * A Write command moves data from the guest's PRP entries into the
	 * backing store; a Read moves data from the backing store out to
	 * the guest.
	 */
	if (is_write)
		dir = NVME_COPY_FROM_PRP;
	else
		dir = NVME_COPY_TO_PRP;

	status = 0;
	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
		err = -1;
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
			err = -1;
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is a pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				if (prp_list == NULL) {
					err = -1;
					goto out;
				}
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
size, 2487 offset)) { 2488 err = -1; 2489 goto out; 2490 } 2491 2492 offset += size; 2493 bytes -= size; 2494 2495 prp_list++; 2496 } 2497 } 2498 req->io_req.br_callback = pci_nvme_io_done; 2499 if (is_write) 2500 err = blockif_write(nvstore->ctx, &req->io_req); 2501 else 2502 err = blockif_read(nvstore->ctx, &req->io_req); 2503 out: 2504 if (err) 2505 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2506 2507 return (status); 2508 } 2509 2510 static bool 2511 nvme_opc_write_read(struct pci_nvme_softc *sc, 2512 struct nvme_command *cmd, 2513 struct pci_nvme_blockstore *nvstore, 2514 struct pci_nvme_ioreq *req, 2515 uint16_t *status) 2516 { 2517 uint64_t lba, nblocks, bytes; 2518 size_t offset; 2519 bool is_write = cmd->opc == NVME_OPC_WRITE; 2520 bool pending = false; 2521 2522 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2523 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2524 bytes = nblocks << nvstore->sectsz_bits; 2525 if (bytes > NVME_MAX_DATA_SIZE) { 2526 WPRINTF("%s command would exceed MDTS", __func__); 2527 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2528 goto out; 2529 } 2530 2531 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2532 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2533 __func__, lba, nblocks); 2534 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2535 goto out; 2536 } 2537 2538 offset = lba << nvstore->sectsz_bits; 2539 2540 req->bytes = bytes; 2541 req->io_req.br_offset = lba; 2542 2543 /* PRP bits 1:0 must be zero */ 2544 cmd->prp1 &= ~0x3UL; 2545 cmd->prp2 &= ~0x3UL; 2546 2547 if (nvstore->type == NVME_STOR_RAM) { 2548 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2549 cmd->prp2, offset, bytes, is_write); 2550 } else { 2551 *status = nvme_write_read_blockif(sc, nvstore, req, 2552 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2553 2554 if (*status == NVME_NO_STATUS) 2555 pending = true; 2556 } 2557 out: 2558 if (!pending) 2559 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2560 2561 return (pending); 2562 } 2563 2564 static void 2565 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2566 { 2567 struct pci_nvme_ioreq *req = br->br_param; 2568 struct pci_nvme_softc *sc = req->sc; 2569 bool done = true; 2570 uint16_t status; 2571 2572 status = 0; 2573 if (err) { 2574 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2575 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2576 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2577 } else { 2578 struct iovec *iov = req->io_req.br_iov; 2579 2580 req->prev_gpaddr++; 2581 iov += req->prev_gpaddr; 2582 2583 /* The iov_* values already include the sector size */ 2584 req->io_req.br_offset = (off_t)iov->iov_base; 2585 req->io_req.br_resid = iov->iov_len; 2586 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2587 pci_nvme_status_genc(&status, 2588 NVME_SC_INTERNAL_DEVICE_ERROR); 2589 } else 2590 done = false; 2591 } 2592 2593 if (done) { 2594 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2595 status); 2596 pci_nvme_release_ioreq(sc, req); 2597 } 2598 } 2599 2600 static bool 2601 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2602 struct nvme_command *cmd, 2603 struct pci_nvme_blockstore *nvstore, 2604 struct pci_nvme_ioreq *req, 2605 uint16_t *status) 2606 { 2607 struct nvme_dsm_range *range = NULL; 2608 uint32_t nr, r, non_zero, dr; 2609 int err; 2610 bool pending = false; 2611 2612 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2613 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2614 goto out; 2615 } 
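	/*
	 * Command layout, as consumed below: cdw10 bits 7:0 carry the
	 * zero-based Number of Ranges (NR), cdw11 carries the attributes
	 * (only Deallocate is acted on here), and PRP1/PRP2 describe a guest
	 * buffer holding an array of struct nvme_dsm_range entries, each
	 * giving a length and a starting LBA.
	 */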
2616 2617 nr = cmd->cdw10 & 0xff; 2618 2619 /* copy locally because a range entry could straddle PRPs */ 2620 #ifdef __FreeBSD__ 2621 range = calloc(1, NVME_MAX_DSM_TRIM); 2622 #else 2623 _Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0, 2624 "NVME_MAX_DSM_TRIM is not a multiple of struct size"); 2625 range = calloc(NVME_MAX_DSM_TRIM / sizeof (*range), sizeof (*range)); 2626 #endif 2627 if (range == NULL) { 2628 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2629 goto out; 2630 } 2631 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2632 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2633 2634 /* Check for invalid ranges and the number of non-zero lengths */ 2635 non_zero = 0; 2636 for (r = 0; r <= nr; r++) { 2637 if (pci_nvme_out_of_range(nvstore, 2638 range[r].starting_lba, range[r].length)) { 2639 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2640 goto out; 2641 } 2642 if (range[r].length != 0) 2643 non_zero++; 2644 } 2645 2646 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2647 size_t offset, bytes; 2648 int sectsz_bits = sc->nvstore.sectsz_bits; 2649 2650 /* 2651 * DSM calls are advisory only, and compliant controllers 2652 * may choose to take no actions (i.e. return Success). 2653 */ 2654 if (!nvstore->deallocate) { 2655 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2656 goto out; 2657 } 2658 2659 /* If all ranges have a zero length, return Success */ 2660 if (non_zero == 0) { 2661 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2662 goto out; 2663 } 2664 2665 if (req == NULL) { 2666 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2667 goto out; 2668 } 2669 2670 offset = range[0].starting_lba << sectsz_bits; 2671 bytes = range[0].length << sectsz_bits; 2672 2673 /* 2674 * If the request is for more than a single range, store 2675 * the ranges in the br_iov. Optimize for the common case 2676 * of a single range. 
2677 * 2678 * Note that NVMe Number of Ranges is a zero based value 2679 */ 2680 req->io_req.br_iovcnt = 0; 2681 req->io_req.br_offset = offset; 2682 req->io_req.br_resid = bytes; 2683 2684 if (nr == 0) { 2685 req->io_req.br_callback = pci_nvme_io_done; 2686 } else { 2687 struct iovec *iov = req->io_req.br_iov; 2688 2689 for (r = 0, dr = 0; r <= nr; r++) { 2690 offset = range[r].starting_lba << sectsz_bits; 2691 bytes = range[r].length << sectsz_bits; 2692 if (bytes == 0) 2693 continue; 2694 2695 if ((nvstore->size - offset) < bytes) { 2696 pci_nvme_status_genc(status, 2697 NVME_SC_LBA_OUT_OF_RANGE); 2698 goto out; 2699 } 2700 iov[dr].iov_base = (void *)offset; 2701 iov[dr].iov_len = bytes; 2702 dr++; 2703 } 2704 req->io_req.br_callback = pci_nvme_dealloc_sm; 2705 2706 /* 2707 * Use prev_gpaddr to track the current entry and 2708 * prev_size to track the number of entries 2709 */ 2710 req->prev_gpaddr = 0; 2711 req->prev_size = dr; 2712 } 2713 2714 err = blockif_delete(nvstore->ctx, &req->io_req); 2715 if (err) 2716 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2717 else 2718 pending = true; 2719 } 2720 out: 2721 free(range); 2722 return (pending); 2723 } 2724 2725 static void 2726 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2727 { 2728 struct nvme_submission_queue *sq; 2729 uint16_t status; 2730 uint16_t sqhead; 2731 2732 /* handle all submissions up to sq->tail index */ 2733 sq = &sc->submit_queues[idx]; 2734 2735 pthread_mutex_lock(&sq->mtx); 2736 2737 sqhead = sq->head; 2738 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2739 idx, sqhead, sq->tail, sq->qbase); 2740 2741 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2742 struct nvme_command *cmd; 2743 struct pci_nvme_ioreq *req; 2744 uint32_t nsid; 2745 bool pending; 2746 2747 pending = false; 2748 req = NULL; 2749 status = 0; 2750 2751 cmd = &sq->qbase[sqhead]; 2752 sqhead = (sqhead + 1) % sq->size; 2753 2754 nsid = le32toh(cmd->nsid); 2755 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2756 pci_nvme_status_genc(&status, 2757 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2758 status |= NVMEM(NVME_STATUS_DNR); 2759 goto complete; 2760 } 2761 2762 req = pci_nvme_get_ioreq(sc); 2763 if (req == NULL) { 2764 pci_nvme_status_genc(&status, 2765 NVME_SC_INTERNAL_DEVICE_ERROR); 2766 WPRINTF("%s: unable to allocate IO req", __func__); 2767 goto complete; 2768 } 2769 req->nvme_sq = sq; 2770 req->sqid = idx; 2771 req->opc = cmd->opc; 2772 req->cid = cmd->cid; 2773 req->nsid = cmd->nsid; 2774 2775 switch (cmd->opc) { 2776 case NVME_OPC_FLUSH: 2777 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2778 req, &status); 2779 break; 2780 case NVME_OPC_WRITE: 2781 case NVME_OPC_READ: 2782 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2783 req, &status); 2784 break; 2785 case NVME_OPC_WRITE_ZEROES: 2786 /* TODO: write zeroes 2787 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2788 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2789 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2790 break; 2791 case NVME_OPC_DATASET_MANAGEMENT: 2792 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2793 req, &status); 2794 break; 2795 default: 2796 WPRINTF("%s unhandled io command 0x%x", 2797 __func__, cmd->opc); 2798 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2799 } 2800 complete: 2801 if (!pending) { 2802 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2803 if (req != NULL) 2804 pci_nvme_release_ioreq(sc, req); 2805 } 2806 } 2807 2808 sq->head = sqhead; 2809 2810 pthread_mutex_unlock(&sq->mtx); 2811 
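	/*
	 * Nothing further to do here: the completion (and any MSI-X
	 * interrupt) for each command was already posted by
	 * pci_nvme_set_completion(), either in the loop above for immediate
	 * completions or later from the blockif callbacks
	 * (pci_nvme_io_done()/pci_nvme_dealloc_sm()) for deferred requests.
	 */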
} 2812 2813 static void 2814 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2815 uint64_t idx, int is_sq, uint64_t value) 2816 { 2817 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2818 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2819 2820 if (is_sq) { 2821 if (idx > sc->num_squeues) { 2822 WPRINTF("%s queue index %lu overflow from " 2823 "guest (max %u)", 2824 __func__, idx, sc->num_squeues); 2825 return; 2826 } 2827 2828 atomic_store_short(&sc->submit_queues[idx].tail, 2829 (uint16_t)value); 2830 2831 if (idx == 0) { 2832 pci_nvme_handle_admin_cmd(sc, value); 2833 } else { 2834 /* submission queue; handle new entries in SQ */ 2835 if (idx > sc->num_squeues) { 2836 WPRINTF("%s SQ index %lu overflow from " 2837 "guest (max %u)", 2838 __func__, idx, sc->num_squeues); 2839 return; 2840 } 2841 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2842 } 2843 } else { 2844 if (idx > sc->num_cqueues) { 2845 WPRINTF("%s queue index %lu overflow from " 2846 "guest (max %u)", 2847 __func__, idx, sc->num_cqueues); 2848 return; 2849 } 2850 2851 atomic_store_short(&sc->compl_queues[idx].head, 2852 (uint16_t)value); 2853 } 2854 } 2855 2856 static void 2857 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2858 { 2859 const char *s = iswrite ? "WRITE" : "READ"; 2860 2861 switch (offset) { 2862 case NVME_CR_CAP_LOW: 2863 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2864 break; 2865 case NVME_CR_CAP_HI: 2866 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2867 break; 2868 case NVME_CR_VS: 2869 DPRINTF("%s %s NVME_CR_VS", func, s); 2870 break; 2871 case NVME_CR_INTMS: 2872 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2873 break; 2874 case NVME_CR_INTMC: 2875 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2876 break; 2877 case NVME_CR_CC: 2878 DPRINTF("%s %s NVME_CR_CC", func, s); 2879 break; 2880 case NVME_CR_CSTS: 2881 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2882 break; 2883 case NVME_CR_NSSR: 2884 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2885 break; 2886 case NVME_CR_AQA: 2887 DPRINTF("%s %s NVME_CR_AQA", func, s); 2888 break; 2889 case NVME_CR_ASQ_LOW: 2890 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2891 break; 2892 case NVME_CR_ASQ_HI: 2893 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2894 break; 2895 case NVME_CR_ACQ_LOW: 2896 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2897 break; 2898 case NVME_CR_ACQ_HI: 2899 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2900 break; 2901 default: 2902 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2903 } 2904 2905 } 2906 2907 static void 2908 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2909 uint64_t value) 2910 { 2911 uint32_t ccreg; 2912 2913 if (offset >= NVME_DOORBELL_OFFSET) { 2914 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2915 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2916 int is_sq = (belloffset % 8) < 4; 2917 2918 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2919 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2920 offset); 2921 return; 2922 } 2923 2924 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2925 WPRINTF("guest attempted an overflow write offset " 2926 "0x%lx, val 0x%lx in %s", 2927 offset, value, __func__); 2928 return; 2929 } 2930 2931 if (is_sq) { 2932 if (sc->submit_queues[idx].qbase == NULL) 2933 return; 2934 } else if (sc->compl_queues[idx].qbase == NULL) 2935 return; 2936 2937 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2938 return; 2939 } 2940 2941 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2942 offset, size, value); 2943 2944 if (size != 4) { 2945 WPRINTF("guest 
wrote invalid size %d (offset 0x%lx, " 2946 "val 0x%lx) to bar0 in %s", 2947 size, offset, value, __func__); 2948 /* TODO: shutdown device */ 2949 return; 2950 } 2951 2952 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2953 2954 pthread_mutex_lock(&sc->mtx); 2955 2956 switch (offset) { 2957 case NVME_CR_CAP_LOW: 2958 case NVME_CR_CAP_HI: 2959 /* readonly */ 2960 break; 2961 case NVME_CR_VS: 2962 /* readonly */ 2963 break; 2964 case NVME_CR_INTMS: 2965 /* MSI-X, so ignore */ 2966 break; 2967 case NVME_CR_INTMC: 2968 /* MSI-X, so ignore */ 2969 break; 2970 case NVME_CR_CC: 2971 ccreg = (uint32_t)value; 2972 2973 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2974 "iocqes %u", 2975 __func__, 2976 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2977 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2978 NVME_CC_GET_IOCQES(ccreg)); 2979 2980 if (NVME_CC_GET_SHN(ccreg)) { 2981 /* perform shutdown - flush out data to backend */ 2982 sc->regs.csts &= ~NVMEM(NVME_CSTS_REG_SHST); 2983 sc->regs.csts |= NVMEF(NVME_CSTS_REG_SHST, 2984 NVME_SHST_COMPLETE); 2985 } 2986 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2987 if (NVME_CC_GET_EN(ccreg) == 0) 2988 /* transition 1-> causes controller reset */ 2989 pci_nvme_reset_locked(sc); 2990 else 2991 pci_nvme_init_controller(sc); 2992 } 2993 2994 /* Insert the iocqes, iosqes and en bits from the write */ 2995 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2996 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2997 if (NVME_CC_GET_EN(ccreg) == 0) { 2998 /* Insert the ams, mps and css bit fields */ 2999 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3000 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3001 sc->regs.csts &= ~NVME_CSTS_RDY; 3002 } else if ((sc->pending_ios == 0) && 3003 !(sc->regs.csts & NVME_CSTS_CFS)) { 3004 sc->regs.csts |= NVME_CSTS_RDY; 3005 } 3006 break; 3007 case NVME_CR_CSTS: 3008 break; 3009 case NVME_CR_NSSR: 3010 /* ignore writes; don't support subsystem reset */ 3011 break; 3012 case NVME_CR_AQA: 3013 sc->regs.aqa = (uint32_t)value; 3014 break; 3015 case NVME_CR_ASQ_LOW: 3016 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3017 (0xFFFFF000 & value); 3018 break; 3019 case NVME_CR_ASQ_HI: 3020 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3021 (value << 32); 3022 break; 3023 case NVME_CR_ACQ_LOW: 3024 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3025 (0xFFFFF000 & value); 3026 break; 3027 case NVME_CR_ACQ_HI: 3028 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3029 (value << 32); 3030 break; 3031 default: 3032 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3033 __func__, offset, value, size); 3034 } 3035 pthread_mutex_unlock(&sc->mtx); 3036 } 3037 3038 static void 3039 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3040 uint64_t value) 3041 { 3042 struct pci_nvme_softc* sc = pi->pi_arg; 3043 3044 if (baridx == pci_msix_table_bar(pi) || 3045 baridx == pci_msix_pba_bar(pi)) { 3046 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3047 " value 0x%lx", baridx, offset, size, value); 3048 3049 pci_emul_msix_twrite(pi, offset, size, value); 3050 return; 3051 } 3052 3053 switch (baridx) { 3054 case 0: 3055 pci_nvme_write_bar_0(sc, offset, size, value); 3056 break; 3057 3058 default: 3059 DPRINTF("%s unknown baridx %d, val 0x%lx", 3060 __func__, baridx, value); 3061 } 3062 } 3063 3064 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3065 uint64_t offset, int size) 3066 { 3067 uint64_t value; 3068 3069 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3070 
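	/*
	 * Register reads below the doorbell area are satisfied from the
	 * shadow copy in sc->regs; reads within the doorbell area are not
	 * supported and return zero.
	 */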
3071 if (offset < NVME_DOORBELL_OFFSET) { 3072 void *p = &(sc->regs); 3073 pthread_mutex_lock(&sc->mtx); 3074 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3075 pthread_mutex_unlock(&sc->mtx); 3076 } else { 3077 value = 0; 3078 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3079 } 3080 3081 switch (size) { 3082 case 1: 3083 value &= 0xFF; 3084 break; 3085 case 2: 3086 value &= 0xFFFF; 3087 break; 3088 case 4: 3089 value &= 0xFFFFFFFF; 3090 break; 3091 } 3092 3093 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3094 offset, size, (uint32_t)value); 3095 3096 return (value); 3097 } 3098 3099 3100 3101 static uint64_t 3102 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3103 { 3104 struct pci_nvme_softc* sc = pi->pi_arg; 3105 3106 if (baridx == pci_msix_table_bar(pi) || 3107 baridx == pci_msix_pba_bar(pi)) { 3108 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3109 baridx, offset, size); 3110 3111 return pci_emul_msix_tread(pi, offset, size); 3112 } 3113 3114 switch (baridx) { 3115 case 0: 3116 return pci_nvme_read_bar_0(sc, offset, size); 3117 3118 default: 3119 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3120 } 3121 3122 return (0); 3123 } 3124 3125 static int 3126 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3127 { 3128 char bident[sizeof("XXX:XXX")]; 3129 const char *value; 3130 uint32_t sectsz; 3131 3132 sc->max_queues = NVME_QUEUES; 3133 sc->max_qentries = NVME_MAX_QENTRIES; 3134 sc->ioslots = NVME_IOSLOTS; 3135 sc->num_squeues = sc->max_queues; 3136 sc->num_cqueues = sc->max_queues; 3137 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3138 sectsz = 0; 3139 #ifdef __FreeBSD__ 3140 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3141 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3142 #else 3143 snprintf((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3144 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3145 #endif 3146 3147 value = get_config_value_node(nvl, "maxq"); 3148 if (value != NULL) 3149 sc->max_queues = atoi(value); 3150 value = get_config_value_node(nvl, "qsz"); 3151 if (value != NULL) { 3152 sc->max_qentries = atoi(value); 3153 if (sc->max_qentries <= 0) { 3154 EPRINTLN("nvme: Invalid qsz option %d", 3155 sc->max_qentries); 3156 return (-1); 3157 } 3158 } 3159 value = get_config_value_node(nvl, "ioslots"); 3160 if (value != NULL) { 3161 sc->ioslots = atoi(value); 3162 if (sc->ioslots <= 0) { 3163 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3164 return (-1); 3165 } 3166 } 3167 value = get_config_value_node(nvl, "sectsz"); 3168 if (value != NULL) 3169 sectsz = atoi(value); 3170 value = get_config_value_node(nvl, "ser"); 3171 if (value != NULL) { 3172 /* 3173 * This field indicates the Product Serial Number in 3174 * 7-bit ASCII, unused bytes should be space characters. 3175 * Ref: NVMe v1.3c. 
3176 */ 3177 cpywithpad((char *)sc->ctrldata.sn, 3178 sizeof(sc->ctrldata.sn), value, ' '); 3179 } 3180 value = get_config_value_node(nvl, "eui64"); 3181 if (value != NULL) 3182 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3183 value = get_config_value_node(nvl, "dsm"); 3184 if (value != NULL) { 3185 if (strcmp(value, "auto") == 0) 3186 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3187 else if (strcmp(value, "enable") == 0) 3188 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3189 else if (strcmp(value, "disable") == 0) 3190 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3191 } 3192 3193 value = get_config_value_node(nvl, "bootindex"); 3194 if (value != NULL) { 3195 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3196 EPRINTLN("Invalid bootindex %d", atoi(value)); 3197 return (-1); 3198 } 3199 } 3200 3201 value = get_config_value_node(nvl, "ram"); 3202 if (value != NULL) { 3203 uint64_t sz = strtoull(value, NULL, 10); 3204 3205 sc->nvstore.type = NVME_STOR_RAM; 3206 sc->nvstore.size = sz * 1024 * 1024; 3207 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3208 sc->nvstore.sectsz = 4096; 3209 sc->nvstore.sectsz_bits = 12; 3210 if (sc->nvstore.ctx == NULL) { 3211 EPRINTLN("nvme: Unable to allocate RAM"); 3212 return (-1); 3213 } 3214 } else { 3215 snprintf(bident, sizeof(bident), "%u:%u", 3216 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3217 sc->nvstore.ctx = blockif_open(nvl, bident); 3218 if (sc->nvstore.ctx == NULL) { 3219 EPRINTLN("nvme: Could not open backing file: %s", 3220 strerror(errno)); 3221 return (-1); 3222 } 3223 sc->nvstore.type = NVME_STOR_BLOCKIF; 3224 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3225 } 3226 3227 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3228 sc->nvstore.sectsz = sectsz; 3229 else if (sc->nvstore.type != NVME_STOR_RAM) 3230 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3231 for (sc->nvstore.sectsz_bits = 9; 3232 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3233 sc->nvstore.sectsz_bits++); 3234 3235 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3236 sc->max_queues = NVME_QUEUES; 3237 3238 return (0); 3239 } 3240 3241 static void 3242 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3243 size_t new_size) 3244 { 3245 struct pci_nvme_softc *sc; 3246 struct pci_nvme_blockstore *nvstore; 3247 struct nvme_namespace_data *nd; 3248 3249 sc = arg; 3250 nvstore = &sc->nvstore; 3251 nd = &sc->nsdata; 3252 3253 nvstore->size = new_size; 3254 pci_nvme_init_nsdata_size(nvstore, nd); 3255 3256 /* Add changed NSID to list */ 3257 sc->ns_log.ns[0] = 1; 3258 sc->ns_log.ns[1] = 0; 3259 3260 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3261 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3262 } 3263 3264 static int 3265 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3266 { 3267 struct pci_nvme_softc *sc; 3268 uint32_t pci_membar_sz; 3269 int error; 3270 3271 error = 0; 3272 3273 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3274 pi->pi_arg = sc; 3275 sc->nsc_pi = pi; 3276 3277 error = pci_nvme_parse_config(sc, nvl); 3278 if (error < 0) 3279 goto done; 3280 else 3281 error = 0; 3282 3283 STAILQ_INIT(&sc->ioreqs_free); 3284 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3285 for (uint32_t i = 0; i < sc->ioslots; i++) { 3286 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3287 } 3288 3289 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3290 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3291 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3292 
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3293 pci_set_cfgdata8(pi, PCIR_PROGIF, 3294 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3295 3296 /* 3297 * Allocate size of NVMe registers + doorbell space for all queues. 3298 * 3299 * The specification requires a minimum memory I/O window size of 16K. 3300 * The Windows driver will refuse to start a device with a smaller 3301 * window. 3302 */ 3303 pci_membar_sz = sizeof(struct nvme_registers) + 3304 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3305 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3306 3307 DPRINTF("nvme membar size: %u", pci_membar_sz); 3308 3309 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3310 if (error) { 3311 WPRINTF("%s pci alloc mem bar failed", __func__); 3312 goto done; 3313 } 3314 3315 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3316 if (error) { 3317 WPRINTF("%s pci add msixcap failed", __func__); 3318 goto done; 3319 } 3320 3321 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3322 if (error) { 3323 WPRINTF("%s pci add Express capability failed", __func__); 3324 goto done; 3325 } 3326 3327 pthread_mutex_init(&sc->mtx, NULL); 3328 sem_init(&sc->iosemlock, 0, sc->ioslots); 3329 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3330 3331 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3332 /* 3333 * Controller data depends on Namespace data so initialize Namespace 3334 * data first. 3335 */ 3336 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3337 pci_nvme_init_ctrldata(sc); 3338 pci_nvme_init_logpages(sc); 3339 pci_nvme_init_features(sc); 3340 3341 pci_nvme_aer_init(sc); 3342 pci_nvme_aen_init(sc); 3343 3344 pci_nvme_reset(sc); 3345 done: 3346 return (error); 3347 } 3348 3349 static int 3350 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3351 { 3352 char *cp, *ram; 3353 3354 if (opts == NULL) 3355 return (0); 3356 3357 if (strncmp(opts, "ram=", 4) == 0) { 3358 cp = strchr(opts, ','); 3359 if (cp == NULL) { 3360 set_config_value_node(nvl, "ram", opts + 4); 3361 return (0); 3362 } 3363 ram = strndup(opts + 4, cp - opts - 4); 3364 set_config_value_node(nvl, "ram", ram); 3365 free(ram); 3366 return (pci_parse_legacy_config(nvl, cp + 1)); 3367 } else 3368 return (blockif_legacy_config(nvl, opts)); 3369 } 3370 3371 static const struct pci_devemu pci_de_nvme = { 3372 .pe_emu = "nvme", 3373 .pe_init = pci_nvme_init, 3374 .pe_legacy_config = pci_nvme_legacy_config, 3375 .pe_barwrite = pci_nvme_write, 3376 .pe_barread = pci_nvme_read 3377 }; 3378 PCI_EMUL_SET(pci_de_nvme); 3379
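/*
 * Illustrative sketch (guarded out, never compiled into the emulation):
 * how a BAR0 MMIO offset in the doorbell area maps to a queue, mirroring
 * the arithmetic in pci_nvme_write_bar_0() and pci_nvme_handle_doorbell()
 * above. Each queue pair owns two 4-byte doorbells: the SQ tail followed
 * by the CQ head. The function name below is hypothetical and exists only
 * for illustration.
 */
#if 0
static void
nvme_doorbell_decode_example(uint64_t offset)
{
	uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
	uint64_t idx = belloffset / 8;		/* queue pair index; 0 is admin */
	int is_sq = (belloffset % 8) < 4;	/* SQ tail or CQ head doorbell */

	printf("offset %#lx -> %s doorbell for queue %lu\n",
	    offset, is_sq ? "SQ" : "CQ", idx);
}
#endif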