/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20 chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = Data Set Management support. One of auto, enable, or disable
 *
 */
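/*
 * Example invocations (illustrative values only, not the only valid forms):
 *  a zvol-backed namespace:
 *    -s 4,nvme,/dev/zvol/tank/nvme0,maxq=4,qsz=512,ioslots=16,ser=BHYVE0001
 *  a RAM-backed namespace of 1 GiB:
 *    -s 4,nvme,ram=1024
 */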
/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/crc16.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
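/*
 * Illustrative arithmetic for the values above: with MPSMIN = 0 the minimum
 * memory page size is 4 KiB, so MDTS = 9 advertises a maximum data transfer
 * of 2^9 * 4 KiB = 2 MiB per I/O, i.e. at most 512 page-sized PRP entries
 * plus one possibly unaligned lead-in descriptor (NVME_MAX_IOVEC = 513).
 */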
/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define	PCI_NVME_AEI_NOTICE_SHIFT	8
#define	PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define	PCI_NVME_AEN_DEFAULT_MASK	0x11f
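/*
 * Decode of the default mask (per the Asynchronous Event Configuration
 * feature layout): bits 7:0 enable the SMART / Health critical warnings and
 * bit 8 enables Namespace Attribute Notices, so 0x11f turns on the five
 * critical warning bits plus namespace attribute change notices.
 */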
typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
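/*
 * Usage note (illustrative): cpywithpad() produces the fixed-width, space
 * padded, non NUL-terminated fields Identify data expects. For example,
 * copying "bhyve-NVMe" into the 40-byte Model Number field below leaves the
 * remaining 30 bytes filled with ' '.
 */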
static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;
	int ret;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0xfc;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0x58;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;
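	/*
	 * 0x0157 is 343 Kelvin, roughly 70 degrees Celsius, comfortably above
	 * the fixed 296 K (room temperature) reading this emulation reports.
	 */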
	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;

	ret = snprintf(cd->subnqn, sizeof(cd->subnqn),
	    "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u",
	    get_config_value("name"), sc->nsc_pi->pi_bus,
	    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
	if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn)))
		EPRINTLN("%s: error setting subnqn (%d)", __func__, ret);
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}
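/*
 * Example (hypothetical backing store): a 16 GiB device with 512-byte
 * sectors reports NSZE = NCAP = NUSE = 16 GiB / 512 = 33,554,432 blocks.
 */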
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}
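/*
 * Dispatch convention (summary of the table built above): features left with
 * a NULL .set callback fail Set Features with Feature Not Changeable, a NULL
 * .get simply returns the cached CDW11 value, and unhandled feature IDs get
 * nvme_feature_invalid_cb for both, which reports Invalid Field.
 */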
static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}
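/*
 * Flow summary (derived from the routines above and below): the host parks
 * Asynchronous Event Request commands with the controller; when an event of
 * some type is posted, the aen_thr thread pairs it with a queued AER and
 * completes that AER on the Admin Completion Queue, packing the log page id,
 * event information, and event type into CDW0.
 */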
/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
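/*
 * Note on limits (derived from the checks above): nvme_prp_memcpy() handles
 * at most two pages of guest memory, PRP1 from its offset to the end of its
 * page and PRP2 for at most one further page, in addition to the explicit
 * 8 KiB cap. In this file it is only used for small admin transfers such as
 * log pages and Identify data.
 */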
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
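/*
 * Note on the CNS 0x03 case above: the response is a Namespace Identification
 * Descriptor list. This controller returns a single descriptor, NIDT 1
 * (IEEE EUI-64) with NIDL 8, followed by zeroes.
 */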
Window"; 1618 break; 1619 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1620 name = "LBA Status Information Report Interval"; 1621 break; 1622 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1623 name = "Host Behavior Support"; 1624 break; 1625 case NVME_FEAT_SANITIZE_CONFIG: 1626 name = "Sanitize Config"; 1627 break; 1628 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1629 name = "Endurance Group Event Configuration"; 1630 break; 1631 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1632 name = "Software Progress Marker"; 1633 break; 1634 case NVME_FEAT_HOST_IDENTIFIER: 1635 name = "Host Identifier"; 1636 break; 1637 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1638 name = "Reservation Notification Mask"; 1639 break; 1640 case NVME_FEAT_RESERVATION_PERSISTENCE: 1641 name = "Reservation Persistence"; 1642 break; 1643 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1644 name = "Namespace Write Protection Config"; 1645 break; 1646 default: 1647 name = "Unknown"; 1648 break; 1649 } 1650 1651 return (name); 1652 } 1653 1654 static void 1655 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1656 struct nvme_feature_obj *feat __unused, 1657 struct nvme_command *command __unused, 1658 struct nvme_completion *compl) 1659 { 1660 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1661 } 1662 1663 static void 1664 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1665 struct nvme_feature_obj *feat __unused, 1666 struct nvme_command *command, 1667 struct nvme_completion *compl) 1668 { 1669 uint32_t i; 1670 uint32_t cdw11 = command->cdw11; 1671 uint16_t iv; 1672 bool cd; 1673 1674 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1675 1676 iv = cdw11 & 0xffff; 1677 cd = cdw11 & (1 << 16); 1678 1679 if (iv > (sc->max_queues + 1)) { 1680 return; 1681 } 1682 1683 /* No Interrupt Coalescing (i.e. 
static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define	NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define	NVME_TEMP_THRESH_OVER	0
#define	NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel; /* Threshold Temperature Select */
	uint8_t		thsel;  /* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}
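/*
 * Behavioral note (follows from the fixed 296 K reading): an Over Temperature
 * Threshold at or below 296, or an Under Temperature Threshold at or above
 * 296, immediately latches the temperature critical warning and, if enabled
 * in the Asynchronous Event Configuration, posts a SMART AEN.
 */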
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define	NVME_FEATURES_SEL_SUPPORTED	0x3
#define	NVME_FEATURES_NS_SPECIFIC	(1 << 1)
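/*
 * Get Features SEL decode, for reference (spec-defined values): 0h current,
 * 1h default, 2h saved, 3h supported capabilities; only 3h is treated
 * specially below.
 */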
static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
    struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
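/*
 * Note on the Abort completion above: returning CDW0 = 1 indicates that the
 * specified command was not aborted, which matches the TODO (no abort is
 * actually attempted).
 */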
sc->ctrldata.aerl, command->cid); 1988 1989 /* Don't exceed the Async Event Request Limit (AERL). */ 1990 if (pci_nvme_aer_limit_reached(sc)) { 1991 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1992 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1993 return (1); 1994 } 1995 1996 if (pci_nvme_aer_add(sc, command->cid)) { 1997 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1998 NVME_SC_INTERNAL_DEVICE_ERROR); 1999 return (1); 2000 } 2001 2002 /* 2003 * Raise events when they happen based on the Set Features cmd. 2004 * These events happen async, so only set completion successful if 2005 * there is an event reflective of the request to get event. 2006 */ 2007 compl->status = NVME_NO_STATUS; 2008 pci_nvme_aen_notify(sc); 2009 2010 return (0); 2011 } 2012 2013 static void 2014 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2015 { 2016 struct nvme_completion compl; 2017 struct nvme_command *cmd; 2018 struct nvme_submission_queue *sq; 2019 struct nvme_completion_queue *cq; 2020 uint16_t sqhead; 2021 2022 DPRINTF("%s index %u", __func__, (uint32_t)value); 2023 2024 sq = &sc->submit_queues[0]; 2025 cq = &sc->compl_queues[0]; 2026 2027 pthread_mutex_lock(&sq->mtx); 2028 2029 sqhead = sq->head; 2030 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2031 2032 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2033 cmd = &(sq->qbase)[sqhead]; 2034 compl.cdw0 = 0; 2035 compl.status = 0; 2036 2037 switch (cmd->opc) { 2038 case NVME_OPC_DELETE_IO_SQ: 2039 DPRINTF("%s command DELETE_IO_SQ", __func__); 2040 nvme_opc_delete_io_sq(sc, cmd, &compl); 2041 break; 2042 case NVME_OPC_CREATE_IO_SQ: 2043 DPRINTF("%s command CREATE_IO_SQ", __func__); 2044 nvme_opc_create_io_sq(sc, cmd, &compl); 2045 break; 2046 case NVME_OPC_DELETE_IO_CQ: 2047 DPRINTF("%s command DELETE_IO_CQ", __func__); 2048 nvme_opc_delete_io_cq(sc, cmd, &compl); 2049 break; 2050 case NVME_OPC_CREATE_IO_CQ: 2051 DPRINTF("%s command CREATE_IO_CQ", __func__); 2052 nvme_opc_create_io_cq(sc, cmd, &compl); 2053 break; 2054 case NVME_OPC_GET_LOG_PAGE: 2055 DPRINTF("%s command GET_LOG_PAGE", __func__); 2056 nvme_opc_get_log_page(sc, cmd, &compl); 2057 break; 2058 case NVME_OPC_IDENTIFY: 2059 DPRINTF("%s command IDENTIFY", __func__); 2060 nvme_opc_identify(sc, cmd, &compl); 2061 break; 2062 case NVME_OPC_ABORT: 2063 DPRINTF("%s command ABORT", __func__); 2064 nvme_opc_abort(sc, cmd, &compl); 2065 break; 2066 case NVME_OPC_SET_FEATURES: 2067 DPRINTF("%s command SET_FEATURES", __func__); 2068 nvme_opc_set_features(sc, cmd, &compl); 2069 break; 2070 case NVME_OPC_GET_FEATURES: 2071 DPRINTF("%s command GET_FEATURES", __func__); 2072 nvme_opc_get_features(sc, cmd, &compl); 2073 break; 2074 case NVME_OPC_FIRMWARE_ACTIVATE: 2075 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2076 pci_nvme_status_tc(&compl.status, 2077 NVME_SCT_COMMAND_SPECIFIC, 2078 NVME_SC_INVALID_FIRMWARE_SLOT); 2079 break; 2080 case NVME_OPC_ASYNC_EVENT_REQUEST: 2081 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2082 nvme_opc_async_event_req(sc, cmd, &compl); 2083 break; 2084 case NVME_OPC_FORMAT_NVM: 2085 DPRINTF("%s command FORMAT_NVM", __func__); 2086 if ((sc->ctrldata.oacs & 2087 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2088 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2089 break; 2090 } 2091 nvme_opc_format_nvm(sc, cmd, &compl); 2092 break; 2093 case NVME_OPC_SECURITY_SEND: 2094 case NVME_OPC_SECURITY_RECEIVE: 2095 case NVME_OPC_SANITIZE: 2096 case NVME_OPC_GET_LBA_STATUS: 2097 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2098 cmd->opc); 2099 /* Valid but unsupported opcodes */ 2100 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2101 break; 2102 default: 2103 DPRINTF("%s command OPC=%#X (not implemented)", 2104 __func__, 2105 cmd->opc); 2106 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2107 } 2108 sqhead = (sqhead + 1) % sq->size; 2109 2110 if (NVME_COMPLETION_VALID(compl)) { 2111 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2112 compl.cdw0, 2113 cmd->cid, 2114 0, /* SQID */ 2115 compl.status); 2116 } 2117 } 2118 2119 DPRINTF("setting sqhead %u", sqhead); 2120 sq->head = sqhead; 2121 2122 if (cq->head != cq->tail) 2123 pci_generate_msix(sc->nsc_pi, 0); 2124 2125 pthread_mutex_unlock(&sq->mtx); 2126 } 2127 2128 /* 2129 * Update the Write and Read statistics reported in SMART data 2130 * 2131 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2132 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2133 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2134 */ 2135 static void 2136 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2137 size_t bytes, uint16_t status) 2138 { 2139 2140 pthread_mutex_lock(&sc->mtx); 2141 switch (opc) { 2142 case NVME_OPC_WRITE: 2143 sc->write_commands++; 2144 if (status != NVME_SC_SUCCESS) 2145 break; 2146 sc->write_dunits_remainder += (bytes / 512); 2147 while (sc->write_dunits_remainder >= 1000) { 2148 sc->write_data_units++; 2149 sc->write_dunits_remainder -= 1000; 2150 } 2151 break; 2152 case NVME_OPC_READ: 2153 sc->read_commands++; 2154 if (status != NVME_SC_SUCCESS) 2155 break; 2156 sc->read_dunits_remainder += (bytes / 512); 2157 while (sc->read_dunits_remainder >= 1000) { 2158 sc->read_data_units++; 2159 sc->read_dunits_remainder -= 1000; 2160 } 2161 break; 2162 default: 2163 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2164 break; 2165 } 2166 pthread_mutex_unlock(&sc->mtx); 2167 } 2168 2169 /* 2170 * Check if the combination of Starting LBA (slba) and number of blocks 2171 * exceeds the range of the underlying storage. 2172 * 2173 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2174 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2175 * overflow. 2176 */ 2177 static bool 2178 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2179 uint32_t nblocks) 2180 { 2181 size_t offset, bytes; 2182 2183 /* Overflow check of multiplying Starting LBA by the sector size */ 2184 if (slba >> (64 - nvstore->sectsz_bits)) 2185 return (true); 2186 2187 offset = slba << nvstore->sectsz_bits; 2188 bytes = nblocks << nvstore->sectsz_bits; 2189 2190 /* Overflow check of Number of Logical Blocks */ 2191 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2192 return (true); 2193 2194 return (false); 2195 } 2196 2197 static int 2198 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2199 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2200 { 2201 int iovidx; 2202 bool range_is_contiguous; 2203 2204 if (req == NULL) 2205 return (-1); 2206 2207 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2208 return (-1); 2209 } 2210 2211 /* 2212 * Minimize the number of IOVs by concatenating contiguous address 2213 * ranges. If the IOV count is zero, there is no previous range to 2214 * concatenate. 
2215 */ 2216 if (req->io_req.br_iovcnt == 0) 2217 range_is_contiguous = false; 2218 else 2219 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2220 2221 if (range_is_contiguous) { 2222 iovidx = req->io_req.br_iovcnt - 1; 2223 2224 req->io_req.br_iov[iovidx].iov_base = 2225 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2226 req->prev_gpaddr, size); 2227 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2228 return (-1); 2229 2230 req->prev_size += size; 2231 req->io_req.br_resid += size; 2232 2233 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2234 } else { 2235 iovidx = req->io_req.br_iovcnt; 2236 if (iovidx == 0) { 2237 req->io_req.br_offset = offset; 2238 req->io_req.br_resid = 0; 2239 req->io_req.br_param = req; 2240 } 2241 2242 req->io_req.br_iov[iovidx].iov_base = 2243 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2244 gpaddr, size); 2245 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2246 return (-1); 2247 2248 req->io_req.br_iov[iovidx].iov_len = size; 2249 2250 req->prev_gpaddr = gpaddr; 2251 req->prev_size = size; 2252 req->io_req.br_resid += size; 2253 2254 req->io_req.br_iovcnt++; 2255 } 2256 2257 return (0); 2258 } 2259 2260 static void 2261 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2262 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2263 { 2264 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2265 2266 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2267 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2268 NVME_STATUS_GET_SC(status)); 2269 2270 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2271 2272 if (cq->head != cq->tail) { 2273 if (cq->intr_en & NVME_CQ_INTEN) { 2274 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2275 } else { 2276 DPRINTF("%s: CQ%u interrupt disabled", 2277 __func__, sq->cqid); 2278 } 2279 } 2280 } 2281 2282 static void 2283 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2284 { 2285 req->sc = NULL; 2286 req->nvme_sq = NULL; 2287 req->sqid = 0; 2288 2289 pthread_mutex_lock(&sc->mtx); 2290 2291 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2292 sc->pending_ios--; 2293 2294 /* when no more IO pending, can set to ready if device reset/enabled */ 2295 if (sc->pending_ios == 0 && 2296 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2297 sc->regs.csts |= NVME_CSTS_RDY; 2298 2299 pthread_mutex_unlock(&sc->mtx); 2300 2301 sem_post(&sc->iosemlock); 2302 } 2303 2304 static struct pci_nvme_ioreq * 2305 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2306 { 2307 struct pci_nvme_ioreq *req = NULL; 2308 2309 sem_wait(&sc->iosemlock); 2310 pthread_mutex_lock(&sc->mtx); 2311 2312 req = STAILQ_FIRST(&sc->ioreqs_free); 2313 assert(req != NULL); 2314 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2315 2316 req->sc = sc; 2317 2318 sc->pending_ios++; 2319 2320 pthread_mutex_unlock(&sc->mtx); 2321 2322 req->io_req.br_iovcnt = 0; 2323 req->io_req.br_offset = 0; 2324 req->io_req.br_resid = 0; 2325 req->io_req.br_param = req; 2326 req->prev_gpaddr = 0; 2327 req->prev_size = 0; 2328 2329 return req; 2330 } 2331 2332 static void 2333 pci_nvme_io_done(struct blockif_req *br, int err) 2334 { 2335 struct pci_nvme_ioreq *req = br->br_param; 2336 struct nvme_submission_queue *sq = req->nvme_sq; 2337 uint16_t code, status; 2338 2339 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2340 2341 /* TODO return correct error */ 2342 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2343 status = 0; 2344 pci_nvme_status_genc(&status, code); 2345 2346 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2347 pci_nvme_stats_write_read_update(req->sc, req->opc, 2348 req->bytes, status); 2349 pci_nvme_release_ioreq(req->sc, req); 2350 } 2351 2352 /* 2353 * Implements the Flush command. The specification states: 2354 * If a volatile write cache is not present, Flush commands complete 2355 * successfully and have no effect 2356 * in the description of the Volatile Write Cache (VWC) field of the Identify 2357 * Controller data. Therefore, set status to Success if the command is 2358 * not supported (i.e. RAM or as indicated by the blockif). 2359 */ 2360 static bool 2361 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2362 struct nvme_command *cmd __unused, 2363 struct pci_nvme_blockstore *nvstore, 2364 struct pci_nvme_ioreq *req, 2365 uint16_t *status) 2366 { 2367 bool pending = false; 2368 2369 if (nvstore->type == NVME_STOR_RAM) { 2370 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2371 } else { 2372 int err; 2373 2374 req->io_req.br_callback = pci_nvme_io_done; 2375 2376 err = blockif_flush(nvstore->ctx, &req->io_req); 2377 switch (err) { 2378 case 0: 2379 pending = true; 2380 break; 2381 case EOPNOTSUPP: 2382 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2383 break; 2384 default: 2385 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2386 } 2387 } 2388 2389 return (pending); 2390 } 2391 2392 static uint16_t 2393 nvme_write_read_ram(struct pci_nvme_softc *sc, 2394 struct pci_nvme_blockstore *nvstore, 2395 uint64_t prp1, uint64_t prp2, 2396 size_t offset, uint64_t bytes, 2397 bool is_write) 2398 { 2399 uint8_t *buf = nvstore->ctx; 2400 enum nvme_copy_dir dir; 2401 uint16_t status; 2402 2403 if (is_write) 2404 dir = NVME_COPY_TO_PRP; 2405 else 2406 dir = NVME_COPY_FROM_PRP; 2407 2408 status = 0; 2409 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2410 buf + offset, bytes, dir)) 2411 pci_nvme_status_genc(&status, 2412 NVME_SC_DATA_TRANSFER_ERROR); 2413 else 2414 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2415 2416 return (status); 2417 } 2418 2419 static uint16_t 2420 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2421 struct pci_nvme_blockstore *nvstore, 2422 struct pci_nvme_ioreq *req, 2423 uint64_t prp1, uint64_t prp2, 2424 size_t offset, uint64_t bytes, 2425 bool is_write) 2426 { 2427 uint64_t size; 2428 int err; 2429 uint16_t status = NVME_NO_STATUS; 2430 2431 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2432 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2433 err = -1; 2434 goto out; 2435 } 2436 2437 offset += size; 2438 bytes -= size; 2439 2440 if (bytes == 0) { 2441 ; 2442 } else if (bytes <= PAGE_SIZE) { 2443 size = bytes; 2444 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2445 err = -1; 2446 goto out; 2447 } 2448 } else { 2449 void *vmctx = sc->nsc_pi->pi_vmctx; 2450 uint64_t *prp_list = &prp2; 2451 uint64_t *last = prp_list; 2452 2453 /* PRP2 is pointer to a physical region page list */ 2454 while (bytes) { 2455 /* Last entry in list points to the next list */ 2456 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2457 uint64_t prp = *prp_list; 2458 2459 prp_list = paddr_guest2host(vmctx, prp, 2460 PAGE_SIZE - (prp % PAGE_SIZE)); 2461 if (prp_list == NULL) { 2462 err = -1; 2463 goto out; 2464 } 2465 last = prp_list + (NVME_PRP2_ITEMS - 1); 2466 } 2467 2468 size = MIN(bytes, PAGE_SIZE); 2469 2470 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2471 offset)) { 2472 err = -1; 2473 goto out; 2474 } 2475 2476 offset += size; 2477 bytes -= size; 2478 2479 prp_list++; 2480 } 2481 } 2482 req->io_req.br_callback = pci_nvme_io_done; 2483 if (is_write) 2484 err = blockif_write(nvstore->ctx, &req->io_req); 2485 else 2486 err = blockif_read(nvstore->ctx, &req->io_req); 2487 out: 2488 if (err) 2489 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2490 2491 return (status); 2492 } 2493 2494 static bool 2495 nvme_opc_write_read(struct pci_nvme_softc *sc, 2496 struct nvme_command *cmd, 2497 struct pci_nvme_blockstore *nvstore, 2498 struct pci_nvme_ioreq *req, 2499 uint16_t *status) 2500 { 2501 uint64_t lba, nblocks, bytes; 2502 size_t offset; 2503 bool is_write = cmd->opc == NVME_OPC_WRITE; 2504 bool pending = false; 2505 2506 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2507 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2508 bytes = nblocks << nvstore->sectsz_bits; 2509 if (bytes > NVME_MAX_DATA_SIZE) { 2510 WPRINTF("%s command would exceed MDTS", __func__); 2511 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2512 goto out; 2513 } 2514 2515 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2516 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2517 __func__, lba, nblocks); 2518 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2519 goto out; 2520 } 2521 2522 offset = lba << nvstore->sectsz_bits; 2523 2524 req->bytes = bytes; 2525 req->io_req.br_offset = lba; 2526 2527 /* PRP bits 1:0 must be zero */ 2528 cmd->prp1 &= ~0x3UL; 2529 cmd->prp2 &= ~0x3UL; 2530 2531 if (nvstore->type == NVME_STOR_RAM) { 2532 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2533 cmd->prp2, offset, bytes, is_write); 2534 } else { 2535 *status = nvme_write_read_blockif(sc, nvstore, req, 2536 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2537 2538 if (*status == NVME_NO_STATUS) 2539 pending = true; 2540 } 2541 out: 2542 if (!pending) 2543 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2544 2545 return (pending); 2546 } 2547 2548 static void 2549 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2550 { 2551 struct pci_nvme_ioreq *req = br->br_param; 2552 struct pci_nvme_softc *sc = req->sc; 2553 bool done = true; 2554 uint16_t status; 2555 2556 status = 0; 2557 if (err) { 2558 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2559 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2560 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2561 } else { 2562 struct iovec *iov = req->io_req.br_iov; 2563 2564 req->prev_gpaddr++; 2565 iov += req->prev_gpaddr; 2566 2567 /* The iov_* values already include the sector size */ 2568 req->io_req.br_offset = (off_t)iov->iov_base; 2569 req->io_req.br_resid = iov->iov_len; 2570 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2571 pci_nvme_status_genc(&status, 2572 NVME_SC_INTERNAL_DEVICE_ERROR); 2573 } else 2574 done = false; 2575 } 2576 2577 if (done) { 2578 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2579 status); 2580 pci_nvme_release_ioreq(sc, req); 2581 } 2582 } 2583 2584 static bool 2585 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2586 struct nvme_command *cmd, 2587 struct pci_nvme_blockstore *nvstore, 2588 struct pci_nvme_ioreq *req, 2589 uint16_t *status) 2590 { 2591 struct nvme_dsm_range *range = NULL; 2592 uint32_t nr, r, non_zero, dr; 2593 int err; 2594 bool pending = false; 2595 2596 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2597 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2598 goto out; 2599 } 
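	/*
	 * Dataset Management layout: CDW10 bits 7:0 carry the zero-based
	 * Number of Ranges, the range descriptors are fetched from guest
	 * memory via PRP1/PRP2, and CDW11 attribute bits select the action;
	 * only the Deallocate attribute is acted on below.
	 */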
2600 2601 nr = cmd->cdw10 & 0xff; 2602 2603 /* copy locally because a range entry could straddle PRPs */ 2604 range = calloc(1, NVME_MAX_DSM_TRIM); 2605 if (range == NULL) { 2606 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2607 goto out; 2608 } 2609 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2610 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2611 2612 /* Check for invalid ranges and the number of non-zero lengths */ 2613 non_zero = 0; 2614 for (r = 0; r <= nr; r++) { 2615 if (pci_nvme_out_of_range(nvstore, 2616 range[r].starting_lba, range[r].length)) { 2617 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2618 goto out; 2619 } 2620 if (range[r].length != 0) 2621 non_zero++; 2622 } 2623 2624 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2625 size_t offset, bytes; 2626 int sectsz_bits = sc->nvstore.sectsz_bits; 2627 2628 /* 2629 * DSM calls are advisory only, and compliant controllers 2630 * may choose to take no actions (i.e. return Success). 2631 */ 2632 if (!nvstore->deallocate) { 2633 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2634 goto out; 2635 } 2636 2637 /* If all ranges have a zero length, return Success */ 2638 if (non_zero == 0) { 2639 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2640 goto out; 2641 } 2642 2643 if (req == NULL) { 2644 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2645 goto out; 2646 } 2647 2648 offset = range[0].starting_lba << sectsz_bits; 2649 bytes = range[0].length << sectsz_bits; 2650 2651 /* 2652 * If the request is for more than a single range, store 2653 * the ranges in the br_iov. Optimize for the common case 2654 * of a single range. 2655 * 2656 * Note that NVMe Number of Ranges is a zero based value 2657 */ 2658 req->io_req.br_iovcnt = 0; 2659 req->io_req.br_offset = offset; 2660 req->io_req.br_resid = bytes; 2661 2662 if (nr == 0) { 2663 req->io_req.br_callback = pci_nvme_io_done; 2664 } else { 2665 struct iovec *iov = req->io_req.br_iov; 2666 2667 for (r = 0, dr = 0; r <= nr; r++) { 2668 offset = range[r].starting_lba << sectsz_bits; 2669 bytes = range[r].length << sectsz_bits; 2670 if (bytes == 0) 2671 continue; 2672 2673 if ((nvstore->size - offset) < bytes) { 2674 pci_nvme_status_genc(status, 2675 NVME_SC_LBA_OUT_OF_RANGE); 2676 goto out; 2677 } 2678 iov[dr].iov_base = (void *)offset; 2679 iov[dr].iov_len = bytes; 2680 dr++; 2681 } 2682 req->io_req.br_callback = pci_nvme_dealloc_sm; 2683 2684 /* 2685 * Use prev_gpaddr to track the current entry and 2686 * prev_size to track the number of entries 2687 */ 2688 req->prev_gpaddr = 0; 2689 req->prev_size = dr; 2690 } 2691 2692 err = blockif_delete(nvstore->ctx, &req->io_req); 2693 if (err) 2694 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2695 else 2696 pending = true; 2697 } 2698 out: 2699 free(range); 2700 return (pending); 2701 } 2702 2703 static void 2704 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2705 { 2706 struct nvme_submission_queue *sq; 2707 uint16_t status; 2708 uint16_t sqhead; 2709 2710 /* handle all submissions up to sq->tail index */ 2711 sq = &sc->submit_queues[idx]; 2712 2713 pthread_mutex_lock(&sq->mtx); 2714 2715 sqhead = sq->head; 2716 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2717 idx, sqhead, sq->tail, sq->qbase); 2718 2719 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2720 struct nvme_command *cmd; 2721 struct pci_nvme_ioreq *req; 2722 uint32_t nsid; 2723 bool pending; 2724 2725 pending = false; 2726 req = NULL; 2727 status = 0; 2728 2729 cmd = 
&sq->qbase[sqhead]; 2730 sqhead = (sqhead + 1) % sq->size; 2731 2732 nsid = le32toh(cmd->nsid); 2733 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2734 pci_nvme_status_genc(&status, 2735 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2736 status |= 2737 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2738 goto complete; 2739 } 2740 2741 req = pci_nvme_get_ioreq(sc); 2742 if (req == NULL) { 2743 pci_nvme_status_genc(&status, 2744 NVME_SC_INTERNAL_DEVICE_ERROR); 2745 WPRINTF("%s: unable to allocate IO req", __func__); 2746 goto complete; 2747 } 2748 req->nvme_sq = sq; 2749 req->sqid = idx; 2750 req->opc = cmd->opc; 2751 req->cid = cmd->cid; 2752 req->nsid = cmd->nsid; 2753 2754 switch (cmd->opc) { 2755 case NVME_OPC_FLUSH: 2756 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2757 req, &status); 2758 break; 2759 case NVME_OPC_WRITE: 2760 case NVME_OPC_READ: 2761 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2762 req, &status); 2763 break; 2764 case NVME_OPC_WRITE_ZEROES: 2765 /* TODO: write zeroes 2766 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2767 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2768 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2769 break; 2770 case NVME_OPC_DATASET_MANAGEMENT: 2771 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2772 req, &status); 2773 break; 2774 default: 2775 WPRINTF("%s unhandled io command 0x%x", 2776 __func__, cmd->opc); 2777 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2778 } 2779 complete: 2780 if (!pending) { 2781 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2782 if (req != NULL) 2783 pci_nvme_release_ioreq(sc, req); 2784 } 2785 } 2786 2787 sq->head = sqhead; 2788 2789 pthread_mutex_unlock(&sq->mtx); 2790 } 2791 2792 static void 2793 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2794 uint64_t idx, int is_sq, uint64_t value) 2795 { 2796 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2797 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2798 2799 if (is_sq) { 2800 if (idx > sc->num_squeues) { 2801 WPRINTF("%s queue index %lu overflow from " 2802 "guest (max %u)", 2803 __func__, idx, sc->num_squeues); 2804 return; 2805 } 2806 2807 atomic_store_short(&sc->submit_queues[idx].tail, 2808 (uint16_t)value); 2809 2810 if (idx == 0) { 2811 pci_nvme_handle_admin_cmd(sc, value); 2812 } else { 2813 /* submission queue; handle new entries in SQ */ 2814 if (idx > sc->num_squeues) { 2815 WPRINTF("%s SQ index %lu overflow from " 2816 "guest (max %u)", 2817 __func__, idx, sc->num_squeues); 2818 return; 2819 } 2820 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2821 } 2822 } else { 2823 if (idx > sc->num_cqueues) { 2824 WPRINTF("%s queue index %lu overflow from " 2825 "guest (max %u)", 2826 __func__, idx, sc->num_cqueues); 2827 return; 2828 } 2829 2830 atomic_store_short(&sc->compl_queues[idx].head, 2831 (uint16_t)value); 2832 } 2833 } 2834 2835 static void 2836 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2837 { 2838 const char *s = iswrite ? 
"WRITE" : "READ"; 2839 2840 switch (offset) { 2841 case NVME_CR_CAP_LOW: 2842 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2843 break; 2844 case NVME_CR_CAP_HI: 2845 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2846 break; 2847 case NVME_CR_VS: 2848 DPRINTF("%s %s NVME_CR_VS", func, s); 2849 break; 2850 case NVME_CR_INTMS: 2851 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2852 break; 2853 case NVME_CR_INTMC: 2854 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2855 break; 2856 case NVME_CR_CC: 2857 DPRINTF("%s %s NVME_CR_CC", func, s); 2858 break; 2859 case NVME_CR_CSTS: 2860 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2861 break; 2862 case NVME_CR_NSSR: 2863 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2864 break; 2865 case NVME_CR_AQA: 2866 DPRINTF("%s %s NVME_CR_AQA", func, s); 2867 break; 2868 case NVME_CR_ASQ_LOW: 2869 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2870 break; 2871 case NVME_CR_ASQ_HI: 2872 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2873 break; 2874 case NVME_CR_ACQ_LOW: 2875 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2876 break; 2877 case NVME_CR_ACQ_HI: 2878 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2879 break; 2880 default: 2881 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2882 } 2883 2884 } 2885 2886 static void 2887 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2888 uint64_t value) 2889 { 2890 uint32_t ccreg; 2891 2892 if (offset >= NVME_DOORBELL_OFFSET) { 2893 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2894 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2895 int is_sq = (belloffset % 8) < 4; 2896 2897 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2898 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2899 offset); 2900 return; 2901 } 2902 2903 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2904 WPRINTF("guest attempted an overflow write offset " 2905 "0x%lx, val 0x%lx in %s", 2906 offset, value, __func__); 2907 return; 2908 } 2909 2910 if (is_sq) { 2911 if (sc->submit_queues[idx].qbase == NULL) 2912 return; 2913 } else if (sc->compl_queues[idx].qbase == NULL) 2914 return; 2915 2916 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2917 return; 2918 } 2919 2920 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2921 offset, size, value); 2922 2923 if (size != 4) { 2924 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2925 "val 0x%lx) to bar0 in %s", 2926 size, offset, value, __func__); 2927 /* TODO: shutdown device */ 2928 return; 2929 } 2930 2931 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2932 2933 pthread_mutex_lock(&sc->mtx); 2934 2935 switch (offset) { 2936 case NVME_CR_CAP_LOW: 2937 case NVME_CR_CAP_HI: 2938 /* readonly */ 2939 break; 2940 case NVME_CR_VS: 2941 /* readonly */ 2942 break; 2943 case NVME_CR_INTMS: 2944 /* MSI-X, so ignore */ 2945 break; 2946 case NVME_CR_INTMC: 2947 /* MSI-X, so ignore */ 2948 break; 2949 case NVME_CR_CC: 2950 ccreg = (uint32_t)value; 2951 2952 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2953 "iocqes %u", 2954 __func__, 2955 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2956 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2957 NVME_CC_GET_IOCQES(ccreg)); 2958 2959 if (NVME_CC_GET_SHN(ccreg)) { 2960 /* perform shutdown - flush out data to backend */ 2961 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2962 NVME_CSTS_REG_SHST_SHIFT); 2963 sc->regs.csts |= NVME_SHST_COMPLETE << 2964 NVME_CSTS_REG_SHST_SHIFT; 2965 } 2966 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2967 if (NVME_CC_GET_EN(ccreg) == 0) 2968 /* transition 1-> causes controller reset */ 2969 
pci_nvme_reset_locked(sc); 2970 else 2971 pci_nvme_init_controller(sc); 2972 } 2973 2974 /* Insert the iocqes, iosqes and en bits from the write */ 2975 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2976 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2977 if (NVME_CC_GET_EN(ccreg) == 0) { 2978 /* Insert the ams, mps and css bit fields */ 2979 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2980 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2981 sc->regs.csts &= ~NVME_CSTS_RDY; 2982 } else if ((sc->pending_ios == 0) && 2983 !(sc->regs.csts & NVME_CSTS_CFS)) { 2984 sc->regs.csts |= NVME_CSTS_RDY; 2985 } 2986 break; 2987 case NVME_CR_CSTS: 2988 break; 2989 case NVME_CR_NSSR: 2990 /* ignore writes; don't support subsystem reset */ 2991 break; 2992 case NVME_CR_AQA: 2993 sc->regs.aqa = (uint32_t)value; 2994 break; 2995 case NVME_CR_ASQ_LOW: 2996 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2997 (0xFFFFF000 & value); 2998 break; 2999 case NVME_CR_ASQ_HI: 3000 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3001 (value << 32); 3002 break; 3003 case NVME_CR_ACQ_LOW: 3004 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3005 (0xFFFFF000 & value); 3006 break; 3007 case NVME_CR_ACQ_HI: 3008 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3009 (value << 32); 3010 break; 3011 default: 3012 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3013 __func__, offset, value, size); 3014 } 3015 pthread_mutex_unlock(&sc->mtx); 3016 } 3017 3018 static void 3019 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3020 uint64_t value) 3021 { 3022 struct pci_nvme_softc* sc = pi->pi_arg; 3023 3024 if (baridx == pci_msix_table_bar(pi) || 3025 baridx == pci_msix_pba_bar(pi)) { 3026 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3027 " value 0x%lx", baridx, offset, size, value); 3028 3029 pci_emul_msix_twrite(pi, offset, size, value); 3030 return; 3031 } 3032 3033 switch (baridx) { 3034 case 0: 3035 pci_nvme_write_bar_0(sc, offset, size, value); 3036 break; 3037 3038 default: 3039 DPRINTF("%s unknown baridx %d, val 0x%lx", 3040 __func__, baridx, value); 3041 } 3042 } 3043 3044 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3045 uint64_t offset, int size) 3046 { 3047 uint64_t value; 3048 3049 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3050 3051 if (offset < NVME_DOORBELL_OFFSET) { 3052 void *p = &(sc->regs); 3053 pthread_mutex_lock(&sc->mtx); 3054 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3055 pthread_mutex_unlock(&sc->mtx); 3056 } else { 3057 value = 0; 3058 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3059 } 3060 3061 switch (size) { 3062 case 1: 3063 value &= 0xFF; 3064 break; 3065 case 2: 3066 value &= 0xFFFF; 3067 break; 3068 case 4: 3069 value &= 0xFFFFFFFF; 3070 break; 3071 } 3072 3073 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3074 offset, size, (uint32_t)value); 3075 3076 return (value); 3077 } 3078 3079 3080 3081 static uint64_t 3082 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3083 { 3084 struct pci_nvme_softc* sc = pi->pi_arg; 3085 3086 if (baridx == pci_msix_table_bar(pi) || 3087 baridx == pci_msix_pba_bar(pi)) { 3088 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3089 baridx, offset, size); 3090 3091 return pci_emul_msix_tread(pi, offset, size); 3092 } 3093 3094 switch (baridx) { 3095 case 0: 3096 return pci_nvme_read_bar_0(sc, offset, size); 3097 3098 default: 3099 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3100 } 3101 3102 return (0); 3103 } 3104 
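/*
 * Configuration is parsed from the device's nvlist: unspecified values fall
 * back to the compile-time defaults, the backing store is either an anonymous
 * RAM disk (the "ram" option, sized in MiB) or a blockif-backed file or
 * device, and the sector size must be 512, 4096, or 8192 bytes (otherwise the
 * blockif sector size is used).
 */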
3105 static int 3106 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3107 { 3108 char bident[sizeof("XXX:XXX")]; 3109 const char *value; 3110 uint32_t sectsz; 3111 3112 sc->max_queues = NVME_QUEUES; 3113 sc->max_qentries = NVME_MAX_QENTRIES; 3114 sc->ioslots = NVME_IOSLOTS; 3115 sc->num_squeues = sc->max_queues; 3116 sc->num_cqueues = sc->max_queues; 3117 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3118 sectsz = 0; 3119 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3120 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3121 3122 value = get_config_value_node(nvl, "maxq"); 3123 if (value != NULL) 3124 sc->max_queues = atoi(value); 3125 value = get_config_value_node(nvl, "qsz"); 3126 if (value != NULL) { 3127 sc->max_qentries = atoi(value); 3128 if (sc->max_qentries <= 0) { 3129 EPRINTLN("nvme: Invalid qsz option %d", 3130 sc->max_qentries); 3131 return (-1); 3132 } 3133 } 3134 value = get_config_value_node(nvl, "ioslots"); 3135 if (value != NULL) { 3136 sc->ioslots = atoi(value); 3137 if (sc->ioslots <= 0) { 3138 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3139 return (-1); 3140 } 3141 } 3142 value = get_config_value_node(nvl, "sectsz"); 3143 if (value != NULL) 3144 sectsz = atoi(value); 3145 value = get_config_value_node(nvl, "ser"); 3146 if (value != NULL) { 3147 /* 3148 * This field indicates the Product Serial Number in 3149 * 7-bit ASCII, unused bytes should be space characters. 3150 * Ref: NVMe v1.3c. 3151 */ 3152 cpywithpad((char *)sc->ctrldata.sn, 3153 sizeof(sc->ctrldata.sn), value, ' '); 3154 } 3155 value = get_config_value_node(nvl, "eui64"); 3156 if (value != NULL) 3157 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3158 value = get_config_value_node(nvl, "dsm"); 3159 if (value != NULL) { 3160 if (strcmp(value, "auto") == 0) 3161 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3162 else if (strcmp(value, "enable") == 0) 3163 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3164 else if (strcmp(value, "disable") == 0) 3165 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3166 } 3167 3168 value = get_config_value_node(nvl, "bootindex"); 3169 if (value != NULL) { 3170 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3171 EPRINTLN("Invalid bootindex %d", atoi(value)); 3172 return (-1); 3173 } 3174 } 3175 3176 value = get_config_value_node(nvl, "ram"); 3177 if (value != NULL) { 3178 uint64_t sz = strtoull(value, NULL, 10); 3179 3180 sc->nvstore.type = NVME_STOR_RAM; 3181 sc->nvstore.size = sz * 1024 * 1024; 3182 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3183 sc->nvstore.sectsz = 4096; 3184 sc->nvstore.sectsz_bits = 12; 3185 if (sc->nvstore.ctx == NULL) { 3186 EPRINTLN("nvme: Unable to allocate RAM"); 3187 return (-1); 3188 } 3189 } else { 3190 snprintf(bident, sizeof(bident), "%u:%u", 3191 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3192 sc->nvstore.ctx = blockif_open(nvl, bident); 3193 if (sc->nvstore.ctx == NULL) { 3194 EPRINTLN("nvme: Could not open backing file: %s", 3195 strerror(errno)); 3196 return (-1); 3197 } 3198 sc->nvstore.type = NVME_STOR_BLOCKIF; 3199 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3200 } 3201 3202 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3203 sc->nvstore.sectsz = sectsz; 3204 else if (sc->nvstore.type != NVME_STOR_RAM) 3205 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3206 for (sc->nvstore.sectsz_bits = 9; 3207 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3208 sc->nvstore.sectsz_bits++); 3209 3210 if (sc->max_queues <= 0 || 
sc->max_queues > NVME_QUEUES) 3211 sc->max_queues = NVME_QUEUES; 3212 3213 return (0); 3214 } 3215 3216 static void 3217 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3218 size_t new_size) 3219 { 3220 struct pci_nvme_softc *sc; 3221 struct pci_nvme_blockstore *nvstore; 3222 struct nvme_namespace_data *nd; 3223 3224 sc = arg; 3225 nvstore = &sc->nvstore; 3226 nd = &sc->nsdata; 3227 3228 nvstore->size = new_size; 3229 pci_nvme_init_nsdata_size(nvstore, nd); 3230 3231 /* Add changed NSID to list */ 3232 sc->ns_log.ns[0] = 1; 3233 sc->ns_log.ns[1] = 0; 3234 3235 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3236 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3237 } 3238 3239 static int 3240 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3241 { 3242 struct pci_nvme_softc *sc; 3243 uint32_t pci_membar_sz; 3244 int error; 3245 3246 error = 0; 3247 3248 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3249 pi->pi_arg = sc; 3250 sc->nsc_pi = pi; 3251 3252 error = pci_nvme_parse_config(sc, nvl); 3253 if (error < 0) 3254 goto done; 3255 else 3256 error = 0; 3257 3258 STAILQ_INIT(&sc->ioreqs_free); 3259 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3260 for (uint32_t i = 0; i < sc->ioslots; i++) { 3261 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3262 } 3263 3264 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3265 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3266 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3267 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3268 pci_set_cfgdata8(pi, PCIR_PROGIF, 3269 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3270 3271 /* 3272 * Allocate size of NVMe registers + doorbell space for all queues. 3273 * 3274 * The specification requires a minimum memory I/O window size of 16K. 3275 * The Windows driver will refuse to start a device with a smaller 3276 * window. 3277 */ 3278 pci_membar_sz = sizeof(struct nvme_registers) + 3279 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3280 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3281 3282 DPRINTF("nvme membar size: %u", pci_membar_sz); 3283 3284 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3285 if (error) { 3286 WPRINTF("%s pci alloc mem bar failed", __func__); 3287 goto done; 3288 } 3289 3290 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3291 if (error) { 3292 WPRINTF("%s pci add msixcap failed", __func__); 3293 goto done; 3294 } 3295 3296 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3297 if (error) { 3298 WPRINTF("%s pci add Express capability failed", __func__); 3299 goto done; 3300 } 3301 3302 pthread_mutex_init(&sc->mtx, NULL); 3303 sem_init(&sc->iosemlock, 0, sc->ioslots); 3304 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3305 3306 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3307 /* 3308 * Controller data depends on Namespace data so initialize Namespace 3309 * data first. 
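	 * The calls below preserve that ordering: namespace data, then
	 * controller data, log pages, and features, followed by the AER/AEN
	 * machinery and a final controller reset.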
3310 */ 3311 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3312 pci_nvme_init_ctrldata(sc); 3313 pci_nvme_init_logpages(sc); 3314 pci_nvme_init_features(sc); 3315 3316 pci_nvme_aer_init(sc); 3317 pci_nvme_aen_init(sc); 3318 3319 pci_nvme_reset(sc); 3320 done: 3321 return (error); 3322 } 3323 3324 static int 3325 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3326 { 3327 char *cp, *ram; 3328 3329 if (opts == NULL) 3330 return (0); 3331 3332 if (strncmp(opts, "ram=", 4) == 0) { 3333 cp = strchr(opts, ','); 3334 if (cp == NULL) { 3335 set_config_value_node(nvl, "ram", opts + 4); 3336 return (0); 3337 } 3338 ram = strndup(opts + 4, cp - opts - 4); 3339 set_config_value_node(nvl, "ram", ram); 3340 free(ram); 3341 return (pci_parse_legacy_config(nvl, cp + 1)); 3342 } else 3343 return (blockif_legacy_config(nvl, opts)); 3344 } 3345 3346 static const struct pci_devemu pci_de_nvme = { 3347 .pe_emu = "nvme", 3348 .pe_init = pci_nvme_init, 3349 .pe_legacy_config = pci_nvme_legacy_config, 3350 .pe_barwrite = pci_nvme_write, 3351 .pe_barread = pci_nvme_read 3352 }; 3353 PCI_EMUL_SET(pci_de_nvme); 3354
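/*
 * PCI_EMUL_SET() registers this emulation under the "nvme" pe_emu name so the
 * generic PCI emulation code can locate it: pe_init handles device creation,
 * while pe_barwrite and pe_barread dispatch guest BAR accesses to
 * pci_nvme_write() and pci_nvme_read() above.
 */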