1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * bhyve PCIe-NVMe device emulation. 32 * 33 * options: 34 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 35 * 36 * accepted devpath: 37 * /dev/blockdev 38 * /path/to/image 39 * ram=size_in_MiB 40 * 41 * maxq = max number of queues 42 * qsz = max elements in each queue 43 * ioslots = max number of concurrent io requests 44 * sectsz = sector size (defaults to blockif sector size) 45 * ser = serial number (20-chars max) 46 * eui64 = IEEE Extended Unique Identifier (8 byte value) 47 * dsm = DataSet Management support. Option is one of auto, enable,disable 48 * 49 */ 50 51 /* TODO: 52 - create async event for smart and log 53 - intr coalesce 54 */ 55 56 #include <sys/cdefs.h> 57 #include <sys/errno.h> 58 #include <sys/types.h> 59 #include <sys/crc16.h> 60 #include <net/ieee_oui.h> 61 62 #include <assert.h> 63 #include <pthread.h> 64 #include <pthread_np.h> 65 #include <semaphore.h> 66 #include <stdbool.h> 67 #include <stddef.h> 68 #include <stdint.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <string.h> 72 73 #include <machine/atomic.h> 74 #include <machine/vmm.h> 75 #include <vmmapi.h> 76 77 #include <dev/nvme/nvme.h> 78 79 #include "bhyverun.h" 80 #include "block_if.h" 81 #include "config.h" 82 #include "debug.h" 83 #include "pci_emul.h" 84 85 86 static int nvme_debug = 0; 87 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 88 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 89 90 /* defaults; can be overridden */ 91 #define NVME_MSIX_BAR 4 92 93 #define NVME_IOSLOTS 8 94 95 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 96 #define NVME_MMIO_SPACE_MIN (1 << 14) 97 98 #define NVME_QUEUES 16 99 #define NVME_MAX_QENTRIES 2048 100 /* Memory Page size Minimum reported in CAP register */ 101 #define NVME_MPSMIN 0 102 /* MPSMIN converted to bytes */ 103 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 104 105 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 106 #define NVME_MDTS 9 107 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 108 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 109 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 110 111 /* This is a synthetic status code to indicate there is no status */ 112 #define NVME_NO_STATUS 0xffff 113 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 114 115 /* Reported temperature in Kelvin (i.e. room temperature) */ 116 #define NVME_TEMPERATURE 296 117 118 /* helpers */ 119 120 /* Convert a zero-based value into a one-based value */ 121 #define ONE_BASED(zero) ((zero) + 1) 122 /* Convert a one-based value into a zero-based value */ 123 #define ZERO_BASED(one) ((one) - 1) 124 125 /* Encode number of SQ's and CQ's for Set/Get Features */ 126 #define NVME_FEATURE_NUM_QUEUES(sc) \ 127 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 128 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16 129 130 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 131 132 enum nvme_controller_register_offsets { 133 NVME_CR_CAP_LOW = 0x00, 134 NVME_CR_CAP_HI = 0x04, 135 NVME_CR_VS = 0x08, 136 NVME_CR_INTMS = 0x0c, 137 NVME_CR_INTMC = 0x10, 138 NVME_CR_CC = 0x14, 139 NVME_CR_CSTS = 0x1c, 140 NVME_CR_NSSR = 0x20, 141 NVME_CR_AQA = 0x24, 142 NVME_CR_ASQ_LOW = 0x28, 143 NVME_CR_ASQ_HI = 0x2c, 144 NVME_CR_ACQ_LOW = 0x30, 145 NVME_CR_ACQ_HI = 0x34, 146 }; 147 148 enum nvme_cmd_cdw11 { 149 NVME_CMD_CDW11_PC = 0x0001, 150 NVME_CMD_CDW11_IEN = 0x0002, 151 NVME_CMD_CDW11_IV = 0xFFFF0000, 152 }; 153 154 enum nvme_copy_dir { 155 NVME_COPY_TO_PRP, 156 NVME_COPY_FROM_PRP, 157 }; 158 159 #define NVME_CQ_INTEN 0x01 160 #define NVME_CQ_INTCOAL 0x02 161 162 struct nvme_completion_queue { 163 struct nvme_completion *qbase; 164 pthread_mutex_t mtx; 165 uint32_t size; 166 uint16_t tail; /* nvme progress */ 167 uint16_t head; /* guest progress */ 168 uint16_t intr_vec; 169 uint32_t intr_en; 170 }; 171 172 struct nvme_submission_queue { 173 struct nvme_command *qbase; 174 pthread_mutex_t mtx; 175 uint32_t size; 176 uint16_t head; /* nvme progress */ 177 uint16_t tail; /* guest progress */ 178 uint16_t cqid; /* completion queue id */ 179 int qpriority; 180 }; 181 182 enum nvme_storage_type { 183 NVME_STOR_BLOCKIF = 0, 184 NVME_STOR_RAM = 1, 185 }; 186 187 struct pci_nvme_blockstore { 188 enum nvme_storage_type type; 189 void *ctx; 190 uint64_t size; 191 uint32_t sectsz; 192 uint32_t sectsz_bits; 193 uint64_t eui64; 194 uint32_t deallocate:1; 195 }; 196 197 /* 198 * Calculate the number of additional page descriptors for guest IO requests 199 * based on the advertised Max Data Transfer (MDTS) and given the number of 200 * default iovec's in a struct blockif_req. 201 */ 202 #define MDTS_PAD_SIZE \ 203 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 204 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 205 0 ) 206 207 struct pci_nvme_ioreq { 208 struct pci_nvme_softc *sc; 209 STAILQ_ENTRY(pci_nvme_ioreq) link; 210 struct nvme_submission_queue *nvme_sq; 211 uint16_t sqid; 212 213 /* command information */ 214 uint16_t opc; 215 uint16_t cid; 216 uint32_t nsid; 217 218 uint64_t prev_gpaddr; 219 size_t prev_size; 220 size_t bytes; 221 222 struct blockif_req io_req; 223 224 struct iovec iovpadding[MDTS_PAD_SIZE]; 225 }; 226 227 enum nvme_dsm_type { 228 /* Dataset Management bit in ONCS reflects backing storage capability */ 229 NVME_DATASET_MANAGEMENT_AUTO, 230 /* Unconditionally set Dataset Management bit in ONCS */ 231 NVME_DATASET_MANAGEMENT_ENABLE, 232 /* Unconditionally clear Dataset Management bit in ONCS */ 233 NVME_DATASET_MANAGEMENT_DISABLE, 234 }; 235 236 struct pci_nvme_softc; 237 struct nvme_feature_obj; 238 239 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 240 struct nvme_feature_obj *, 241 struct nvme_command *, 242 struct nvme_completion *); 243 244 struct nvme_feature_obj { 245 uint32_t cdw11; 246 nvme_feature_cb set; 247 nvme_feature_cb get; 248 bool namespace_specific; 249 }; 250 251 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 252 253 typedef enum { 254 PCI_NVME_AE_TYPE_ERROR = 0, 255 PCI_NVME_AE_TYPE_SMART, 256 PCI_NVME_AE_TYPE_NOTICE, 257 PCI_NVME_AE_TYPE_IO_CMD = 6, 258 PCI_NVME_AE_TYPE_VENDOR = 7, 259 PCI_NVME_AE_TYPE_MAX /* Must be last */ 260 } pci_nvme_async_type; 261 262 /* Asynchronous Event Requests */ 263 struct pci_nvme_aer { 264 STAILQ_ENTRY(pci_nvme_aer) link; 265 uint16_t cid; /* Command ID of the submitted AER */ 266 }; 267 268 /** Asynchronous Event Information - Notice */ 269 typedef enum { 270 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 271 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 272 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 273 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 274 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 275 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 276 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 277 PCI_NVME_AEI_NOTICE_MAX, 278 } pci_nvme_async_event_info_notice; 279 280 #define PCI_NVME_AEI_NOTICE_SHIFT 8 281 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 282 283 /* Asynchronous Event Notifications */ 284 struct pci_nvme_aen { 285 pci_nvme_async_type atype; 286 uint32_t event_data; 287 bool posted; 288 }; 289 290 /* 291 * By default, enable all Asynchrnous Event Notifications: 292 * SMART / Health Critical Warnings 293 * Namespace Attribute Notices 294 */ 295 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 296 297 typedef enum { 298 NVME_CNTRLTYPE_IO = 1, 299 NVME_CNTRLTYPE_DISCOVERY = 2, 300 NVME_CNTRLTYPE_ADMIN = 3, 301 } pci_nvme_cntrl_type; 302 303 struct pci_nvme_softc { 304 struct pci_devinst *nsc_pi; 305 306 pthread_mutex_t mtx; 307 308 struct nvme_registers regs; 309 310 struct nvme_namespace_data nsdata; 311 struct nvme_controller_data ctrldata; 312 struct nvme_error_information_entry err_log; 313 struct nvme_health_information_page health_log; 314 struct nvme_firmware_page fw_log; 315 struct nvme_ns_list ns_log; 316 317 struct pci_nvme_blockstore nvstore; 318 319 uint16_t max_qentries; /* max entries per queue */ 320 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 321 uint32_t num_cqueues; 322 uint32_t num_squeues; 323 bool num_q_is_set; /* Has host set Number of Queues */ 324 325 struct pci_nvme_ioreq *ioreqs; 326 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 327 uint32_t pending_ios; 328 uint32_t ioslots; 329 sem_t 
iosemlock; 330 331 /* 332 * Memory mapped Submission and Completion queues 333 * Each array includes both Admin and IO queues 334 */ 335 struct nvme_completion_queue *compl_queues; 336 struct nvme_submission_queue *submit_queues; 337 338 struct nvme_feature_obj feat[NVME_FID_MAX]; 339 340 enum nvme_dsm_type dataset_management; 341 342 /* Accounting for SMART data */ 343 __uint128_t read_data_units; 344 __uint128_t write_data_units; 345 __uint128_t read_commands; 346 __uint128_t write_commands; 347 uint32_t read_dunits_remainder; 348 uint32_t write_dunits_remainder; 349 350 STAILQ_HEAD(, pci_nvme_aer) aer_list; 351 pthread_mutex_t aer_mtx; 352 uint32_t aer_count; 353 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 354 pthread_t aen_tid; 355 pthread_mutex_t aen_mtx; 356 pthread_cond_t aen_cond; 357 }; 358 359 360 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 361 struct nvme_completion_queue *cq, 362 uint32_t cdw0, 363 uint16_t cid, 364 uint16_t sqid, 365 uint16_t status); 366 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 367 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 368 static void pci_nvme_io_done(struct blockif_req *, int); 369 370 /* Controller Configuration utils */ 371 #define NVME_CC_GET_EN(cc) \ 372 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 373 #define NVME_CC_GET_CSS(cc) \ 374 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 375 #define NVME_CC_GET_SHN(cc) \ 376 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 377 #define NVME_CC_GET_IOSQES(cc) \ 378 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 379 #define NVME_CC_GET_IOCQES(cc) \ 380 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 381 382 #define NVME_CC_WRITE_MASK \ 383 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 384 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 385 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 386 387 #define NVME_CC_NEN_WRITE_MASK \ 388 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 389 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 390 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 391 392 /* Controller Status utils */ 393 #define NVME_CSTS_GET_RDY(sts) \ 394 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 395 396 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 397 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) 398 399 /* Completion Queue status word utils */ 400 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 401 #define NVME_STATUS_MASK \ 402 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 403 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 404 405 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 406 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 407 408 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 409 struct nvme_feature_obj *, 410 struct nvme_command *, 411 struct nvme_completion *); 412 static void nvme_feature_temperature(struct pci_nvme_softc *, 413 struct nvme_feature_obj *, 414 struct nvme_command *, 415 struct nvme_completion *); 416 static void nvme_feature_num_queues(struct pci_nvme_softc *, 417 struct nvme_feature_obj *, 418 struct nvme_command *, 419 struct nvme_completion *); 420 static void nvme_feature_iv_config(struct pci_nvme_softc *, 421 struct nvme_feature_obj *, 422 struct nvme_command *, 423 struct nvme_completion *); 424 static void nvme_feature_async_event(struct pci_nvme_softc *, 425 struct nvme_feature_obj *, 426 struct nvme_command *, 427 struct nvme_completion *); 428 429 
static void *aen_thr(void *arg); 430 431 static __inline void 432 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 433 { 434 size_t len; 435 436 len = strnlen(src, dst_size); 437 memset(dst, pad, dst_size); 438 memcpy(dst, src, len); 439 } 440 441 static __inline void 442 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 443 { 444 445 *status &= ~NVME_STATUS_MASK; 446 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 447 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 448 } 449 450 static __inline void 451 pci_nvme_status_genc(uint16_t *status, uint16_t code) 452 { 453 454 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 455 } 456 457 /* 458 * Initialize the requested number or IO Submission and Completion Queues. 459 * Admin queues are allocated implicitly. 460 */ 461 static void 462 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 463 { 464 uint32_t i; 465 466 /* 467 * Allocate and initialize the Submission Queues 468 */ 469 if (nsq > NVME_QUEUES) { 470 WPRINTF("%s: clamping number of SQ from %u to %u", 471 __func__, nsq, NVME_QUEUES); 472 nsq = NVME_QUEUES; 473 } 474 475 sc->num_squeues = nsq; 476 477 sc->submit_queues = calloc(sc->num_squeues + 1, 478 sizeof(struct nvme_submission_queue)); 479 if (sc->submit_queues == NULL) { 480 WPRINTF("%s: SQ allocation failed", __func__); 481 sc->num_squeues = 0; 482 } else { 483 struct nvme_submission_queue *sq = sc->submit_queues; 484 485 for (i = 0; i < sc->num_squeues + 1; i++) 486 pthread_mutex_init(&sq[i].mtx, NULL); 487 } 488 489 /* 490 * Allocate and initialize the Completion Queues 491 */ 492 if (ncq > NVME_QUEUES) { 493 WPRINTF("%s: clamping number of CQ from %u to %u", 494 __func__, ncq, NVME_QUEUES); 495 ncq = NVME_QUEUES; 496 } 497 498 sc->num_cqueues = ncq; 499 500 sc->compl_queues = calloc(sc->num_cqueues + 1, 501 sizeof(struct nvme_completion_queue)); 502 if (sc->compl_queues == NULL) { 503 WPRINTF("%s: CQ allocation failed", __func__); 504 sc->num_cqueues = 0; 505 } else { 506 struct nvme_completion_queue *cq = sc->compl_queues; 507 508 for (i = 0; i < sc->num_cqueues + 1; i++) 509 pthread_mutex_init(&cq[i].mtx, NULL); 510 } 511 } 512 513 static void 514 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 515 { 516 struct nvme_controller_data *cd = &sc->ctrldata; 517 518 cd->vid = 0xFB5D; 519 cd->ssvid = 0x0000; 520 521 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 522 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 523 524 /* Num of submission commands that we can handle at a time (2^rab) */ 525 cd->rab = 4; 526 527 /* FreeBSD OUI */ 528 cd->ieee[0] = 0xfc; 529 cd->ieee[1] = 0x9c; 530 cd->ieee[2] = 0x58; 531 532 cd->mic = 0; 533 534 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 535 536 cd->ver = NVME_REV(1,4); 537 538 cd->cntrltype = NVME_CNTRLTYPE_IO; 539 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 540 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 541 cd->acl = 2; 542 cd->aerl = 4; 543 544 /* Advertise 1, Read-only firmware slot */ 545 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 546 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 547 cd->lpa = 0; /* TODO: support some simple things like SMART */ 548 cd->elpe = 0; /* max error log page entries */ 549 /* 550 * Report a single power state (zero-based value) 551 * power_state[] values are left as zero to indicate "Not reported" 552 */ 553 cd->npss = 0; 554 555 /* Warning Composite Temperature Threshold */ 556 cd->wctemp = 0x0157; 557 
cd->cctemp = 0x0157; 558 559 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 560 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << 561 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); 562 563 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 564 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 565 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 566 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 567 cd->nn = 1; /* number of namespaces */ 568 569 cd->oncs = 0; 570 switch (sc->dataset_management) { 571 case NVME_DATASET_MANAGEMENT_AUTO: 572 if (sc->nvstore.deallocate) 573 cd->oncs |= NVME_ONCS_DSM; 574 break; 575 case NVME_DATASET_MANAGEMENT_ENABLE: 576 cd->oncs |= NVME_ONCS_DSM; 577 break; 578 default: 579 break; 580 } 581 582 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 583 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 584 585 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 586 } 587 588 static void 589 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 590 struct nvme_namespace_data *nd) 591 { 592 593 /* Get capacity and block size information from backing store */ 594 nd->nsze = nvstore->size / nvstore->sectsz; 595 nd->ncap = nd->nsze; 596 nd->nuse = nd->nsze; 597 } 598 599 static void 600 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 601 struct nvme_namespace_data *nd, uint32_t nsid, 602 struct pci_nvme_blockstore *nvstore) 603 { 604 605 pci_nvme_init_nsdata_size(nvstore, nd); 606 607 if (nvstore->type == NVME_STOR_BLOCKIF) 608 nvstore->deallocate = blockif_candelete(nvstore->ctx); 609 610 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 611 nd->flbas = 0; 612 613 /* Create an EUI-64 if user did not provide one */ 614 if (nvstore->eui64 == 0) { 615 char *data = NULL; 616 uint64_t eui64 = nvstore->eui64; 617 618 asprintf(&data, "%s%u%u%u", get_config_value("name"), 619 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 620 sc->nsc_pi->pi_func); 621 622 if (data != NULL) { 623 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 624 free(data); 625 } 626 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 627 } 628 be64enc(nd->eui64, nvstore->eui64); 629 630 /* LBA data-sz = 2^lbads */ 631 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 632 } 633 634 static void 635 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 636 { 637 __uint128_t power_cycles = 1; 638 639 memset(&sc->err_log, 0, sizeof(sc->err_log)); 640 memset(&sc->health_log, 0, sizeof(sc->health_log)); 641 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 642 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 643 644 /* Set read/write remainder to round up according to spec */ 645 sc->read_dunits_remainder = 999; 646 sc->write_dunits_remainder = 999; 647 648 /* Set nominal Health values checked by implementations */ 649 sc->health_log.temperature = NVME_TEMPERATURE; 650 sc->health_log.available_spare = 100; 651 sc->health_log.available_spare_threshold = 10; 652 653 /* Set Active Firmware Info to slot 1 */ 654 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); 655 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 656 sizeof(sc->fw_log.revision[0])); 657 658 memcpy(&sc->health_log.power_cycles, &power_cycles, 659 sizeof(sc->health_log.power_cycles)); 660 } 661 662 static void 663 pci_nvme_init_features(struct pci_nvme_softc *sc) 664 { 665 enum nvme_feature fid; 666 667 for (fid = 0; fid < NVME_FID_MAX; fid++) { 668 switch (fid) { 669 case NVME_FEAT_ARBITRATION: 670 case NVME_FEAT_POWER_MANAGEMENT: 671 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 672 case 
NVME_FEAT_WRITE_ATOMICITY: 673 /* Mandatory but no special handling required */ 674 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 675 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 676 // this returns a data buffer 677 break; 678 case NVME_FEAT_TEMPERATURE_THRESHOLD: 679 sc->feat[fid].set = nvme_feature_temperature; 680 break; 681 case NVME_FEAT_ERROR_RECOVERY: 682 sc->feat[fid].namespace_specific = true; 683 break; 684 case NVME_FEAT_NUMBER_OF_QUEUES: 685 sc->feat[fid].set = nvme_feature_num_queues; 686 break; 687 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 688 sc->feat[fid].set = nvme_feature_iv_config; 689 break; 690 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 691 sc->feat[fid].set = nvme_feature_async_event; 692 /* Enable all AENs by default */ 693 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 694 break; 695 default: 696 sc->feat[fid].set = nvme_feature_invalid_cb; 697 sc->feat[fid].get = nvme_feature_invalid_cb; 698 } 699 } 700 } 701 702 static void 703 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 704 { 705 706 STAILQ_INIT(&sc->aer_list); 707 sc->aer_count = 0; 708 } 709 710 static void 711 pci_nvme_aer_init(struct pci_nvme_softc *sc) 712 { 713 714 pthread_mutex_init(&sc->aer_mtx, NULL); 715 pci_nvme_aer_reset(sc); 716 } 717 718 static void 719 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 720 { 721 struct pci_nvme_aer *aer = NULL; 722 723 pthread_mutex_lock(&sc->aer_mtx); 724 while (!STAILQ_EMPTY(&sc->aer_list)) { 725 aer = STAILQ_FIRST(&sc->aer_list); 726 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 727 free(aer); 728 } 729 pthread_mutex_unlock(&sc->aer_mtx); 730 731 pci_nvme_aer_reset(sc); 732 } 733 734 static bool 735 pci_nvme_aer_available(struct pci_nvme_softc *sc) 736 { 737 738 return (sc->aer_count != 0); 739 } 740 741 static bool 742 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 743 { 744 struct nvme_controller_data *cd = &sc->ctrldata; 745 746 /* AERL is a zero based value while aer_count is one's based */ 747 return (sc->aer_count == (cd->aerl + 1U)); 748 } 749 750 /* 751 * Add an Async Event Request 752 * 753 * Stores an AER to be returned later if the Controller needs to notify the 754 * host of an event. 755 * Note that while the NVMe spec doesn't require Controllers to return AER's 756 * in order, this implementation does preserve the order. 757 */ 758 static int 759 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 760 { 761 struct pci_nvme_aer *aer = NULL; 762 763 aer = calloc(1, sizeof(struct pci_nvme_aer)); 764 if (aer == NULL) 765 return (-1); 766 767 /* Save the Command ID for use in the completion message */ 768 aer->cid = cid; 769 770 pthread_mutex_lock(&sc->aer_mtx); 771 sc->aer_count++; 772 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 773 pthread_mutex_unlock(&sc->aer_mtx); 774 775 return (0); 776 } 777 778 /* 779 * Get an Async Event Request structure 780 * 781 * Returns a pointer to an AER previously submitted by the host or NULL if 782 * no AER's exist. Caller is responsible for freeing the returned struct. 
783 */ 784 static struct pci_nvme_aer * 785 pci_nvme_aer_get(struct pci_nvme_softc *sc) 786 { 787 struct pci_nvme_aer *aer = NULL; 788 789 pthread_mutex_lock(&sc->aer_mtx); 790 aer = STAILQ_FIRST(&sc->aer_list); 791 if (aer != NULL) { 792 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 793 sc->aer_count--; 794 } 795 pthread_mutex_unlock(&sc->aer_mtx); 796 797 return (aer); 798 } 799 800 static void 801 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 802 { 803 uint32_t atype; 804 805 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 806 807 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 808 sc->aen[atype].atype = atype; 809 } 810 } 811 812 static void 813 pci_nvme_aen_init(struct pci_nvme_softc *sc) 814 { 815 char nstr[80]; 816 817 pci_nvme_aen_reset(sc); 818 819 pthread_mutex_init(&sc->aen_mtx, NULL); 820 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 821 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 822 sc->nsc_pi->pi_func); 823 pthread_set_name_np(sc->aen_tid, nstr); 824 } 825 826 static void 827 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 828 { 829 830 pci_nvme_aen_reset(sc); 831 } 832 833 /* Notify the AEN thread of pending work */ 834 static void 835 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 836 { 837 838 pthread_cond_signal(&sc->aen_cond); 839 } 840 841 /* 842 * Post an Asynchronous Event Notification 843 */ 844 static int32_t 845 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 846 uint32_t event_data) 847 { 848 struct pci_nvme_aen *aen; 849 850 if (atype >= PCI_NVME_AE_TYPE_MAX) { 851 return(EINVAL); 852 } 853 854 pthread_mutex_lock(&sc->aen_mtx); 855 aen = &sc->aen[atype]; 856 857 /* Has the controller already posted an event of this type? */ 858 if (aen->posted) { 859 pthread_mutex_unlock(&sc->aen_mtx); 860 return(EALREADY); 861 } 862 863 aen->event_data = event_data; 864 aen->posted = true; 865 pthread_mutex_unlock(&sc->aen_mtx); 866 867 pci_nvme_aen_notify(sc); 868 869 return(0); 870 } 871 872 static void 873 pci_nvme_aen_process(struct pci_nvme_softc *sc) 874 { 875 struct pci_nvme_aer *aer; 876 struct pci_nvme_aen *aen; 877 pci_nvme_async_type atype; 878 uint32_t mask; 879 uint16_t status; 880 uint8_t lid; 881 882 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 883 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 884 aen = &sc->aen[atype]; 885 /* Previous iterations may have depleted the available AER's */ 886 if (!pci_nvme_aer_available(sc)) { 887 DPRINTF("%s: no AER", __func__); 888 break; 889 } 890 891 if (!aen->posted) { 892 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 893 continue; 894 } 895 896 status = NVME_SC_SUCCESS; 897 898 /* Is the event masked? 
*/ 899 mask = 900 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 901 902 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 903 switch (atype) { 904 case PCI_NVME_AE_TYPE_ERROR: 905 lid = NVME_LOG_ERROR; 906 break; 907 case PCI_NVME_AE_TYPE_SMART: 908 mask &= 0xff; 909 if ((mask & aen->event_data) == 0) 910 continue; 911 lid = NVME_LOG_HEALTH_INFORMATION; 912 break; 913 case PCI_NVME_AE_TYPE_NOTICE: 914 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 915 EPRINTLN("%s unknown AEN notice type %u", 916 __func__, aen->event_data); 917 status = NVME_SC_INTERNAL_DEVICE_ERROR; 918 lid = 0; 919 break; 920 } 921 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 922 continue; 923 switch (aen->event_data) { 924 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 925 lid = NVME_LOG_CHANGED_NAMESPACE; 926 break; 927 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 928 lid = NVME_LOG_FIRMWARE_SLOT; 929 break; 930 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 931 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 932 break; 933 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 934 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 935 break; 936 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 937 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 938 break; 939 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 940 lid = NVME_LOG_LBA_STATUS_INFORMATION; 941 break; 942 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 943 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 944 break; 945 default: 946 lid = 0; 947 } 948 break; 949 default: 950 /* bad type?!? */ 951 EPRINTLN("%s unknown AEN type %u", __func__, atype); 952 status = NVME_SC_INTERNAL_DEVICE_ERROR; 953 lid = 0; 954 break; 955 } 956 957 aer = pci_nvme_aer_get(sc); 958 assert(aer != NULL); 959 960 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 961 pci_nvme_cq_update(sc, &sc->compl_queues[0], 962 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 963 aer->cid, 964 0, /* SQID */ 965 status); 966 967 aen->event_data = 0; 968 aen->posted = false; 969 970 pci_generate_msix(sc->nsc_pi, 0); 971 } 972 } 973 974 static void * 975 aen_thr(void *arg) 976 { 977 struct pci_nvme_softc *sc; 978 979 sc = arg; 980 981 pthread_mutex_lock(&sc->aen_mtx); 982 for (;;) { 983 pci_nvme_aen_process(sc); 984 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 985 } 986 pthread_mutex_unlock(&sc->aen_mtx); 987 988 pthread_exit(NULL); 989 return (NULL); 990 } 991 992 static void 993 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 994 { 995 uint32_t i; 996 997 DPRINTF("%s", __func__); 998 999 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1000 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1001 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1002 1003 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1004 1005 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1006 1007 sc->regs.cc = 0; 1008 1009 assert(sc->submit_queues != NULL); 1010 1011 for (i = 0; i < sc->num_squeues + 1; i++) { 1012 sc->submit_queues[i].qbase = NULL; 1013 sc->submit_queues[i].size = 0; 1014 sc->submit_queues[i].cqid = 0; 1015 sc->submit_queues[i].tail = 0; 1016 sc->submit_queues[i].head = 0; 1017 } 1018 1019 assert(sc->compl_queues != NULL); 1020 1021 for (i = 0; i < sc->num_cqueues + 1; i++) { 1022 sc->compl_queues[i].qbase = NULL; 1023 sc->compl_queues[i].size = 0; 1024 sc->compl_queues[i].tail = 0; 1025 sc->compl_queues[i].head = 0; 1026 } 1027 1028 sc->num_q_is_set = false; 1029 1030 pci_nvme_aer_destroy(sc); 1031 pci_nvme_aen_destroy(sc); 
1032 1033 /* 1034 * Clear CSTS.RDY last to prevent the host from enabling Controller 1035 * before cleanup completes 1036 */ 1037 sc->regs.csts = 0; 1038 } 1039 1040 static void 1041 pci_nvme_reset(struct pci_nvme_softc *sc) 1042 { 1043 pthread_mutex_lock(&sc->mtx); 1044 pci_nvme_reset_locked(sc); 1045 pthread_mutex_unlock(&sc->mtx); 1046 } 1047 1048 static int 1049 pci_nvme_init_controller(struct pci_nvme_softc *sc) 1050 { 1051 uint16_t acqs, asqs; 1052 1053 DPRINTF("%s", __func__); 1054 1055 /* 1056 * NVMe 2.0 states that "enabling a controller while this field is 1057 * cleared to 0h produces undefined results" for both ACQS and 1058 * ASQS. If zero, set CFS and do not become ready. 1059 */ 1060 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); 1061 if (asqs < 2) { 1062 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1063 asqs - 1, sc->regs.aqa); 1064 sc->regs.csts |= NVME_CSTS_CFS; 1065 return (-1); 1066 } 1067 sc->submit_queues[0].size = asqs; 1068 sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1069 sc->regs.asq, sizeof(struct nvme_command) * asqs); 1070 if (sc->submit_queues[0].qbase == NULL) { 1071 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1072 sc->regs.asq); 1073 sc->regs.csts |= NVME_CSTS_CFS; 1074 return (-1); 1075 } 1076 1077 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1078 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1079 1080 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1081 NVME_AQA_REG_ACQS_MASK); 1082 if (acqs < 2) { 1083 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1084 acqs - 1, sc->regs.aqa); 1085 sc->regs.csts |= NVME_CSTS_CFS; 1086 return (-1); 1087 } 1088 sc->compl_queues[0].size = acqs; 1089 sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1090 sc->regs.acq, sizeof(struct nvme_completion) * acqs); 1091 if (sc->compl_queues[0].qbase == NULL) { 1092 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1093 sc->regs.acq); 1094 sc->regs.csts |= NVME_CSTS_CFS; 1095 return (-1); 1096 } 1097 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1098 1099 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1100 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1101 1102 return (0); 1103 } 1104 1105 static int 1106 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1107 size_t len, enum nvme_copy_dir dir) 1108 { 1109 uint8_t *p; 1110 size_t bytes; 1111 1112 if (len > (8 * 1024)) { 1113 return (-1); 1114 } 1115 1116 /* Copy from the start of prp1 to the end of the physical page */ 1117 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1118 bytes = MIN(bytes, len); 1119 1120 p = vm_map_gpa(ctx, prp1, bytes); 1121 if (p == NULL) { 1122 return (-1); 1123 } 1124 1125 if (dir == NVME_COPY_TO_PRP) 1126 memcpy(p, b, bytes); 1127 else 1128 memcpy(b, p, bytes); 1129 1130 b += bytes; 1131 1132 len -= bytes; 1133 if (len == 0) { 1134 return (0); 1135 } 1136 1137 len = MIN(len, PAGE_SIZE); 1138 1139 p = vm_map_gpa(ctx, prp2, len); 1140 if (p == NULL) { 1141 return (-1); 1142 } 1143 1144 if (dir == NVME_COPY_TO_PRP) 1145 memcpy(p, b, len); 1146 else 1147 memcpy(b, p, len); 1148 1149 return (0); 1150 } 1151 1152 /* 1153 * Write a Completion Queue Entry update 1154 * 1155 * Write the completion and update the doorbell value 1156 */ 1157 static void 1158 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1159 struct nvme_completion_queue *cq, 1160 uint32_t cdw0, 1161 uint16_t cid, 1162 uint16_t sqid, 1163 uint16_t status) 1164 { 1165 struct nvme_submission_queue *sq = 
&sc->submit_queues[sqid]; 1166 struct nvme_completion *cqe; 1167 1168 assert(cq->qbase != NULL); 1169 1170 pthread_mutex_lock(&cq->mtx); 1171 1172 cqe = &cq->qbase[cq->tail]; 1173 1174 /* Flip the phase bit */ 1175 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1176 1177 cqe->cdw0 = cdw0; 1178 cqe->sqhd = sq->head; 1179 cqe->sqid = sqid; 1180 cqe->cid = cid; 1181 cqe->status = status; 1182 1183 cq->tail++; 1184 if (cq->tail >= cq->size) { 1185 cq->tail = 0; 1186 } 1187 1188 pthread_mutex_unlock(&cq->mtx); 1189 } 1190 1191 static int 1192 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1193 struct nvme_completion* compl) 1194 { 1195 uint16_t qid = command->cdw10 & 0xffff; 1196 1197 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1198 if (qid == 0 || qid > sc->num_squeues || 1199 (sc->submit_queues[qid].qbase == NULL)) { 1200 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1201 __func__, qid, sc->num_squeues); 1202 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1203 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1204 return (1); 1205 } 1206 1207 sc->submit_queues[qid].qbase = NULL; 1208 sc->submit_queues[qid].cqid = 0; 1209 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1210 return (1); 1211 } 1212 1213 static int 1214 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1215 struct nvme_completion* compl) 1216 { 1217 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1218 uint16_t qid = command->cdw10 & 0xffff; 1219 struct nvme_submission_queue *nsq; 1220 1221 if ((qid == 0) || (qid > sc->num_squeues) || 1222 (sc->submit_queues[qid].qbase != NULL)) { 1223 WPRINTF("%s queue index %u > num_squeues %u", 1224 __func__, qid, sc->num_squeues); 1225 pci_nvme_status_tc(&compl->status, 1226 NVME_SCT_COMMAND_SPECIFIC, 1227 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1228 return (1); 1229 } 1230 1231 nsq = &sc->submit_queues[qid]; 1232 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1233 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1234 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1235 /* 1236 * Queues must specify at least two entries 1237 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1238 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1239 */ 1240 pci_nvme_status_tc(&compl->status, 1241 NVME_SCT_COMMAND_SPECIFIC, 1242 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1243 return (1); 1244 } 1245 nsq->head = nsq->tail = 0; 1246 1247 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1248 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1249 pci_nvme_status_tc(&compl->status, 1250 NVME_SCT_COMMAND_SPECIFIC, 1251 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1252 return (1); 1253 } 1254 1255 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1256 pci_nvme_status_tc(&compl->status, 1257 NVME_SCT_COMMAND_SPECIFIC, 1258 NVME_SC_COMPLETION_QUEUE_INVALID); 1259 return (1); 1260 } 1261 1262 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1263 1264 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1265 sizeof(struct nvme_command) * (size_t)nsq->size); 1266 1267 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1268 qid, nsq->size, nsq->qbase, nsq->cqid); 1269 1270 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1271 1272 DPRINTF("%s completed creating IOSQ qid %u", 1273 __func__, qid); 1274 } else { 1275 /* 1276 * Guest sent non-cont submission queue request. 1277 * This setting is unsupported by this emulation. 
1278 */ 1279 WPRINTF("%s unsupported non-contig (list-based) " 1280 "create i/o submission queue", __func__); 1281 1282 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1283 } 1284 return (1); 1285 } 1286 1287 static int 1288 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1289 struct nvme_completion* compl) 1290 { 1291 uint16_t qid = command->cdw10 & 0xffff; 1292 uint16_t sqid; 1293 1294 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1295 if (qid == 0 || qid > sc->num_cqueues || 1296 (sc->compl_queues[qid].qbase == NULL)) { 1297 WPRINTF("%s queue index %u / num_cqueues %u", 1298 __func__, qid, sc->num_cqueues); 1299 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1300 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1301 return (1); 1302 } 1303 1304 /* Deleting an Active CQ is an error */ 1305 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1306 if (sc->submit_queues[sqid].cqid == qid) { 1307 pci_nvme_status_tc(&compl->status, 1308 NVME_SCT_COMMAND_SPECIFIC, 1309 NVME_SC_INVALID_QUEUE_DELETION); 1310 return (1); 1311 } 1312 1313 sc->compl_queues[qid].qbase = NULL; 1314 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1315 return (1); 1316 } 1317 1318 static int 1319 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1320 struct nvme_completion* compl) 1321 { 1322 struct nvme_completion_queue *ncq; 1323 uint16_t qid = command->cdw10 & 0xffff; 1324 1325 /* Only support Physically Contiguous queues */ 1326 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1327 WPRINTF("%s unsupported non-contig (list-based) " 1328 "create i/o completion queue", 1329 __func__); 1330 1331 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1332 return (1); 1333 } 1334 1335 if ((qid == 0) || (qid > sc->num_cqueues) || 1336 (sc->compl_queues[qid].qbase != NULL)) { 1337 WPRINTF("%s queue index %u > num_cqueues %u", 1338 __func__, qid, sc->num_cqueues); 1339 pci_nvme_status_tc(&compl->status, 1340 NVME_SCT_COMMAND_SPECIFIC, 1341 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1342 return (1); 1343 } 1344 1345 ncq = &sc->compl_queues[qid]; 1346 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1347 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1348 if (ncq->intr_vec > (sc->max_queues + 1)) { 1349 pci_nvme_status_tc(&compl->status, 1350 NVME_SCT_COMMAND_SPECIFIC, 1351 NVME_SC_INVALID_INTERRUPT_VECTOR); 1352 return (1); 1353 } 1354 1355 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1356 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1357 /* 1358 * Queues must specify at least two entries 1359 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1360 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1361 */ 1362 pci_nvme_status_tc(&compl->status, 1363 NVME_SCT_COMMAND_SPECIFIC, 1364 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1365 return (1); 1366 } 1367 ncq->head = ncq->tail = 0; 1368 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1369 command->prp1, 1370 sizeof(struct nvme_command) * (size_t)ncq->size); 1371 1372 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1373 1374 1375 return (1); 1376 } 1377 1378 static int 1379 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1380 struct nvme_completion* compl) 1381 { 1382 uint64_t logoff; 1383 uint32_t logsize; 1384 uint8_t logpage; 1385 1386 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1387 1388 /* 1389 * Command specifies the number of dwords to return in fields NUMDU 1390 * and NUMDL. This is a zero-based value. 
1391 */ 1392 logpage = command->cdw10 & 0xFF; 1393 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1394 logsize *= sizeof(uint32_t); 1395 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1396 1397 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1398 1399 switch (logpage) { 1400 case NVME_LOG_ERROR: 1401 if (logoff >= sizeof(sc->err_log)) { 1402 pci_nvme_status_genc(&compl->status, 1403 NVME_SC_INVALID_FIELD); 1404 break; 1405 } 1406 1407 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1408 command->prp2, (uint8_t *)&sc->err_log + logoff, 1409 MIN(logsize - logoff, sizeof(sc->err_log)), 1410 NVME_COPY_TO_PRP); 1411 break; 1412 case NVME_LOG_HEALTH_INFORMATION: 1413 if (logoff >= sizeof(sc->health_log)) { 1414 pci_nvme_status_genc(&compl->status, 1415 NVME_SC_INVALID_FIELD); 1416 break; 1417 } 1418 1419 pthread_mutex_lock(&sc->mtx); 1420 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1421 sizeof(sc->health_log.data_units_read)); 1422 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1423 sizeof(sc->health_log.data_units_written)); 1424 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1425 sizeof(sc->health_log.host_read_commands)); 1426 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1427 sizeof(sc->health_log.host_write_commands)); 1428 pthread_mutex_unlock(&sc->mtx); 1429 1430 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1431 command->prp2, (uint8_t *)&sc->health_log + logoff, 1432 MIN(logsize - logoff, sizeof(sc->health_log)), 1433 NVME_COPY_TO_PRP); 1434 break; 1435 case NVME_LOG_FIRMWARE_SLOT: 1436 if (logoff >= sizeof(sc->fw_log)) { 1437 pci_nvme_status_genc(&compl->status, 1438 NVME_SC_INVALID_FIELD); 1439 break; 1440 } 1441 1442 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1443 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1444 MIN(logsize - logoff, sizeof(sc->fw_log)), 1445 NVME_COPY_TO_PRP); 1446 break; 1447 case NVME_LOG_CHANGED_NAMESPACE: 1448 if (logoff >= sizeof(sc->ns_log)) { 1449 pci_nvme_status_genc(&compl->status, 1450 NVME_SC_INVALID_FIELD); 1451 break; 1452 } 1453 1454 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1455 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1456 MIN(logsize - logoff, sizeof(sc->ns_log)), 1457 NVME_COPY_TO_PRP); 1458 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1459 break; 1460 default: 1461 DPRINTF("%s get log page %x command not supported", 1462 __func__, logpage); 1463 1464 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1465 NVME_SC_INVALID_LOG_PAGE); 1466 } 1467 1468 return (1); 1469 } 1470 1471 static int 1472 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1473 struct nvme_completion* compl) 1474 { 1475 void *dest; 1476 uint16_t status; 1477 1478 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1479 command->cdw10 & 0xFF, command->nsid); 1480 1481 status = 0; 1482 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1483 1484 switch (command->cdw10 & 0xFF) { 1485 case 0x00: /* return Identify Namespace data structure */ 1486 /* Global NS only valid with NS Management */ 1487 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1488 pci_nvme_status_genc(&status, 1489 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1490 break; 1491 } 1492 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1493 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1494 NVME_COPY_TO_PRP); 1495 break; 1496 case 0x01: /* return Identify Controller data structure */ 1497 
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1498 command->prp2, (uint8_t *)&sc->ctrldata, 1499 sizeof(sc->ctrldata), 1500 NVME_COPY_TO_PRP); 1501 break; 1502 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1503 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1504 sizeof(uint32_t) * 1024); 1505 /* All unused entries shall be zero */ 1506 memset(dest, 0, sizeof(uint32_t) * 1024); 1507 ((uint32_t *)dest)[0] = 1; 1508 break; 1509 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1510 if (command->nsid != 1) { 1511 pci_nvme_status_genc(&status, 1512 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1513 break; 1514 } 1515 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1516 sizeof(uint32_t) * 1024); 1517 /* All bytes after the descriptor shall be zero */ 1518 memset(dest, 0, sizeof(uint32_t) * 1024); 1519 1520 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1521 ((uint8_t *)dest)[0] = 1; 1522 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1523 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1524 break; 1525 case 0x13: 1526 /* 1527 * Controller list is optional but used by UNH tests. Return 1528 * a valid but empty list. 1529 */ 1530 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1531 sizeof(uint16_t) * 2048); 1532 memset(dest, 0, sizeof(uint16_t) * 2048); 1533 break; 1534 default: 1535 DPRINTF("%s unsupported identify command requested 0x%x", 1536 __func__, command->cdw10 & 0xFF); 1537 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1538 break; 1539 } 1540 1541 compl->status = status; 1542 return (1); 1543 } 1544 1545 static const char * 1546 nvme_fid_to_name(uint8_t fid) 1547 { 1548 const char *name; 1549 1550 switch (fid) { 1551 case NVME_FEAT_ARBITRATION: 1552 name = "Arbitration"; 1553 break; 1554 case NVME_FEAT_POWER_MANAGEMENT: 1555 name = "Power Management"; 1556 break; 1557 case NVME_FEAT_LBA_RANGE_TYPE: 1558 name = "LBA Range Type"; 1559 break; 1560 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1561 name = "Temperature Threshold"; 1562 break; 1563 case NVME_FEAT_ERROR_RECOVERY: 1564 name = "Error Recovery"; 1565 break; 1566 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1567 name = "Volatile Write Cache"; 1568 break; 1569 case NVME_FEAT_NUMBER_OF_QUEUES: 1570 name = "Number of Queues"; 1571 break; 1572 case NVME_FEAT_INTERRUPT_COALESCING: 1573 name = "Interrupt Coalescing"; 1574 break; 1575 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1576 name = "Interrupt Vector Configuration"; 1577 break; 1578 case NVME_FEAT_WRITE_ATOMICITY: 1579 name = "Write Atomicity Normal"; 1580 break; 1581 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1582 name = "Asynchronous Event Configuration"; 1583 break; 1584 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1585 name = "Autonomous Power State Transition"; 1586 break; 1587 case NVME_FEAT_HOST_MEMORY_BUFFER: 1588 name = "Host Memory Buffer"; 1589 break; 1590 case NVME_FEAT_TIMESTAMP: 1591 name = "Timestamp"; 1592 break; 1593 case NVME_FEAT_KEEP_ALIVE_TIMER: 1594 name = "Keep Alive Timer"; 1595 break; 1596 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1597 name = "Host Controlled Thermal Management"; 1598 break; 1599 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1600 name = "Non-Operation Power State Config"; 1601 break; 1602 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1603 name = "Read Recovery Level Config"; 1604 break; 1605 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1606 name = "Predictable Latency Mode Config"; 1607 break; 1608 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1609 name = "Predictable Latency Mode 
Window"; 1610 break; 1611 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1612 name = "LBA Status Information Report Interval"; 1613 break; 1614 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1615 name = "Host Behavior Support"; 1616 break; 1617 case NVME_FEAT_SANITIZE_CONFIG: 1618 name = "Sanitize Config"; 1619 break; 1620 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1621 name = "Endurance Group Event Configuration"; 1622 break; 1623 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1624 name = "Software Progress Marker"; 1625 break; 1626 case NVME_FEAT_HOST_IDENTIFIER: 1627 name = "Host Identifier"; 1628 break; 1629 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1630 name = "Reservation Notification Mask"; 1631 break; 1632 case NVME_FEAT_RESERVATION_PERSISTENCE: 1633 name = "Reservation Persistence"; 1634 break; 1635 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1636 name = "Namespace Write Protection Config"; 1637 break; 1638 default: 1639 name = "Unknown"; 1640 break; 1641 } 1642 1643 return (name); 1644 } 1645 1646 static void 1647 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1648 struct nvme_feature_obj *feat __unused, 1649 struct nvme_command *command __unused, 1650 struct nvme_completion *compl) 1651 { 1652 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1653 } 1654 1655 static void 1656 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1657 struct nvme_feature_obj *feat __unused, 1658 struct nvme_command *command, 1659 struct nvme_completion *compl) 1660 { 1661 uint32_t i; 1662 uint32_t cdw11 = command->cdw11; 1663 uint16_t iv; 1664 bool cd; 1665 1666 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1667 1668 iv = cdw11 & 0xffff; 1669 cd = cdw11 & (1 << 16); 1670 1671 if (iv > (sc->max_queues + 1)) { 1672 return; 1673 } 1674 1675 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1676 if ((iv == 0) && !cd) 1677 return; 1678 1679 /* Requested Interrupt Vector must be used by a CQ */ 1680 for (i = 0; i < sc->num_cqueues + 1; i++) { 1681 if (sc->compl_queues[i].intr_vec == iv) { 1682 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1683 } 1684 } 1685 } 1686 1687 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1688 static void 1689 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1690 struct nvme_feature_obj *feat __unused, 1691 struct nvme_command *command, 1692 struct nvme_completion *compl) 1693 { 1694 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1695 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1696 } 1697 1698 #define NVME_TEMP_THRESH_OVER 0 1699 #define NVME_TEMP_THRESH_UNDER 1 1700 static void 1701 nvme_feature_temperature(struct pci_nvme_softc *sc, 1702 struct nvme_feature_obj *feat __unused, 1703 struct nvme_command *command, 1704 struct nvme_completion *compl) 1705 { 1706 uint16_t tmpth; /* Temperature Threshold */ 1707 uint8_t tmpsel; /* Threshold Temperature Select */ 1708 uint8_t thsel; /* Threshold Type Select */ 1709 bool set_crit = false; 1710 bool report_crit; 1711 1712 tmpth = command->cdw11 & 0xffff; 1713 tmpsel = (command->cdw11 >> 16) & 0xf; 1714 thsel = (command->cdw11 >> 20) & 0x3; 1715 1716 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1717 1718 /* Check for unsupported values */ 1719 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1720 (thsel > NVME_TEMP_THRESH_UNDER)) { 1721 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1722 return; 1723 } 1724 1725 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1726 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1727 set_crit = true; 1728 1729 pthread_mutex_lock(&sc->mtx); 1730 if (set_crit) 1731 sc->health_log.critical_warning |= 1732 NVME_CRIT_WARN_ST_TEMPERATURE; 1733 else 1734 sc->health_log.critical_warning &= 1735 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1736 pthread_mutex_unlock(&sc->mtx); 1737 1738 report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & 1739 NVME_CRIT_WARN_ST_TEMPERATURE; 1740 1741 if (set_crit && report_crit) 1742 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1743 sc->health_log.critical_warning); 1744 1745 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1746 } 1747 1748 static void 1749 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1750 struct nvme_feature_obj *feat __unused, 1751 struct nvme_command *command, 1752 struct nvme_completion *compl) 1753 { 1754 uint16_t nqr; /* Number of Queues Requested */ 1755 1756 if (sc->num_q_is_set) { 1757 WPRINTF("%s: Number of Queues already set", __func__); 1758 pci_nvme_status_genc(&compl->status, 1759 NVME_SC_COMMAND_SEQUENCE_ERROR); 1760 return; 1761 } 1762 1763 nqr = command->cdw11 & 0xFFFF; 1764 if (nqr == 0xffff) { 1765 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1766 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1767 return; 1768 } 1769 1770 sc->num_squeues = ONE_BASED(nqr); 1771 if (sc->num_squeues > sc->max_queues) { 1772 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1773 sc->max_queues); 1774 sc->num_squeues = sc->max_queues; 1775 } 1776 1777 nqr = (command->cdw11 >> 16) & 0xFFFF; 1778 if (nqr == 0xffff) { 1779 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1780 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1781 return; 1782 } 1783 1784 sc->num_cqueues = ONE_BASED(nqr); 1785 if (sc->num_cqueues > sc->max_queues) { 1786 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1787 sc->max_queues); 1788 sc->num_cqueues = sc->max_queues; 1789 } 1790 1791 /* Patch the command value which will be saved on callback's return */ 1792 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1793 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1794 1795 sc->num_q_is_set = true; 1796 } 1797 1798 static int 1799 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1800 struct nvme_completion *compl) 1801 { 1802 struct nvme_feature_obj *feat; 1803 uint32_t nsid = command->nsid; 1804 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); 1805 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); 1806 1807 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1808 1809 if (fid >= NVME_FID_MAX) { 1810 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1811 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1812 return (1); 1813 } 1814 1815 if (sv) { 1816 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1817 NVME_SC_FEATURE_NOT_SAVEABLE); 1818 return (1); 1819 } 1820 1821 feat = &sc->feat[fid]; 1822 1823 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1824 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1825 return (1); 1826 } 1827 1828 if (!feat->namespace_specific && 1829 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1830 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1831 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1832 return (1); 1833 } 1834 1835 compl->cdw0 = 0; 1836 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1837 1838 if (feat->set) 1839 feat->set(sc, feat, command, compl); 1840 else { 1841 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1842 NVME_SC_FEATURE_NOT_CHANGEABLE); 1843 return (1); 1844 } 1845 1846 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1847 if (compl->status == NVME_SC_SUCCESS) { 1848 feat->cdw11 = command->cdw11; 1849 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1850 (command->cdw11 != 0)) 1851 pci_nvme_aen_notify(sc); 1852 } 1853 1854 return (0); 1855 } 1856 1857 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1858 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1859 1860 static int 1861 
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1862 struct nvme_completion* compl) 1863 { 1864 struct nvme_feature_obj *feat; 1865 uint8_t fid = command->cdw10 & 0xFF; 1866 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1867 1868 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1869 1870 if (fid >= NVME_FID_MAX) { 1871 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1872 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1873 return (1); 1874 } 1875 1876 compl->cdw0 = 0; 1877 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1878 1879 feat = &sc->feat[fid]; 1880 if (feat->get) { 1881 feat->get(sc, feat, command, compl); 1882 } 1883 1884 if (compl->status == NVME_SC_SUCCESS) { 1885 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1886 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1887 else 1888 compl->cdw0 = feat->cdw11; 1889 } 1890 1891 return (0); 1892 } 1893 1894 static int 1895 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1896 struct nvme_completion* compl) 1897 { 1898 uint8_t ses, lbaf, pi; 1899 1900 /* Only supports Secure Erase Setting - User Data Erase */ 1901 ses = (command->cdw10 >> 9) & 0x7; 1902 if (ses > 0x1) { 1903 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1904 return (1); 1905 } 1906 1907 /* Only supports a single LBA Format */ 1908 lbaf = command->cdw10 & 0xf; 1909 if (lbaf != 0) { 1910 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1911 NVME_SC_INVALID_FORMAT); 1912 return (1); 1913 } 1914 1915 /* Doesn't support Protection Information */ 1916 pi = (command->cdw10 >> 5) & 0x7; 1917 if (pi != 0) { 1918 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1919 return (1); 1920 } 1921 1922 if (sc->nvstore.type == NVME_STOR_RAM) { 1923 if (sc->nvstore.ctx) 1924 free(sc->nvstore.ctx); 1925 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1926 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1927 } else { 1928 struct pci_nvme_ioreq *req; 1929 int err; 1930 1931 req = pci_nvme_get_ioreq(sc); 1932 if (req == NULL) { 1933 pci_nvme_status_genc(&compl->status, 1934 NVME_SC_INTERNAL_DEVICE_ERROR); 1935 WPRINTF("%s: unable to allocate IO req", __func__); 1936 return (1); 1937 } 1938 req->nvme_sq = &sc->submit_queues[0]; 1939 req->sqid = 0; 1940 req->opc = command->opc; 1941 req->cid = command->cid; 1942 req->nsid = command->nsid; 1943 1944 req->io_req.br_offset = 0; 1945 req->io_req.br_resid = sc->nvstore.size; 1946 req->io_req.br_callback = pci_nvme_io_done; 1947 1948 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1949 if (err) { 1950 pci_nvme_status_genc(&compl->status, 1951 NVME_SC_INTERNAL_DEVICE_ERROR); 1952 pci_nvme_release_ioreq(sc, req); 1953 } else 1954 compl->status = NVME_NO_STATUS; 1955 } 1956 1957 return (1); 1958 } 1959 1960 static int 1961 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 1962 struct nvme_completion *compl) 1963 { 1964 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1965 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1966 1967 /* TODO: search for the command ID and abort it */ 1968 1969 compl->cdw0 = 1; 1970 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1971 return (1); 1972 } 1973 1974 static int 1975 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1976 struct nvme_command* command, struct nvme_completion* compl) 1977 { 1978 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1979 sc->aer_count, 
sc->ctrldata.aerl, command->cid); 1980 1981 /* Don't exceed the Async Event Request Limit (AERL). */ 1982 if (pci_nvme_aer_limit_reached(sc)) { 1983 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1984 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1985 return (1); 1986 } 1987 1988 if (pci_nvme_aer_add(sc, command->cid)) { 1989 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1990 NVME_SC_INTERNAL_DEVICE_ERROR); 1991 return (1); 1992 } 1993 1994 /* 1995 * Raise events when they happen based on the Set Features cmd. 1996 * These events happen async, so only set completion successful if 1997 * there is an event reflective of the request to get event. 1998 */ 1999 compl->status = NVME_NO_STATUS; 2000 pci_nvme_aen_notify(sc); 2001 2002 return (0); 2003 } 2004 2005 static void 2006 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2007 { 2008 struct nvme_completion compl; 2009 struct nvme_command *cmd; 2010 struct nvme_submission_queue *sq; 2011 struct nvme_completion_queue *cq; 2012 uint16_t sqhead; 2013 2014 DPRINTF("%s index %u", __func__, (uint32_t)value); 2015 2016 sq = &sc->submit_queues[0]; 2017 cq = &sc->compl_queues[0]; 2018 2019 pthread_mutex_lock(&sq->mtx); 2020 2021 sqhead = sq->head; 2022 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2023 2024 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2025 cmd = &(sq->qbase)[sqhead]; 2026 compl.cdw0 = 0; 2027 compl.status = 0; 2028 2029 switch (cmd->opc) { 2030 case NVME_OPC_DELETE_IO_SQ: 2031 DPRINTF("%s command DELETE_IO_SQ", __func__); 2032 nvme_opc_delete_io_sq(sc, cmd, &compl); 2033 break; 2034 case NVME_OPC_CREATE_IO_SQ: 2035 DPRINTF("%s command CREATE_IO_SQ", __func__); 2036 nvme_opc_create_io_sq(sc, cmd, &compl); 2037 break; 2038 case NVME_OPC_DELETE_IO_CQ: 2039 DPRINTF("%s command DELETE_IO_CQ", __func__); 2040 nvme_opc_delete_io_cq(sc, cmd, &compl); 2041 break; 2042 case NVME_OPC_CREATE_IO_CQ: 2043 DPRINTF("%s command CREATE_IO_CQ", __func__); 2044 nvme_opc_create_io_cq(sc, cmd, &compl); 2045 break; 2046 case NVME_OPC_GET_LOG_PAGE: 2047 DPRINTF("%s command GET_LOG_PAGE", __func__); 2048 nvme_opc_get_log_page(sc, cmd, &compl); 2049 break; 2050 case NVME_OPC_IDENTIFY: 2051 DPRINTF("%s command IDENTIFY", __func__); 2052 nvme_opc_identify(sc, cmd, &compl); 2053 break; 2054 case NVME_OPC_ABORT: 2055 DPRINTF("%s command ABORT", __func__); 2056 nvme_opc_abort(sc, cmd, &compl); 2057 break; 2058 case NVME_OPC_SET_FEATURES: 2059 DPRINTF("%s command SET_FEATURES", __func__); 2060 nvme_opc_set_features(sc, cmd, &compl); 2061 break; 2062 case NVME_OPC_GET_FEATURES: 2063 DPRINTF("%s command GET_FEATURES", __func__); 2064 nvme_opc_get_features(sc, cmd, &compl); 2065 break; 2066 case NVME_OPC_FIRMWARE_ACTIVATE: 2067 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2068 pci_nvme_status_tc(&compl.status, 2069 NVME_SCT_COMMAND_SPECIFIC, 2070 NVME_SC_INVALID_FIRMWARE_SLOT); 2071 break; 2072 case NVME_OPC_ASYNC_EVENT_REQUEST: 2073 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2074 nvme_opc_async_event_req(sc, cmd, &compl); 2075 break; 2076 case NVME_OPC_FORMAT_NVM: 2077 DPRINTF("%s command FORMAT_NVM", __func__); 2078 if ((sc->ctrldata.oacs & 2079 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2080 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2081 break; 2082 } 2083 nvme_opc_format_nvm(sc, cmd, &compl); 2084 break; 2085 case NVME_OPC_SECURITY_SEND: 2086 case NVME_OPC_SECURITY_RECEIVE: 2087 case NVME_OPC_SANITIZE: 2088 case NVME_OPC_GET_LBA_STATUS: 2089 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2090 cmd->opc); 2091 /* Valid but unsupported opcodes */ 2092 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2093 break; 2094 default: 2095 DPRINTF("%s command OPC=%#X (not implemented)", 2096 __func__, 2097 cmd->opc); 2098 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2099 } 2100 sqhead = (sqhead + 1) % sq->size; 2101 2102 if (NVME_COMPLETION_VALID(compl)) { 2103 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2104 compl.cdw0, 2105 cmd->cid, 2106 0, /* SQID */ 2107 compl.status); 2108 } 2109 } 2110 2111 DPRINTF("setting sqhead %u", sqhead); 2112 sq->head = sqhead; 2113 2114 if (cq->head != cq->tail) 2115 pci_generate_msix(sc->nsc_pi, 0); 2116 2117 pthread_mutex_unlock(&sq->mtx); 2118 } 2119 2120 /* 2121 * Update the Write and Read statistics reported in SMART data 2122 * 2123 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2124 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2125 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2126 */ 2127 static void 2128 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2129 size_t bytes, uint16_t status) 2130 { 2131 2132 pthread_mutex_lock(&sc->mtx); 2133 switch (opc) { 2134 case NVME_OPC_WRITE: 2135 sc->write_commands++; 2136 if (status != NVME_SC_SUCCESS) 2137 break; 2138 sc->write_dunits_remainder += (bytes / 512); 2139 while (sc->write_dunits_remainder >= 1000) { 2140 sc->write_data_units++; 2141 sc->write_dunits_remainder -= 1000; 2142 } 2143 break; 2144 case NVME_OPC_READ: 2145 sc->read_commands++; 2146 if (status != NVME_SC_SUCCESS) 2147 break; 2148 sc->read_dunits_remainder += (bytes / 512); 2149 while (sc->read_dunits_remainder >= 1000) { 2150 sc->read_data_units++; 2151 sc->read_dunits_remainder -= 1000; 2152 } 2153 break; 2154 default: 2155 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2156 break; 2157 } 2158 pthread_mutex_unlock(&sc->mtx); 2159 } 2160 2161 /* 2162 * Check if the combination of Starting LBA (slba) and number of blocks 2163 * exceeds the range of the underlying storage. 2164 * 2165 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2166 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2167 * overflow. 2168 */ 2169 static bool 2170 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2171 uint32_t nblocks) 2172 { 2173 size_t offset, bytes; 2174 2175 /* Overflow check of multiplying Starting LBA by the sector size */ 2176 if (slba >> (64 - nvstore->sectsz_bits)) 2177 return (true); 2178 2179 offset = slba << nvstore->sectsz_bits; 2180 bytes = nblocks << nvstore->sectsz_bits; 2181 2182 /* Overflow check of Number of Logical Blocks */ 2183 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2184 return (true); 2185 2186 return (false); 2187 } 2188 2189 static int 2190 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2191 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2192 { 2193 int iovidx; 2194 bool range_is_contiguous; 2195 2196 if (req == NULL) 2197 return (-1); 2198 2199 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2200 return (-1); 2201 } 2202 2203 /* 2204 * Minimize the number of IOVs by concatenating contiguous address 2205 * ranges. If the IOV count is zero, there is no previous range to 2206 * concatenate. 
2207 */ 2208 if (req->io_req.br_iovcnt == 0) 2209 range_is_contiguous = false; 2210 else 2211 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2212 2213 if (range_is_contiguous) { 2214 iovidx = req->io_req.br_iovcnt - 1; 2215 2216 req->io_req.br_iov[iovidx].iov_base = 2217 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2218 req->prev_gpaddr, size); 2219 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2220 return (-1); 2221 2222 req->prev_size += size; 2223 req->io_req.br_resid += size; 2224 2225 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2226 } else { 2227 iovidx = req->io_req.br_iovcnt; 2228 if (iovidx == 0) { 2229 req->io_req.br_offset = offset; 2230 req->io_req.br_resid = 0; 2231 req->io_req.br_param = req; 2232 } 2233 2234 req->io_req.br_iov[iovidx].iov_base = 2235 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2236 gpaddr, size); 2237 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2238 return (-1); 2239 2240 req->io_req.br_iov[iovidx].iov_len = size; 2241 2242 req->prev_gpaddr = gpaddr; 2243 req->prev_size = size; 2244 req->io_req.br_resid += size; 2245 2246 req->io_req.br_iovcnt++; 2247 } 2248 2249 return (0); 2250 } 2251 2252 static void 2253 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2254 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2255 { 2256 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2257 2258 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2259 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2260 NVME_STATUS_GET_SC(status)); 2261 2262 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2263 2264 if (cq->head != cq->tail) { 2265 if (cq->intr_en & NVME_CQ_INTEN) { 2266 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2267 } else { 2268 DPRINTF("%s: CQ%u interrupt disabled", 2269 __func__, sq->cqid); 2270 } 2271 } 2272 } 2273 2274 static void 2275 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2276 { 2277 req->sc = NULL; 2278 req->nvme_sq = NULL; 2279 req->sqid = 0; 2280 2281 pthread_mutex_lock(&sc->mtx); 2282 2283 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2284 sc->pending_ios--; 2285 2286 /* when no more IO pending, can set to ready if device reset/enabled */ 2287 if (sc->pending_ios == 0 && 2288 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2289 sc->regs.csts |= NVME_CSTS_RDY; 2290 2291 pthread_mutex_unlock(&sc->mtx); 2292 2293 sem_post(&sc->iosemlock); 2294 } 2295 2296 static struct pci_nvme_ioreq * 2297 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2298 { 2299 struct pci_nvme_ioreq *req = NULL; 2300 2301 sem_wait(&sc->iosemlock); 2302 pthread_mutex_lock(&sc->mtx); 2303 2304 req = STAILQ_FIRST(&sc->ioreqs_free); 2305 assert(req != NULL); 2306 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2307 2308 req->sc = sc; 2309 2310 sc->pending_ios++; 2311 2312 pthread_mutex_unlock(&sc->mtx); 2313 2314 req->io_req.br_iovcnt = 0; 2315 req->io_req.br_offset = 0; 2316 req->io_req.br_resid = 0; 2317 req->io_req.br_param = req; 2318 req->prev_gpaddr = 0; 2319 req->prev_size = 0; 2320 2321 return req; 2322 } 2323 2324 static void 2325 pci_nvme_io_done(struct blockif_req *br, int err) 2326 { 2327 struct pci_nvme_ioreq *req = br->br_param; 2328 struct nvme_submission_queue *sq = req->nvme_sq; 2329 uint16_t code, status; 2330 2331 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2332 2333 /* TODO return correct error */ 2334 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2335 status = 0; 2336 pci_nvme_status_genc(&status, code); 2337 2338 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2339 pci_nvme_stats_write_read_update(req->sc, req->opc, 2340 req->bytes, status); 2341 pci_nvme_release_ioreq(req->sc, req); 2342 } 2343 2344 /* 2345 * Implements the Flush command. The specification states: 2346 * If a volatile write cache is not present, Flush commands complete 2347 * successfully and have no effect 2348 * in the description of the Volatile Write Cache (VWC) field of the Identify 2349 * Controller data. Therefore, set status to Success if the command is 2350 * not supported (i.e. RAM or as indicated by the blockif). 2351 */ 2352 static bool 2353 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2354 struct nvme_command *cmd __unused, 2355 struct pci_nvme_blockstore *nvstore, 2356 struct pci_nvme_ioreq *req, 2357 uint16_t *status) 2358 { 2359 bool pending = false; 2360 2361 if (nvstore->type == NVME_STOR_RAM) { 2362 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2363 } else { 2364 int err; 2365 2366 req->io_req.br_callback = pci_nvme_io_done; 2367 2368 err = blockif_flush(nvstore->ctx, &req->io_req); 2369 switch (err) { 2370 case 0: 2371 pending = true; 2372 break; 2373 case EOPNOTSUPP: 2374 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2375 break; 2376 default: 2377 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2378 } 2379 } 2380 2381 return (pending); 2382 } 2383 2384 static uint16_t 2385 nvme_write_read_ram(struct pci_nvme_softc *sc, 2386 struct pci_nvme_blockstore *nvstore, 2387 uint64_t prp1, uint64_t prp2, 2388 size_t offset, uint64_t bytes, 2389 bool is_write) 2390 { 2391 uint8_t *buf = nvstore->ctx; 2392 enum nvme_copy_dir dir; 2393 uint16_t status; 2394 2395 if (is_write) 2396 dir = NVME_COPY_TO_PRP; 2397 else 2398 dir = NVME_COPY_FROM_PRP; 2399 2400 status = 0; 2401 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2402 buf + offset, bytes, dir)) 2403 pci_nvme_status_genc(&status, 2404 NVME_SC_DATA_TRANSFER_ERROR); 2405 else 2406 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2407 2408 return (status); 2409 } 2410 2411 static uint16_t 2412 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2413 struct pci_nvme_blockstore *nvstore, 2414 struct pci_nvme_ioreq *req, 2415 uint64_t prp1, uint64_t prp2, 2416 size_t offset, uint64_t bytes, 2417 bool is_write) 2418 { 2419 uint64_t size; 2420 int err; 2421 uint16_t status = NVME_NO_STATUS; 2422 2423 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2424 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2425 err = -1; 2426 goto out; 2427 } 2428 2429 offset += size; 2430 bytes -= size; 2431 2432 if (bytes == 0) { 2433 ; 2434 } else if (bytes <= PAGE_SIZE) { 2435 size = bytes; 2436 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2437 err = -1; 2438 goto out; 2439 } 2440 } else { 2441 void *vmctx = sc->nsc_pi->pi_vmctx; 2442 uint64_t *prp_list = &prp2; 2443 uint64_t *last = prp_list; 2444 2445 /* PRP2 is pointer to a physical region page list */ 2446 while (bytes) { 2447 /* Last entry in list points to the next list */ 2448 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2449 uint64_t prp = *prp_list; 2450 2451 prp_list = paddr_guest2host(vmctx, prp, 2452 PAGE_SIZE - (prp % PAGE_SIZE)); 2453 if (prp_list == NULL) { 2454 err = -1; 2455 goto out; 2456 } 2457 last = prp_list + (NVME_PRP2_ITEMS - 1); 2458 } 2459 2460 size = MIN(bytes, PAGE_SIZE); 2461 2462 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2463 offset)) { 2464 err = -1; 2465 goto out; 2466 } 2467 2468 offset += size; 2469 bytes -= size; 2470 2471 prp_list++; 2472 } 2473 } 2474 req->io_req.br_callback = pci_nvme_io_done; 2475 if (is_write) 2476 err = blockif_write(nvstore->ctx, &req->io_req); 2477 else 2478 err = blockif_read(nvstore->ctx, &req->io_req); 2479 out: 2480 if (err) 2481 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2482 2483 return (status); 2484 } 2485 2486 static bool 2487 nvme_opc_write_read(struct pci_nvme_softc *sc, 2488 struct nvme_command *cmd, 2489 struct pci_nvme_blockstore *nvstore, 2490 struct pci_nvme_ioreq *req, 2491 uint16_t *status) 2492 { 2493 uint64_t lba, nblocks, bytes; 2494 size_t offset; 2495 bool is_write = cmd->opc == NVME_OPC_WRITE; 2496 bool pending = false; 2497 2498 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2499 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2500 bytes = nblocks << nvstore->sectsz_bits; 2501 if (bytes > NVME_MAX_DATA_SIZE) { 2502 WPRINTF("%s command would exceed MDTS", __func__); 2503 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2504 goto out; 2505 } 2506 2507 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2508 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2509 __func__, lba, nblocks); 2510 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2511 goto out; 2512 } 2513 2514 offset = lba << nvstore->sectsz_bits; 2515 2516 req->bytes = bytes; 2517 req->io_req.br_offset = lba; 2518 2519 /* PRP bits 1:0 must be zero */ 2520 cmd->prp1 &= ~0x3UL; 2521 cmd->prp2 &= ~0x3UL; 2522 2523 if (nvstore->type == NVME_STOR_RAM) { 2524 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2525 cmd->prp2, offset, bytes, is_write); 2526 } else { 2527 *status = nvme_write_read_blockif(sc, nvstore, req, 2528 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2529 2530 if (*status == NVME_NO_STATUS) 2531 pending = true; 2532 } 2533 out: 2534 if (!pending) 2535 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2536 2537 return (pending); 2538 } 2539 2540 static void 2541 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2542 { 2543 struct pci_nvme_ioreq *req = br->br_param; 2544 struct pci_nvme_softc *sc = req->sc; 2545 bool done = true; 2546 uint16_t status; 2547 2548 status = 0; 2549 if (err) { 2550 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2551 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2552 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2553 } else { 2554 struct iovec *iov = req->io_req.br_iov; 2555 2556 req->prev_gpaddr++; 2557 iov += req->prev_gpaddr; 2558 2559 /* The iov_* values already include the sector size */ 2560 req->io_req.br_offset = (off_t)iov->iov_base; 2561 req->io_req.br_resid = iov->iov_len; 2562 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2563 pci_nvme_status_genc(&status, 2564 NVME_SC_INTERNAL_DEVICE_ERROR); 2565 } else 2566 done = false; 2567 } 2568 2569 if (done) { 2570 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2571 status); 2572 pci_nvme_release_ioreq(sc, req); 2573 } 2574 } 2575 2576 static bool 2577 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2578 struct nvme_command *cmd, 2579 struct pci_nvme_blockstore *nvstore, 2580 struct pci_nvme_ioreq *req, 2581 uint16_t *status) 2582 { 2583 struct nvme_dsm_range *range = NULL; 2584 uint32_t nr, r, non_zero, dr; 2585 int err; 2586 bool pending = false; 2587 2588 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2589 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2590 goto out; 2591 } 
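	/*
	 * Descriptive note: Number of Ranges (NR) in CDW10[7:0] is a
	 * zero-based count, so the loops below run r = 0 .. nr inclusive,
	 * i.e. nr + 1 range entries are validated and, for Deallocate,
	 * queued.
	 */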
2592 2593 nr = cmd->cdw10 & 0xff; 2594 2595 /* copy locally because a range entry could straddle PRPs */ 2596 range = calloc(1, NVME_MAX_DSM_TRIM); 2597 if (range == NULL) { 2598 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2599 goto out; 2600 } 2601 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2602 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2603 2604 /* Check for invalid ranges and the number of non-zero lengths */ 2605 non_zero = 0; 2606 for (r = 0; r <= nr; r++) { 2607 if (pci_nvme_out_of_range(nvstore, 2608 range[r].starting_lba, range[r].length)) { 2609 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2610 goto out; 2611 } 2612 if (range[r].length != 0) 2613 non_zero++; 2614 } 2615 2616 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2617 size_t offset, bytes; 2618 int sectsz_bits = sc->nvstore.sectsz_bits; 2619 2620 /* 2621 * DSM calls are advisory only, and compliant controllers 2622 * may choose to take no actions (i.e. return Success). 2623 */ 2624 if (!nvstore->deallocate) { 2625 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2626 goto out; 2627 } 2628 2629 /* If all ranges have a zero length, return Success */ 2630 if (non_zero == 0) { 2631 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2632 goto out; 2633 } 2634 2635 if (req == NULL) { 2636 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2637 goto out; 2638 } 2639 2640 offset = range[0].starting_lba << sectsz_bits; 2641 bytes = range[0].length << sectsz_bits; 2642 2643 /* 2644 * If the request is for more than a single range, store 2645 * the ranges in the br_iov. Optimize for the common case 2646 * of a single range. 2647 * 2648 * Note that NVMe Number of Ranges is a zero based value 2649 */ 2650 req->io_req.br_iovcnt = 0; 2651 req->io_req.br_offset = offset; 2652 req->io_req.br_resid = bytes; 2653 2654 if (nr == 0) { 2655 req->io_req.br_callback = pci_nvme_io_done; 2656 } else { 2657 struct iovec *iov = req->io_req.br_iov; 2658 2659 for (r = 0, dr = 0; r <= nr; r++) { 2660 offset = range[r].starting_lba << sectsz_bits; 2661 bytes = range[r].length << sectsz_bits; 2662 if (bytes == 0) 2663 continue; 2664 2665 if ((nvstore->size - offset) < bytes) { 2666 pci_nvme_status_genc(status, 2667 NVME_SC_LBA_OUT_OF_RANGE); 2668 goto out; 2669 } 2670 iov[dr].iov_base = (void *)offset; 2671 iov[dr].iov_len = bytes; 2672 dr++; 2673 } 2674 req->io_req.br_callback = pci_nvme_dealloc_sm; 2675 2676 /* 2677 * Use prev_gpaddr to track the current entry and 2678 * prev_size to track the number of entries 2679 */ 2680 req->prev_gpaddr = 0; 2681 req->prev_size = dr; 2682 } 2683 2684 err = blockif_delete(nvstore->ctx, &req->io_req); 2685 if (err) 2686 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2687 else 2688 pending = true; 2689 } 2690 out: 2691 free(range); 2692 return (pending); 2693 } 2694 2695 static void 2696 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2697 { 2698 struct nvme_submission_queue *sq; 2699 uint16_t status; 2700 uint16_t sqhead; 2701 2702 /* handle all submissions up to sq->tail index */ 2703 sq = &sc->submit_queues[idx]; 2704 2705 pthread_mutex_lock(&sq->mtx); 2706 2707 sqhead = sq->head; 2708 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2709 idx, sqhead, sq->tail, sq->qbase); 2710 2711 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2712 struct nvme_command *cmd; 2713 struct pci_nvme_ioreq *req; 2714 uint32_t nsid; 2715 bool pending; 2716 2717 pending = false; 2718 req = NULL; 2719 status = 0; 2720 2721 cmd = 
&sq->qbase[sqhead]; 2722 sqhead = (sqhead + 1) % sq->size; 2723 2724 nsid = le32toh(cmd->nsid); 2725 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2726 pci_nvme_status_genc(&status, 2727 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2728 status |= 2729 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2730 goto complete; 2731 } 2732 2733 req = pci_nvme_get_ioreq(sc); 2734 if (req == NULL) { 2735 pci_nvme_status_genc(&status, 2736 NVME_SC_INTERNAL_DEVICE_ERROR); 2737 WPRINTF("%s: unable to allocate IO req", __func__); 2738 goto complete; 2739 } 2740 req->nvme_sq = sq; 2741 req->sqid = idx; 2742 req->opc = cmd->opc; 2743 req->cid = cmd->cid; 2744 req->nsid = cmd->nsid; 2745 2746 switch (cmd->opc) { 2747 case NVME_OPC_FLUSH: 2748 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2749 req, &status); 2750 break; 2751 case NVME_OPC_WRITE: 2752 case NVME_OPC_READ: 2753 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2754 req, &status); 2755 break; 2756 case NVME_OPC_WRITE_ZEROES: 2757 /* TODO: write zeroes 2758 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2759 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2760 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2761 break; 2762 case NVME_OPC_DATASET_MANAGEMENT: 2763 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2764 req, &status); 2765 break; 2766 default: 2767 WPRINTF("%s unhandled io command 0x%x", 2768 __func__, cmd->opc); 2769 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2770 } 2771 complete: 2772 if (!pending) { 2773 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2774 if (req != NULL) 2775 pci_nvme_release_ioreq(sc, req); 2776 } 2777 } 2778 2779 sq->head = sqhead; 2780 2781 pthread_mutex_unlock(&sq->mtx); 2782 } 2783 2784 static void 2785 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2786 uint64_t idx, int is_sq, uint64_t value) 2787 { 2788 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2789 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2790 2791 if (is_sq) { 2792 if (idx > sc->num_squeues) { 2793 WPRINTF("%s queue index %lu overflow from " 2794 "guest (max %u)", 2795 __func__, idx, sc->num_squeues); 2796 return; 2797 } 2798 2799 atomic_store_short(&sc->submit_queues[idx].tail, 2800 (uint16_t)value); 2801 2802 if (idx == 0) { 2803 pci_nvme_handle_admin_cmd(sc, value); 2804 } else { 2805 /* submission queue; handle new entries in SQ */ 2806 if (idx > sc->num_squeues) { 2807 WPRINTF("%s SQ index %lu overflow from " 2808 "guest (max %u)", 2809 __func__, idx, sc->num_squeues); 2810 return; 2811 } 2812 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2813 } 2814 } else { 2815 if (idx > sc->num_cqueues) { 2816 WPRINTF("%s queue index %lu overflow from " 2817 "guest (max %u)", 2818 __func__, idx, sc->num_cqueues); 2819 return; 2820 } 2821 2822 atomic_store_short(&sc->compl_queues[idx].head, 2823 (uint16_t)value); 2824 } 2825 } 2826 2827 static void 2828 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2829 { 2830 const char *s = iswrite ? 
"WRITE" : "READ"; 2831 2832 switch (offset) { 2833 case NVME_CR_CAP_LOW: 2834 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2835 break; 2836 case NVME_CR_CAP_HI: 2837 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2838 break; 2839 case NVME_CR_VS: 2840 DPRINTF("%s %s NVME_CR_VS", func, s); 2841 break; 2842 case NVME_CR_INTMS: 2843 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2844 break; 2845 case NVME_CR_INTMC: 2846 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2847 break; 2848 case NVME_CR_CC: 2849 DPRINTF("%s %s NVME_CR_CC", func, s); 2850 break; 2851 case NVME_CR_CSTS: 2852 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2853 break; 2854 case NVME_CR_NSSR: 2855 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2856 break; 2857 case NVME_CR_AQA: 2858 DPRINTF("%s %s NVME_CR_AQA", func, s); 2859 break; 2860 case NVME_CR_ASQ_LOW: 2861 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2862 break; 2863 case NVME_CR_ASQ_HI: 2864 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2865 break; 2866 case NVME_CR_ACQ_LOW: 2867 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2868 break; 2869 case NVME_CR_ACQ_HI: 2870 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2871 break; 2872 default: 2873 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2874 } 2875 2876 } 2877 2878 static void 2879 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2880 uint64_t value) 2881 { 2882 uint32_t ccreg; 2883 2884 if (offset >= NVME_DOORBELL_OFFSET) { 2885 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2886 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2887 int is_sq = (belloffset % 8) < 4; 2888 2889 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2890 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2891 offset); 2892 return; 2893 } 2894 2895 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2896 WPRINTF("guest attempted an overflow write offset " 2897 "0x%lx, val 0x%lx in %s", 2898 offset, value, __func__); 2899 return; 2900 } 2901 2902 if (is_sq) { 2903 if (sc->submit_queues[idx].qbase == NULL) 2904 return; 2905 } else if (sc->compl_queues[idx].qbase == NULL) 2906 return; 2907 2908 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2909 return; 2910 } 2911 2912 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2913 offset, size, value); 2914 2915 if (size != 4) { 2916 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2917 "val 0x%lx) to bar0 in %s", 2918 size, offset, value, __func__); 2919 /* TODO: shutdown device */ 2920 return; 2921 } 2922 2923 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2924 2925 pthread_mutex_lock(&sc->mtx); 2926 2927 switch (offset) { 2928 case NVME_CR_CAP_LOW: 2929 case NVME_CR_CAP_HI: 2930 /* readonly */ 2931 break; 2932 case NVME_CR_VS: 2933 /* readonly */ 2934 break; 2935 case NVME_CR_INTMS: 2936 /* MSI-X, so ignore */ 2937 break; 2938 case NVME_CR_INTMC: 2939 /* MSI-X, so ignore */ 2940 break; 2941 case NVME_CR_CC: 2942 ccreg = (uint32_t)value; 2943 2944 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2945 "iocqes %u", 2946 __func__, 2947 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2948 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2949 NVME_CC_GET_IOCQES(ccreg)); 2950 2951 if (NVME_CC_GET_SHN(ccreg)) { 2952 /* perform shutdown - flush out data to backend */ 2953 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2954 NVME_CSTS_REG_SHST_SHIFT); 2955 sc->regs.csts |= NVME_SHST_COMPLETE << 2956 NVME_CSTS_REG_SHST_SHIFT; 2957 } 2958 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2959 if (NVME_CC_GET_EN(ccreg) == 0) 2960 /* transition 1-> causes controller reset */ 2961 
pci_nvme_reset_locked(sc); 2962 else 2963 pci_nvme_init_controller(sc); 2964 } 2965 2966 /* Insert the iocqes, iosqes and en bits from the write */ 2967 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2968 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2969 if (NVME_CC_GET_EN(ccreg) == 0) { 2970 /* Insert the ams, mps and css bit fields */ 2971 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2972 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2973 sc->regs.csts &= ~NVME_CSTS_RDY; 2974 } else if ((sc->pending_ios == 0) && 2975 !(sc->regs.csts & NVME_CSTS_CFS)) { 2976 sc->regs.csts |= NVME_CSTS_RDY; 2977 } 2978 break; 2979 case NVME_CR_CSTS: 2980 break; 2981 case NVME_CR_NSSR: 2982 /* ignore writes; don't support subsystem reset */ 2983 break; 2984 case NVME_CR_AQA: 2985 sc->regs.aqa = (uint32_t)value; 2986 break; 2987 case NVME_CR_ASQ_LOW: 2988 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2989 (0xFFFFF000 & value); 2990 break; 2991 case NVME_CR_ASQ_HI: 2992 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2993 (value << 32); 2994 break; 2995 case NVME_CR_ACQ_LOW: 2996 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2997 (0xFFFFF000 & value); 2998 break; 2999 case NVME_CR_ACQ_HI: 3000 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3001 (value << 32); 3002 break; 3003 default: 3004 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3005 __func__, offset, value, size); 3006 } 3007 pthread_mutex_unlock(&sc->mtx); 3008 } 3009 3010 static void 3011 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3012 uint64_t value) 3013 { 3014 struct pci_nvme_softc* sc = pi->pi_arg; 3015 3016 if (baridx == pci_msix_table_bar(pi) || 3017 baridx == pci_msix_pba_bar(pi)) { 3018 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3019 " value 0x%lx", baridx, offset, size, value); 3020 3021 pci_emul_msix_twrite(pi, offset, size, value); 3022 return; 3023 } 3024 3025 switch (baridx) { 3026 case 0: 3027 pci_nvme_write_bar_0(sc, offset, size, value); 3028 break; 3029 3030 default: 3031 DPRINTF("%s unknown baridx %d, val 0x%lx", 3032 __func__, baridx, value); 3033 } 3034 } 3035 3036 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3037 uint64_t offset, int size) 3038 { 3039 uint64_t value; 3040 3041 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3042 3043 if (offset < NVME_DOORBELL_OFFSET) { 3044 void *p = &(sc->regs); 3045 pthread_mutex_lock(&sc->mtx); 3046 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3047 pthread_mutex_unlock(&sc->mtx); 3048 } else { 3049 value = 0; 3050 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3051 } 3052 3053 switch (size) { 3054 case 1: 3055 value &= 0xFF; 3056 break; 3057 case 2: 3058 value &= 0xFFFF; 3059 break; 3060 case 4: 3061 value &= 0xFFFFFFFF; 3062 break; 3063 } 3064 3065 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3066 offset, size, (uint32_t)value); 3067 3068 return (value); 3069 } 3070 3071 3072 3073 static uint64_t 3074 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3075 { 3076 struct pci_nvme_softc* sc = pi->pi_arg; 3077 3078 if (baridx == pci_msix_table_bar(pi) || 3079 baridx == pci_msix_pba_bar(pi)) { 3080 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3081 baridx, offset, size); 3082 3083 return pci_emul_msix_tread(pi, offset, size); 3084 } 3085 3086 switch (baridx) { 3087 case 0: 3088 return pci_nvme_read_bar_0(sc, offset, size); 3089 3090 default: 3091 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3092 } 3093 3094 return (0); 3095 } 3096 
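/*
 * Configuration parsing for the emulated controller follows.  As an
 * illustrative sketch only (the values below are hypothetical, not
 * defaults), a device configured with
 *
 *     maxq=8,qsz=1024,ioslots=32,sectsz=4096,ser=EXAMPLE01,dsm=auto
 *
 * would have those settings picked up from the per-device nvlist by
 * pci_nvme_parse_config(), which also recognizes "eui64", "bootindex",
 * and either "ram" (size in MiB of a RAM-backed namespace) or a blockif
 * backend.  Out-of-range values are clamped to the compiled-in NVME_*
 * defaults or cause the parse to fail, as handled below.
 */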
3097 static int 3098 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3099 { 3100 char bident[sizeof("XXX:XXX")]; 3101 const char *value; 3102 uint32_t sectsz; 3103 3104 sc->max_queues = NVME_QUEUES; 3105 sc->max_qentries = NVME_MAX_QENTRIES; 3106 sc->ioslots = NVME_IOSLOTS; 3107 sc->num_squeues = sc->max_queues; 3108 sc->num_cqueues = sc->max_queues; 3109 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3110 sectsz = 0; 3111 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3112 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3113 3114 value = get_config_value_node(nvl, "maxq"); 3115 if (value != NULL) 3116 sc->max_queues = atoi(value); 3117 value = get_config_value_node(nvl, "qsz"); 3118 if (value != NULL) { 3119 sc->max_qentries = atoi(value); 3120 if (sc->max_qentries <= 0) { 3121 EPRINTLN("nvme: Invalid qsz option %d", 3122 sc->max_qentries); 3123 return (-1); 3124 } 3125 } 3126 value = get_config_value_node(nvl, "ioslots"); 3127 if (value != NULL) { 3128 sc->ioslots = atoi(value); 3129 if (sc->ioslots <= 0) { 3130 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3131 return (-1); 3132 } 3133 } 3134 value = get_config_value_node(nvl, "sectsz"); 3135 if (value != NULL) 3136 sectsz = atoi(value); 3137 value = get_config_value_node(nvl, "ser"); 3138 if (value != NULL) { 3139 /* 3140 * This field indicates the Product Serial Number in 3141 * 7-bit ASCII, unused bytes should be space characters. 3142 * Ref: NVMe v1.3c. 3143 */ 3144 cpywithpad((char *)sc->ctrldata.sn, 3145 sizeof(sc->ctrldata.sn), value, ' '); 3146 } 3147 value = get_config_value_node(nvl, "eui64"); 3148 if (value != NULL) 3149 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3150 value = get_config_value_node(nvl, "dsm"); 3151 if (value != NULL) { 3152 if (strcmp(value, "auto") == 0) 3153 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3154 else if (strcmp(value, "enable") == 0) 3155 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3156 else if (strcmp(value, "disable") == 0) 3157 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3158 } 3159 3160 value = get_config_value_node(nvl, "bootindex"); 3161 if (value != NULL) { 3162 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3163 EPRINTLN("Invalid bootindex %d", atoi(value)); 3164 return (-1); 3165 } 3166 } 3167 3168 value = get_config_value_node(nvl, "ram"); 3169 if (value != NULL) { 3170 uint64_t sz = strtoull(value, NULL, 10); 3171 3172 sc->nvstore.type = NVME_STOR_RAM; 3173 sc->nvstore.size = sz * 1024 * 1024; 3174 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3175 sc->nvstore.sectsz = 4096; 3176 sc->nvstore.sectsz_bits = 12; 3177 if (sc->nvstore.ctx == NULL) { 3178 EPRINTLN("nvme: Unable to allocate RAM"); 3179 return (-1); 3180 } 3181 } else { 3182 snprintf(bident, sizeof(bident), "%u:%u", 3183 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3184 sc->nvstore.ctx = blockif_open(nvl, bident); 3185 if (sc->nvstore.ctx == NULL) { 3186 EPRINTLN("nvme: Could not open backing file: %s", 3187 strerror(errno)); 3188 return (-1); 3189 } 3190 sc->nvstore.type = NVME_STOR_BLOCKIF; 3191 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3192 } 3193 3194 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3195 sc->nvstore.sectsz = sectsz; 3196 else if (sc->nvstore.type != NVME_STOR_RAM) 3197 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3198 for (sc->nvstore.sectsz_bits = 9; 3199 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3200 sc->nvstore.sectsz_bits++); 3201 3202 if (sc->max_queues <= 0 || 
sc->max_queues > NVME_QUEUES) 3203 sc->max_queues = NVME_QUEUES; 3204 3205 return (0); 3206 } 3207 3208 static void 3209 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3210 size_t new_size) 3211 { 3212 struct pci_nvme_softc *sc; 3213 struct pci_nvme_blockstore *nvstore; 3214 struct nvme_namespace_data *nd; 3215 3216 sc = arg; 3217 nvstore = &sc->nvstore; 3218 nd = &sc->nsdata; 3219 3220 nvstore->size = new_size; 3221 pci_nvme_init_nsdata_size(nvstore, nd); 3222 3223 /* Add changed NSID to list */ 3224 sc->ns_log.ns[0] = 1; 3225 sc->ns_log.ns[1] = 0; 3226 3227 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3228 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3229 } 3230 3231 static int 3232 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3233 { 3234 struct pci_nvme_softc *sc; 3235 uint32_t pci_membar_sz; 3236 int error; 3237 3238 error = 0; 3239 3240 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3241 pi->pi_arg = sc; 3242 sc->nsc_pi = pi; 3243 3244 error = pci_nvme_parse_config(sc, nvl); 3245 if (error < 0) 3246 goto done; 3247 else 3248 error = 0; 3249 3250 STAILQ_INIT(&sc->ioreqs_free); 3251 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3252 for (uint32_t i = 0; i < sc->ioslots; i++) { 3253 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3254 } 3255 3256 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3257 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3258 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3259 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3260 pci_set_cfgdata8(pi, PCIR_PROGIF, 3261 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3262 3263 /* 3264 * Allocate size of NVMe registers + doorbell space for all queues. 3265 * 3266 * The specification requires a minimum memory I/O window size of 16K. 3267 * The Windows driver will refuse to start a device with a smaller 3268 * window. 3269 */ 3270 pci_membar_sz = sizeof(struct nvme_registers) + 3271 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3272 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3273 3274 DPRINTF("nvme membar size: %u", pci_membar_sz); 3275 3276 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3277 if (error) { 3278 WPRINTF("%s pci alloc mem bar failed", __func__); 3279 goto done; 3280 } 3281 3282 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3283 if (error) { 3284 WPRINTF("%s pci add msixcap failed", __func__); 3285 goto done; 3286 } 3287 3288 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3289 if (error) { 3290 WPRINTF("%s pci add Express capability failed", __func__); 3291 goto done; 3292 } 3293 3294 pthread_mutex_init(&sc->mtx, NULL); 3295 sem_init(&sc->iosemlock, 0, sc->ioslots); 3296 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3297 3298 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3299 /* 3300 * Controller data depends on Namespace data so initialize Namespace 3301 * data first. 
3302 */ 3303 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3304 pci_nvme_init_ctrldata(sc); 3305 pci_nvme_init_logpages(sc); 3306 pci_nvme_init_features(sc); 3307 3308 pci_nvme_aer_init(sc); 3309 pci_nvme_aen_init(sc); 3310 3311 pci_nvme_reset(sc); 3312 3313 done: 3314 return (error); 3315 } 3316 3317 static int 3318 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3319 { 3320 char *cp, *ram; 3321 3322 if (opts == NULL) 3323 return (0); 3324 3325 if (strncmp(opts, "ram=", 4) == 0) { 3326 cp = strchr(opts, ','); 3327 if (cp == NULL) { 3328 set_config_value_node(nvl, "ram", opts + 4); 3329 return (0); 3330 } 3331 ram = strndup(opts + 4, cp - opts - 4); 3332 set_config_value_node(nvl, "ram", ram); 3333 free(ram); 3334 return (pci_parse_legacy_config(nvl, cp + 1)); 3335 } else 3336 return (blockif_legacy_config(nvl, opts)); 3337 } 3338 3339 static const struct pci_devemu pci_de_nvme = { 3340 .pe_emu = "nvme", 3341 .pe_init = pci_nvme_init, 3342 .pe_legacy_config = pci_nvme_legacy_config, 3343 .pe_barwrite = pci_nvme_write, 3344 .pe_barread = pci_nvme_read 3345 }; 3346 PCI_EMUL_SET(pci_de_nvme); 3347
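/*
 * Datapath summary (descriptive only; see the handlers above):
 *
 *   guest doorbell write
 *     -> pci_nvme_write() / pci_nvme_write_bar_0()
 *     -> pci_nvme_handle_doorbell()
 *     -> pci_nvme_handle_admin_cmd() for queue 0, or
 *        pci_nvme_handle_io_cmd() for I/O submission queues
 *     -> blockif_read()/blockif_write()/blockif_flush()/blockif_delete()
 *        with pci_nvme_io_done() as the completion callback
 *     -> pci_nvme_set_completion() posts the CQ entry and raises the
 *        queue's MSI-X vector when NVME_CQ_INTEN is set.
 *
 * RAM-backed namespaces (NVME_STOR_RAM) bypass blockif and complete
 * synchronously via nvme_write_read_ram().
 */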