/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = Dataset Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */
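/*
 * Illustrative invocations only (the values below are hypothetical and not
 * taken from this source): a file-backed namespace with explicit queue
 * sizing might be configured as
 *   -s 3,nvme,/path/to/disk.img,maxq=4,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001
 * and a RAM-backed namespace as
 *   -s 3,nvme,ram=1024,ser=NVME0002
 */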
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/crc16.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one)  - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
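/*
 * Worked example of the sizing above (illustrative only): with NVME_MDTS of 9
 * and NVME_MPSMIN of 0 (4 KiB pages), the largest advertised transfer is
 * 2^9 * 4 KiB = 2 MiB, which needs up to 512 page descriptors plus one extra
 * for an unaligned first descriptor, i.e. NVME_MAX_IOVEC = 513. If blockif
 * supplies, say, 128 iovecs by default, MDTS_PAD_SIZE adds the remaining 385.
 */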

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT		8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *	SMART / Health Critical Warnings
 *	Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f
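/*
 * For reference (derived from the bit layout above): the low bits (0x1f) of
 * the default mask enable the SMART/Health critical warning events, which
 * are compared against the health log's critical_warning field, while bit 8
 * (0x100, i.e. PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED shifted by
 * PCI_NVME_AEI_NOTICE_SHIFT) enables Namespace Attribute Change notices.
 */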

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
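/*
 * Note on the "+ 1" sizing above (explanatory, not functional): the queue
 * arrays are allocated with one extra slot so that index 0 always holds the
 * Admin SQ/CQ while indices 1..nsq (or 1..ncq) hold the I/O queues. For
 * example, a request for 16 I/O queues results in calloc(17, ...).
 */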

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t	atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
		uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype,	/* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
	size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
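/*
 * Illustration of the PRP handling above (example values, not from the
 * source): copying a 512-byte structure whose first PRP entry points at
 * guest-physical offset 0xf80 within a 4 KiB page copies
 * PAGE_SIZE - 0xf80 = 128 bytes via prp1 and the remaining 384 bytes via
 * prp2; transfers larger than 8 KiB are rejected before any mapping.
 */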

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
		struct nvme_completion_queue *cq,
		uint32_t cdw0,
		uint16_t cid,
		uint16_t sqid,
		uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
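/*
 * Worked example of the NUMD decoding above (illustrative only): to fetch a
 * 512-byte log page such as SMART / Health Information, a host sets
 * NUMDL/NUMDU so that the zero-based dword count is 127 (0x7f); the code
 * then computes logsize = (127 + 1) * sizeof(uint32_t) = 512 bytes.
 */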

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}
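/*
 * Worked example of the Number of Queues handling above (illustrative
 * values): a host requesting 8 I/O Submission and 8 I/O Completion Queues
 * sends CDW11 = 0x00070007 (both counts are zero-based). If max_queues
 * permits, the controller keeps 8/8 and returns the same zero-based counts
 * in CDW0 via NVME_FEATURE_NUM_QUEUES().
 */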

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
	struct nvme_completion *compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
sc->ctrldata.aerl, command->cid); 1982 1983 /* Don't exceed the Async Event Request Limit (AERL). */ 1984 if (pci_nvme_aer_limit_reached(sc)) { 1985 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1986 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1987 return (1); 1988 } 1989 1990 if (pci_nvme_aer_add(sc, command->cid)) { 1991 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1992 NVME_SC_INTERNAL_DEVICE_ERROR); 1993 return (1); 1994 } 1995 1996 /* 1997 * Raise events when they happen based on the Set Features cmd. 1998 * These events happen async, so only set completion successful if 1999 * there is an event reflective of the request to get event. 2000 */ 2001 compl->status = NVME_NO_STATUS; 2002 pci_nvme_aen_notify(sc); 2003 2004 return (0); 2005 } 2006 2007 static void 2008 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2009 { 2010 struct nvme_completion compl; 2011 struct nvme_command *cmd; 2012 struct nvme_submission_queue *sq; 2013 struct nvme_completion_queue *cq; 2014 uint16_t sqhead; 2015 2016 DPRINTF("%s index %u", __func__, (uint32_t)value); 2017 2018 sq = &sc->submit_queues[0]; 2019 cq = &sc->compl_queues[0]; 2020 2021 pthread_mutex_lock(&sq->mtx); 2022 2023 sqhead = sq->head; 2024 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2025 2026 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2027 cmd = &(sq->qbase)[sqhead]; 2028 compl.cdw0 = 0; 2029 compl.status = 0; 2030 2031 switch (cmd->opc) { 2032 case NVME_OPC_DELETE_IO_SQ: 2033 DPRINTF("%s command DELETE_IO_SQ", __func__); 2034 nvme_opc_delete_io_sq(sc, cmd, &compl); 2035 break; 2036 case NVME_OPC_CREATE_IO_SQ: 2037 DPRINTF("%s command CREATE_IO_SQ", __func__); 2038 nvme_opc_create_io_sq(sc, cmd, &compl); 2039 break; 2040 case NVME_OPC_DELETE_IO_CQ: 2041 DPRINTF("%s command DELETE_IO_CQ", __func__); 2042 nvme_opc_delete_io_cq(sc, cmd, &compl); 2043 break; 2044 case NVME_OPC_CREATE_IO_CQ: 2045 DPRINTF("%s command CREATE_IO_CQ", __func__); 2046 nvme_opc_create_io_cq(sc, cmd, &compl); 2047 break; 2048 case NVME_OPC_GET_LOG_PAGE: 2049 DPRINTF("%s command GET_LOG_PAGE", __func__); 2050 nvme_opc_get_log_page(sc, cmd, &compl); 2051 break; 2052 case NVME_OPC_IDENTIFY: 2053 DPRINTF("%s command IDENTIFY", __func__); 2054 nvme_opc_identify(sc, cmd, &compl); 2055 break; 2056 case NVME_OPC_ABORT: 2057 DPRINTF("%s command ABORT", __func__); 2058 nvme_opc_abort(sc, cmd, &compl); 2059 break; 2060 case NVME_OPC_SET_FEATURES: 2061 DPRINTF("%s command SET_FEATURES", __func__); 2062 nvme_opc_set_features(sc, cmd, &compl); 2063 break; 2064 case NVME_OPC_GET_FEATURES: 2065 DPRINTF("%s command GET_FEATURES", __func__); 2066 nvme_opc_get_features(sc, cmd, &compl); 2067 break; 2068 case NVME_OPC_FIRMWARE_ACTIVATE: 2069 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2070 pci_nvme_status_tc(&compl.status, 2071 NVME_SCT_COMMAND_SPECIFIC, 2072 NVME_SC_INVALID_FIRMWARE_SLOT); 2073 break; 2074 case NVME_OPC_ASYNC_EVENT_REQUEST: 2075 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2076 nvme_opc_async_event_req(sc, cmd, &compl); 2077 break; 2078 case NVME_OPC_FORMAT_NVM: 2079 DPRINTF("%s command FORMAT_NVM", __func__); 2080 if ((sc->ctrldata.oacs & 2081 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2082 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2083 break; 2084 } 2085 nvme_opc_format_nvm(sc, cmd, &compl); 2086 break; 2087 case NVME_OPC_SECURITY_SEND: 2088 case NVME_OPC_SECURITY_RECEIVE: 2089 case NVME_OPC_SANITIZE: 2090 case NVME_OPC_GET_LBA_STATUS: 2091 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2092 cmd->opc); 2093 /* Valid but unsupported opcodes */ 2094 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2095 break; 2096 default: 2097 DPRINTF("%s command OPC=%#X (not implemented)", 2098 __func__, 2099 cmd->opc); 2100 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2101 } 2102 sqhead = (sqhead + 1) % sq->size; 2103 2104 if (NVME_COMPLETION_VALID(compl)) { 2105 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2106 compl.cdw0, 2107 cmd->cid, 2108 0, /* SQID */ 2109 compl.status); 2110 } 2111 } 2112 2113 DPRINTF("setting sqhead %u", sqhead); 2114 sq->head = sqhead; 2115 2116 if (cq->head != cq->tail) 2117 pci_generate_msix(sc->nsc_pi, 0); 2118 2119 pthread_mutex_unlock(&sq->mtx); 2120 } 2121 2122 /* 2123 * Update the Write and Read statistics reported in SMART data 2124 * 2125 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2126 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2127 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2128 */ 2129 static void 2130 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2131 size_t bytes, uint16_t status) 2132 { 2133 2134 pthread_mutex_lock(&sc->mtx); 2135 switch (opc) { 2136 case NVME_OPC_WRITE: 2137 sc->write_commands++; 2138 if (status != NVME_SC_SUCCESS) 2139 break; 2140 sc->write_dunits_remainder += (bytes / 512); 2141 while (sc->write_dunits_remainder >= 1000) { 2142 sc->write_data_units++; 2143 sc->write_dunits_remainder -= 1000; 2144 } 2145 break; 2146 case NVME_OPC_READ: 2147 sc->read_commands++; 2148 if (status != NVME_SC_SUCCESS) 2149 break; 2150 sc->read_dunits_remainder += (bytes / 512); 2151 while (sc->read_dunits_remainder >= 1000) { 2152 sc->read_data_units++; 2153 sc->read_dunits_remainder -= 1000; 2154 } 2155 break; 2156 default: 2157 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2158 break; 2159 } 2160 pthread_mutex_unlock(&sc->mtx); 2161 } 2162 2163 /* 2164 * Check if the combination of Starting LBA (slba) and number of blocks 2165 * exceeds the range of the underlying storage. 2166 * 2167 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2168 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2169 * overflow. 2170 */ 2171 static bool 2172 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2173 uint32_t nblocks) 2174 { 2175 size_t offset, bytes; 2176 2177 /* Overflow check of multiplying Starting LBA by the sector size */ 2178 if (slba >> (64 - nvstore->sectsz_bits)) 2179 return (true); 2180 2181 offset = slba << nvstore->sectsz_bits; 2182 bytes = nblocks << nvstore->sectsz_bits; 2183 2184 /* Overflow check of Number of Logical Blocks */ 2185 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2186 return (true); 2187 2188 return (false); 2189 } 2190 2191 static int 2192 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2193 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2194 { 2195 int iovidx; 2196 bool range_is_contiguous; 2197 2198 if (req == NULL) 2199 return (-1); 2200 2201 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2202 return (-1); 2203 } 2204 2205 /* 2206 * Minimize the number of IOVs by concatenating contiguous address 2207 * ranges. If the IOV count is zero, there is no previous range to 2208 * concatenate. 
2209 */ 2210 if (req->io_req.br_iovcnt == 0) 2211 range_is_contiguous = false; 2212 else 2213 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2214 2215 if (range_is_contiguous) { 2216 iovidx = req->io_req.br_iovcnt - 1; 2217 2218 req->io_req.br_iov[iovidx].iov_base = 2219 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2220 req->prev_gpaddr, size); 2221 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2222 return (-1); 2223 2224 req->prev_size += size; 2225 req->io_req.br_resid += size; 2226 2227 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2228 } else { 2229 iovidx = req->io_req.br_iovcnt; 2230 if (iovidx == 0) { 2231 req->io_req.br_offset = offset; 2232 req->io_req.br_resid = 0; 2233 req->io_req.br_param = req; 2234 } 2235 2236 req->io_req.br_iov[iovidx].iov_base = 2237 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2238 gpaddr, size); 2239 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2240 return (-1); 2241 2242 req->io_req.br_iov[iovidx].iov_len = size; 2243 2244 req->prev_gpaddr = gpaddr; 2245 req->prev_size = size; 2246 req->io_req.br_resid += size; 2247 2248 req->io_req.br_iovcnt++; 2249 } 2250 2251 return (0); 2252 } 2253 2254 static void 2255 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2256 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2257 { 2258 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2259 2260 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2261 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2262 NVME_STATUS_GET_SC(status)); 2263 2264 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2265 2266 if (cq->head != cq->tail) { 2267 if (cq->intr_en & NVME_CQ_INTEN) { 2268 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2269 } else { 2270 DPRINTF("%s: CQ%u interrupt disabled", 2271 __func__, sq->cqid); 2272 } 2273 } 2274 } 2275 2276 static void 2277 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2278 { 2279 req->sc = NULL; 2280 req->nvme_sq = NULL; 2281 req->sqid = 0; 2282 2283 pthread_mutex_lock(&sc->mtx); 2284 2285 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2286 sc->pending_ios--; 2287 2288 /* when no more IO pending, can set to ready if device reset/enabled */ 2289 if (sc->pending_ios == 0 && 2290 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2291 sc->regs.csts |= NVME_CSTS_RDY; 2292 2293 pthread_mutex_unlock(&sc->mtx); 2294 2295 sem_post(&sc->iosemlock); 2296 } 2297 2298 static struct pci_nvme_ioreq * 2299 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2300 { 2301 struct pci_nvme_ioreq *req = NULL; 2302 2303 sem_wait(&sc->iosemlock); 2304 pthread_mutex_lock(&sc->mtx); 2305 2306 req = STAILQ_FIRST(&sc->ioreqs_free); 2307 assert(req != NULL); 2308 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2309 2310 req->sc = sc; 2311 2312 sc->pending_ios++; 2313 2314 pthread_mutex_unlock(&sc->mtx); 2315 2316 req->io_req.br_iovcnt = 0; 2317 req->io_req.br_offset = 0; 2318 req->io_req.br_resid = 0; 2319 req->io_req.br_param = req; 2320 req->prev_gpaddr = 0; 2321 req->prev_size = 0; 2322 2323 return req; 2324 } 2325 2326 static void 2327 pci_nvme_io_done(struct blockif_req *br, int err) 2328 { 2329 struct pci_nvme_ioreq *req = br->br_param; 2330 struct nvme_submission_queue *sq = req->nvme_sq; 2331 uint16_t code, status; 2332 2333 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2334 2335 /* TODO return correct error */ 2336 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2337 status = 0; 2338 pci_nvme_status_genc(&status, code); 2339 2340 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2341 pci_nvme_stats_write_read_update(req->sc, req->opc, 2342 req->bytes, status); 2343 pci_nvme_release_ioreq(req->sc, req); 2344 } 2345 2346 /* 2347 * Implements the Flush command. The specification states: 2348 * If a volatile write cache is not present, Flush commands complete 2349 * successfully and have no effect 2350 * in the description of the Volatile Write Cache (VWC) field of the Identify 2351 * Controller data. Therefore, set status to Success if the command is 2352 * not supported (i.e. RAM or as indicated by the blockif). 2353 */ 2354 static bool 2355 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2356 struct nvme_command *cmd __unused, 2357 struct pci_nvme_blockstore *nvstore, 2358 struct pci_nvme_ioreq *req, 2359 uint16_t *status) 2360 { 2361 bool pending = false; 2362 2363 if (nvstore->type == NVME_STOR_RAM) { 2364 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2365 } else { 2366 int err; 2367 2368 req->io_req.br_callback = pci_nvme_io_done; 2369 2370 err = blockif_flush(nvstore->ctx, &req->io_req); 2371 switch (err) { 2372 case 0: 2373 pending = true; 2374 break; 2375 case EOPNOTSUPP: 2376 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2377 break; 2378 default: 2379 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2380 } 2381 } 2382 2383 return (pending); 2384 } 2385 2386 static uint16_t 2387 nvme_write_read_ram(struct pci_nvme_softc *sc, 2388 struct pci_nvme_blockstore *nvstore, 2389 uint64_t prp1, uint64_t prp2, 2390 size_t offset, uint64_t bytes, 2391 bool is_write) 2392 { 2393 uint8_t *buf = nvstore->ctx; 2394 enum nvme_copy_dir dir; 2395 uint16_t status; 2396 2397 if (is_write) 2398 dir = NVME_COPY_TO_PRP; 2399 else 2400 dir = NVME_COPY_FROM_PRP; 2401 2402 status = 0; 2403 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2404 buf + offset, bytes, dir)) 2405 pci_nvme_status_genc(&status, 2406 NVME_SC_DATA_TRANSFER_ERROR); 2407 else 2408 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2409 2410 return (status); 2411 } 2412 2413 static uint16_t 2414 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2415 struct pci_nvme_blockstore *nvstore, 2416 struct pci_nvme_ioreq *req, 2417 uint64_t prp1, uint64_t prp2, 2418 size_t offset, uint64_t bytes, 2419 bool is_write) 2420 { 2421 uint64_t size; 2422 int err; 2423 uint16_t status = NVME_NO_STATUS; 2424 2425 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2426 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2427 err = -1; 2428 goto out; 2429 } 2430 2431 offset += size; 2432 bytes -= size; 2433 2434 if (bytes == 0) { 2435 ; 2436 } else if (bytes <= PAGE_SIZE) { 2437 size = bytes; 2438 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2439 err = -1; 2440 goto out; 2441 } 2442 } else { 2443 void *vmctx = sc->nsc_pi->pi_vmctx; 2444 uint64_t *prp_list = &prp2; 2445 uint64_t *last = prp_list; 2446 2447 /* PRP2 is pointer to a physical region page list */ 2448 while (bytes) { 2449 /* Last entry in list points to the next list */ 2450 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2451 uint64_t prp = *prp_list; 2452 2453 prp_list = paddr_guest2host(vmctx, prp, 2454 PAGE_SIZE - (prp % PAGE_SIZE)); 2455 if (prp_list == NULL) { 2456 err = -1; 2457 goto out; 2458 } 2459 last = prp_list + (NVME_PRP2_ITEMS - 1); 2460 } 2461 2462 size = MIN(bytes, PAGE_SIZE); 2463 2464 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2465 offset)) { 2466 err = -1; 2467 goto out; 2468 } 2469 2470 offset += size; 2471 bytes -= size; 2472 2473 prp_list++; 2474 } 2475 } 2476 req->io_req.br_callback = pci_nvme_io_done; 2477 if (is_write) 2478 err = blockif_write(nvstore->ctx, &req->io_req); 2479 else 2480 err = blockif_read(nvstore->ctx, &req->io_req); 2481 out: 2482 if (err) 2483 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2484 2485 return (status); 2486 } 2487 2488 static bool 2489 nvme_opc_write_read(struct pci_nvme_softc *sc, 2490 struct nvme_command *cmd, 2491 struct pci_nvme_blockstore *nvstore, 2492 struct pci_nvme_ioreq *req, 2493 uint16_t *status) 2494 { 2495 uint64_t lba, nblocks, bytes; 2496 size_t offset; 2497 bool is_write = cmd->opc == NVME_OPC_WRITE; 2498 bool pending = false; 2499 2500 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2501 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2502 bytes = nblocks << nvstore->sectsz_bits; 2503 if (bytes > NVME_MAX_DATA_SIZE) { 2504 WPRINTF("%s command would exceed MDTS", __func__); 2505 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2506 goto out; 2507 } 2508 2509 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2510 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2511 __func__, lba, nblocks); 2512 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2513 goto out; 2514 } 2515 2516 offset = lba << nvstore->sectsz_bits; 2517 2518 req->bytes = bytes; 2519 req->io_req.br_offset = lba; 2520 2521 /* PRP bits 1:0 must be zero */ 2522 cmd->prp1 &= ~0x3UL; 2523 cmd->prp2 &= ~0x3UL; 2524 2525 if (nvstore->type == NVME_STOR_RAM) { 2526 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2527 cmd->prp2, offset, bytes, is_write); 2528 } else { 2529 *status = nvme_write_read_blockif(sc, nvstore, req, 2530 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2531 2532 if (*status == NVME_NO_STATUS) 2533 pending = true; 2534 } 2535 out: 2536 if (!pending) 2537 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2538 2539 return (pending); 2540 } 2541 2542 static void 2543 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2544 { 2545 struct pci_nvme_ioreq *req = br->br_param; 2546 struct pci_nvme_softc *sc = req->sc; 2547 bool done = true; 2548 uint16_t status; 2549 2550 status = 0; 2551 if (err) { 2552 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2553 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2554 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2555 } else { 2556 struct iovec *iov = req->io_req.br_iov; 2557 2558 req->prev_gpaddr++; 2559 iov += req->prev_gpaddr; 2560 2561 /* The iov_* values already include the sector size */ 2562 req->io_req.br_offset = (off_t)iov->iov_base; 2563 req->io_req.br_resid = iov->iov_len; 2564 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2565 pci_nvme_status_genc(&status, 2566 NVME_SC_INTERNAL_DEVICE_ERROR); 2567 } else 2568 done = false; 2569 } 2570 2571 if (done) { 2572 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2573 status); 2574 pci_nvme_release_ioreq(sc, req); 2575 } 2576 } 2577 2578 static bool 2579 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2580 struct nvme_command *cmd, 2581 struct pci_nvme_blockstore *nvstore, 2582 struct pci_nvme_ioreq *req, 2583 uint16_t *status) 2584 { 2585 struct nvme_dsm_range *range = NULL; 2586 uint32_t nr, r, non_zero, dr; 2587 int err; 2588 bool pending = false; 2589 2590 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2591 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2592 goto out; 2593 } 
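	/*
	 * Illustrative note: the Number of Ranges in cdw10 is zero-based, so
	 * (cmd->cdw10 & 0xff) == 0 already describes one range, which is why
	 * the validation and deallocate loops below iterate with r <= nr.
	 * Each nvme_dsm_range entry supplies a starting LBA and a length in
	 * logical blocks, converted to byte offsets via sectsz_bits.
	 */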
2594 2595 nr = cmd->cdw10 & 0xff; 2596 2597 /* copy locally because a range entry could straddle PRPs */ 2598 range = calloc(1, NVME_MAX_DSM_TRIM); 2599 if (range == NULL) { 2600 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2601 goto out; 2602 } 2603 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2604 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2605 2606 /* Check for invalid ranges and the number of non-zero lengths */ 2607 non_zero = 0; 2608 for (r = 0; r <= nr; r++) { 2609 if (pci_nvme_out_of_range(nvstore, 2610 range[r].starting_lba, range[r].length)) { 2611 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2612 goto out; 2613 } 2614 if (range[r].length != 0) 2615 non_zero++; 2616 } 2617 2618 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2619 size_t offset, bytes; 2620 int sectsz_bits = sc->nvstore.sectsz_bits; 2621 2622 /* 2623 * DSM calls are advisory only, and compliant controllers 2624 * may choose to take no actions (i.e. return Success). 2625 */ 2626 if (!nvstore->deallocate) { 2627 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2628 goto out; 2629 } 2630 2631 /* If all ranges have a zero length, return Success */ 2632 if (non_zero == 0) { 2633 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2634 goto out; 2635 } 2636 2637 if (req == NULL) { 2638 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2639 goto out; 2640 } 2641 2642 offset = range[0].starting_lba << sectsz_bits; 2643 bytes = range[0].length << sectsz_bits; 2644 2645 /* 2646 * If the request is for more than a single range, store 2647 * the ranges in the br_iov. Optimize for the common case 2648 * of a single range. 2649 * 2650 * Note that NVMe Number of Ranges is a zero based value 2651 */ 2652 req->io_req.br_iovcnt = 0; 2653 req->io_req.br_offset = offset; 2654 req->io_req.br_resid = bytes; 2655 2656 if (nr == 0) { 2657 req->io_req.br_callback = pci_nvme_io_done; 2658 } else { 2659 struct iovec *iov = req->io_req.br_iov; 2660 2661 for (r = 0, dr = 0; r <= nr; r++) { 2662 offset = range[r].starting_lba << sectsz_bits; 2663 bytes = range[r].length << sectsz_bits; 2664 if (bytes == 0) 2665 continue; 2666 2667 if ((nvstore->size - offset) < bytes) { 2668 pci_nvme_status_genc(status, 2669 NVME_SC_LBA_OUT_OF_RANGE); 2670 goto out; 2671 } 2672 iov[dr].iov_base = (void *)offset; 2673 iov[dr].iov_len = bytes; 2674 dr++; 2675 } 2676 req->io_req.br_callback = pci_nvme_dealloc_sm; 2677 2678 /* 2679 * Use prev_gpaddr to track the current entry and 2680 * prev_size to track the number of entries 2681 */ 2682 req->prev_gpaddr = 0; 2683 req->prev_size = dr; 2684 } 2685 2686 err = blockif_delete(nvstore->ctx, &req->io_req); 2687 if (err) 2688 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2689 else 2690 pending = true; 2691 } 2692 out: 2693 free(range); 2694 return (pending); 2695 } 2696 2697 static void 2698 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2699 { 2700 struct nvme_submission_queue *sq; 2701 uint16_t status; 2702 uint16_t sqhead; 2703 2704 /* handle all submissions up to sq->tail index */ 2705 sq = &sc->submit_queues[idx]; 2706 2707 pthread_mutex_lock(&sq->mtx); 2708 2709 sqhead = sq->head; 2710 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2711 idx, sqhead, sq->tail, sq->qbase); 2712 2713 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2714 struct nvme_command *cmd; 2715 struct pci_nvme_ioreq *req; 2716 uint32_t nsid; 2717 bool pending; 2718 2719 pending = false; 2720 req = NULL; 2721 status = 0; 2722 2723 cmd = 
&sq->qbase[sqhead]; 2724 sqhead = (sqhead + 1) % sq->size; 2725 2726 nsid = le32toh(cmd->nsid); 2727 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2728 pci_nvme_status_genc(&status, 2729 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2730 status |= 2731 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2732 goto complete; 2733 } 2734 2735 req = pci_nvme_get_ioreq(sc); 2736 if (req == NULL) { 2737 pci_nvme_status_genc(&status, 2738 NVME_SC_INTERNAL_DEVICE_ERROR); 2739 WPRINTF("%s: unable to allocate IO req", __func__); 2740 goto complete; 2741 } 2742 req->nvme_sq = sq; 2743 req->sqid = idx; 2744 req->opc = cmd->opc; 2745 req->cid = cmd->cid; 2746 req->nsid = cmd->nsid; 2747 2748 switch (cmd->opc) { 2749 case NVME_OPC_FLUSH: 2750 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2751 req, &status); 2752 break; 2753 case NVME_OPC_WRITE: 2754 case NVME_OPC_READ: 2755 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2756 req, &status); 2757 break; 2758 case NVME_OPC_WRITE_ZEROES: 2759 /* TODO: write zeroes 2760 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2761 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2762 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2763 break; 2764 case NVME_OPC_DATASET_MANAGEMENT: 2765 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2766 req, &status); 2767 break; 2768 default: 2769 WPRINTF("%s unhandled io command 0x%x", 2770 __func__, cmd->opc); 2771 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2772 } 2773 complete: 2774 if (!pending) { 2775 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2776 if (req != NULL) 2777 pci_nvme_release_ioreq(sc, req); 2778 } 2779 } 2780 2781 sq->head = sqhead; 2782 2783 pthread_mutex_unlock(&sq->mtx); 2784 } 2785 2786 static void 2787 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2788 uint64_t idx, int is_sq, uint64_t value) 2789 { 2790 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2791 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2792 2793 if (is_sq) { 2794 if (idx > sc->num_squeues) { 2795 WPRINTF("%s queue index %lu overflow from " 2796 "guest (max %u)", 2797 __func__, idx, sc->num_squeues); 2798 return; 2799 } 2800 2801 atomic_store_short(&sc->submit_queues[idx].tail, 2802 (uint16_t)value); 2803 2804 if (idx == 0) { 2805 pci_nvme_handle_admin_cmd(sc, value); 2806 } else { 2807 /* submission queue; handle new entries in SQ */ 2808 if (idx > sc->num_squeues) { 2809 WPRINTF("%s SQ index %lu overflow from " 2810 "guest (max %u)", 2811 __func__, idx, sc->num_squeues); 2812 return; 2813 } 2814 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2815 } 2816 } else { 2817 if (idx > sc->num_cqueues) { 2818 WPRINTF("%s queue index %lu overflow from " 2819 "guest (max %u)", 2820 __func__, idx, sc->num_cqueues); 2821 return; 2822 } 2823 2824 atomic_store_short(&sc->compl_queues[idx].head, 2825 (uint16_t)value); 2826 } 2827 } 2828 2829 static void 2830 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2831 { 2832 const char *s = iswrite ? 
"WRITE" : "READ"; 2833 2834 switch (offset) { 2835 case NVME_CR_CAP_LOW: 2836 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2837 break; 2838 case NVME_CR_CAP_HI: 2839 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2840 break; 2841 case NVME_CR_VS: 2842 DPRINTF("%s %s NVME_CR_VS", func, s); 2843 break; 2844 case NVME_CR_INTMS: 2845 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2846 break; 2847 case NVME_CR_INTMC: 2848 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2849 break; 2850 case NVME_CR_CC: 2851 DPRINTF("%s %s NVME_CR_CC", func, s); 2852 break; 2853 case NVME_CR_CSTS: 2854 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2855 break; 2856 case NVME_CR_NSSR: 2857 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2858 break; 2859 case NVME_CR_AQA: 2860 DPRINTF("%s %s NVME_CR_AQA", func, s); 2861 break; 2862 case NVME_CR_ASQ_LOW: 2863 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2864 break; 2865 case NVME_CR_ASQ_HI: 2866 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2867 break; 2868 case NVME_CR_ACQ_LOW: 2869 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2870 break; 2871 case NVME_CR_ACQ_HI: 2872 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2873 break; 2874 default: 2875 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2876 } 2877 2878 } 2879 2880 static void 2881 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2882 uint64_t value) 2883 { 2884 uint32_t ccreg; 2885 2886 if (offset >= NVME_DOORBELL_OFFSET) { 2887 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2888 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2889 int is_sq = (belloffset % 8) < 4; 2890 2891 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2892 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2893 offset); 2894 return; 2895 } 2896 2897 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2898 WPRINTF("guest attempted an overflow write offset " 2899 "0x%lx, val 0x%lx in %s", 2900 offset, value, __func__); 2901 return; 2902 } 2903 2904 if (is_sq) { 2905 if (sc->submit_queues[idx].qbase == NULL) 2906 return; 2907 } else if (sc->compl_queues[idx].qbase == NULL) 2908 return; 2909 2910 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2911 return; 2912 } 2913 2914 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2915 offset, size, value); 2916 2917 if (size != 4) { 2918 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2919 "val 0x%lx) to bar0 in %s", 2920 size, offset, value, __func__); 2921 /* TODO: shutdown device */ 2922 return; 2923 } 2924 2925 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2926 2927 pthread_mutex_lock(&sc->mtx); 2928 2929 switch (offset) { 2930 case NVME_CR_CAP_LOW: 2931 case NVME_CR_CAP_HI: 2932 /* readonly */ 2933 break; 2934 case NVME_CR_VS: 2935 /* readonly */ 2936 break; 2937 case NVME_CR_INTMS: 2938 /* MSI-X, so ignore */ 2939 break; 2940 case NVME_CR_INTMC: 2941 /* MSI-X, so ignore */ 2942 break; 2943 case NVME_CR_CC: 2944 ccreg = (uint32_t)value; 2945 2946 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2947 "iocqes %u", 2948 __func__, 2949 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2950 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2951 NVME_CC_GET_IOCQES(ccreg)); 2952 2953 if (NVME_CC_GET_SHN(ccreg)) { 2954 /* perform shutdown - flush out data to backend */ 2955 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2956 NVME_CSTS_REG_SHST_SHIFT); 2957 sc->regs.csts |= NVME_SHST_COMPLETE << 2958 NVME_CSTS_REG_SHST_SHIFT; 2959 } 2960 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2961 if (NVME_CC_GET_EN(ccreg) == 0) 2962 /* transition 1-> causes controller reset */ 2963 
pci_nvme_reset_locked(sc); 2964 else 2965 pci_nvme_init_controller(sc); 2966 } 2967 2968 /* Insert the iocqes, iosqes and en bits from the write */ 2969 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2970 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2971 if (NVME_CC_GET_EN(ccreg) == 0) { 2972 /* Insert the ams, mps and css bit fields */ 2973 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2974 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2975 sc->regs.csts &= ~NVME_CSTS_RDY; 2976 } else if ((sc->pending_ios == 0) && 2977 !(sc->regs.csts & NVME_CSTS_CFS)) { 2978 sc->regs.csts |= NVME_CSTS_RDY; 2979 } 2980 break; 2981 case NVME_CR_CSTS: 2982 break; 2983 case NVME_CR_NSSR: 2984 /* ignore writes; don't support subsystem reset */ 2985 break; 2986 case NVME_CR_AQA: 2987 sc->regs.aqa = (uint32_t)value; 2988 break; 2989 case NVME_CR_ASQ_LOW: 2990 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2991 (0xFFFFF000 & value); 2992 break; 2993 case NVME_CR_ASQ_HI: 2994 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2995 (value << 32); 2996 break; 2997 case NVME_CR_ACQ_LOW: 2998 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2999 (0xFFFFF000 & value); 3000 break; 3001 case NVME_CR_ACQ_HI: 3002 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3003 (value << 32); 3004 break; 3005 default: 3006 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3007 __func__, offset, value, size); 3008 } 3009 pthread_mutex_unlock(&sc->mtx); 3010 } 3011 3012 static void 3013 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3014 uint64_t value) 3015 { 3016 struct pci_nvme_softc* sc = pi->pi_arg; 3017 3018 if (baridx == pci_msix_table_bar(pi) || 3019 baridx == pci_msix_pba_bar(pi)) { 3020 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3021 " value 0x%lx", baridx, offset, size, value); 3022 3023 pci_emul_msix_twrite(pi, offset, size, value); 3024 return; 3025 } 3026 3027 switch (baridx) { 3028 case 0: 3029 pci_nvme_write_bar_0(sc, offset, size, value); 3030 break; 3031 3032 default: 3033 DPRINTF("%s unknown baridx %d, val 0x%lx", 3034 __func__, baridx, value); 3035 } 3036 } 3037 3038 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3039 uint64_t offset, int size) 3040 { 3041 uint64_t value; 3042 3043 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3044 3045 if (offset < NVME_DOORBELL_OFFSET) { 3046 void *p = &(sc->regs); 3047 pthread_mutex_lock(&sc->mtx); 3048 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3049 pthread_mutex_unlock(&sc->mtx); 3050 } else { 3051 value = 0; 3052 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3053 } 3054 3055 switch (size) { 3056 case 1: 3057 value &= 0xFF; 3058 break; 3059 case 2: 3060 value &= 0xFFFF; 3061 break; 3062 case 4: 3063 value &= 0xFFFFFFFF; 3064 break; 3065 } 3066 3067 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3068 offset, size, (uint32_t)value); 3069 3070 return (value); 3071 } 3072 3073 3074 3075 static uint64_t 3076 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3077 { 3078 struct pci_nvme_softc* sc = pi->pi_arg; 3079 3080 if (baridx == pci_msix_table_bar(pi) || 3081 baridx == pci_msix_pba_bar(pi)) { 3082 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3083 baridx, offset, size); 3084 3085 return pci_emul_msix_tread(pi, offset, size); 3086 } 3087 3088 switch (baridx) { 3089 case 0: 3090 return pci_nvme_read_bar_0(sc, offset, size); 3091 3092 default: 3093 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3094 } 3095 3096 return (0); 3097 } 3098 
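/*
 * Worked example of the doorbell decode in pci_nvme_write_bar_0() above
 * (values are illustrative and assume the usual register layout in which the
 * doorbell array starts at BAR0 offset 0x1000): a 4-byte guest write to
 * offset 0x1008 yields
 *	belloffset = 0x1008 - NVME_DOORBELL_OFFSET = 0x8
 *	idx        = belloffset / 8               = 1
 *	is_sq      = (belloffset % 8) < 4         = true
 * i.e. the tail doorbell of I/O submission queue 1, which ends up in
 * pci_nvme_handle_io_cmd(sc, 1). A write to offset 0x100c decodes to the
 * head doorbell of completion queue 1 instead and only updates its head
 * index.
 */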
3099 static int 3100 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3101 { 3102 char bident[sizeof("XXX:XXX")]; 3103 const char *value; 3104 uint32_t sectsz; 3105 3106 sc->max_queues = NVME_QUEUES; 3107 sc->max_qentries = NVME_MAX_QENTRIES; 3108 sc->ioslots = NVME_IOSLOTS; 3109 sc->num_squeues = sc->max_queues; 3110 sc->num_cqueues = sc->max_queues; 3111 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3112 sectsz = 0; 3113 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3114 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3115 3116 value = get_config_value_node(nvl, "maxq"); 3117 if (value != NULL) 3118 sc->max_queues = atoi(value); 3119 value = get_config_value_node(nvl, "qsz"); 3120 if (value != NULL) { 3121 sc->max_qentries = atoi(value); 3122 if (sc->max_qentries <= 0) { 3123 EPRINTLN("nvme: Invalid qsz option %d", 3124 sc->max_qentries); 3125 return (-1); 3126 } 3127 } 3128 value = get_config_value_node(nvl, "ioslots"); 3129 if (value != NULL) { 3130 sc->ioslots = atoi(value); 3131 if (sc->ioslots <= 0) { 3132 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3133 return (-1); 3134 } 3135 } 3136 value = get_config_value_node(nvl, "sectsz"); 3137 if (value != NULL) 3138 sectsz = atoi(value); 3139 value = get_config_value_node(nvl, "ser"); 3140 if (value != NULL) { 3141 /* 3142 * This field indicates the Product Serial Number in 3143 * 7-bit ASCII, unused bytes should be space characters. 3144 * Ref: NVMe v1.3c. 3145 */ 3146 cpywithpad((char *)sc->ctrldata.sn, 3147 sizeof(sc->ctrldata.sn), value, ' '); 3148 } 3149 value = get_config_value_node(nvl, "eui64"); 3150 if (value != NULL) 3151 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3152 value = get_config_value_node(nvl, "dsm"); 3153 if (value != NULL) { 3154 if (strcmp(value, "auto") == 0) 3155 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3156 else if (strcmp(value, "enable") == 0) 3157 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3158 else if (strcmp(value, "disable") == 0) 3159 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3160 } 3161 3162 value = get_config_value_node(nvl, "ram"); 3163 if (value != NULL) { 3164 uint64_t sz = strtoull(value, NULL, 10); 3165 3166 sc->nvstore.type = NVME_STOR_RAM; 3167 sc->nvstore.size = sz * 1024 * 1024; 3168 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3169 sc->nvstore.sectsz = 4096; 3170 sc->nvstore.sectsz_bits = 12; 3171 if (sc->nvstore.ctx == NULL) { 3172 EPRINTLN("nvme: Unable to allocate RAM"); 3173 return (-1); 3174 } 3175 } else { 3176 snprintf(bident, sizeof(bident), "%u:%u", 3177 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3178 sc->nvstore.ctx = blockif_open(nvl, bident); 3179 if (sc->nvstore.ctx == NULL) { 3180 EPRINTLN("nvme: Could not open backing file: %s", 3181 strerror(errno)); 3182 return (-1); 3183 } 3184 sc->nvstore.type = NVME_STOR_BLOCKIF; 3185 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3186 } 3187 3188 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3189 sc->nvstore.sectsz = sectsz; 3190 else if (sc->nvstore.type != NVME_STOR_RAM) 3191 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3192 for (sc->nvstore.sectsz_bits = 9; 3193 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3194 sc->nvstore.sectsz_bits++); 3195 3196 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3197 sc->max_queues = NVME_QUEUES; 3198 3199 return (0); 3200 } 3201 3202 static void 3203 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3204 size_t new_size) 3205 { 3206 struct 
pci_nvme_softc *sc; 3207 struct pci_nvme_blockstore *nvstore; 3208 struct nvme_namespace_data *nd; 3209 3210 sc = arg; 3211 nvstore = &sc->nvstore; 3212 nd = &sc->nsdata; 3213 3214 nvstore->size = new_size; 3215 pci_nvme_init_nsdata_size(nvstore, nd); 3216 3217 /* Add changed NSID to list */ 3218 sc->ns_log.ns[0] = 1; 3219 sc->ns_log.ns[1] = 0; 3220 3221 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3222 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3223 } 3224 3225 static int 3226 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3227 { 3228 struct pci_nvme_softc *sc; 3229 uint32_t pci_membar_sz; 3230 int error; 3231 3232 error = 0; 3233 3234 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3235 pi->pi_arg = sc; 3236 sc->nsc_pi = pi; 3237 3238 error = pci_nvme_parse_config(sc, nvl); 3239 if (error < 0) 3240 goto done; 3241 else 3242 error = 0; 3243 3244 STAILQ_INIT(&sc->ioreqs_free); 3245 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3246 for (uint32_t i = 0; i < sc->ioslots; i++) { 3247 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3248 } 3249 3250 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3251 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3252 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3253 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3254 pci_set_cfgdata8(pi, PCIR_PROGIF, 3255 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3256 3257 /* 3258 * Allocate size of NVMe registers + doorbell space for all queues. 3259 * 3260 * The specification requires a minimum memory I/O window size of 16K. 3261 * The Windows driver will refuse to start a device with a smaller 3262 * window. 3263 */ 3264 pci_membar_sz = sizeof(struct nvme_registers) + 3265 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3266 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3267 3268 DPRINTF("nvme membar size: %u", pci_membar_sz); 3269 3270 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3271 if (error) { 3272 WPRINTF("%s pci alloc mem bar failed", __func__); 3273 goto done; 3274 } 3275 3276 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3277 if (error) { 3278 WPRINTF("%s pci add msixcap failed", __func__); 3279 goto done; 3280 } 3281 3282 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3283 if (error) { 3284 WPRINTF("%s pci add Express capability failed", __func__); 3285 goto done; 3286 } 3287 3288 pthread_mutex_init(&sc->mtx, NULL); 3289 sem_init(&sc->iosemlock, 0, sc->ioslots); 3290 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3291 3292 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3293 /* 3294 * Controller data depends on Namespace data so initialize Namespace 3295 * data first. 
3296 */ 3297 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3298 pci_nvme_init_ctrldata(sc); 3299 pci_nvme_init_logpages(sc); 3300 pci_nvme_init_features(sc); 3301 3302 pci_nvme_aer_init(sc); 3303 pci_nvme_aen_init(sc); 3304 3305 pci_nvme_reset(sc); 3306 3307 pci_lintr_request(pi); 3308 3309 done: 3310 return (error); 3311 } 3312 3313 static int 3314 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3315 { 3316 char *cp, *ram; 3317 3318 if (opts == NULL) 3319 return (0); 3320 3321 if (strncmp(opts, "ram=", 4) == 0) { 3322 cp = strchr(opts, ','); 3323 if (cp == NULL) { 3324 set_config_value_node(nvl, "ram", opts + 4); 3325 return (0); 3326 } 3327 ram = strndup(opts + 4, cp - opts - 4); 3328 set_config_value_node(nvl, "ram", ram); 3329 free(ram); 3330 return (pci_parse_legacy_config(nvl, cp + 1)); 3331 } else 3332 return (blockif_legacy_config(nvl, opts)); 3333 } 3334 3335 static const struct pci_devemu pci_de_nvme = { 3336 .pe_emu = "nvme", 3337 .pe_init = pci_nvme_init, 3338 .pe_legacy_config = pci_nvme_legacy_config, 3339 .pe_barwrite = pci_nvme_write, 3340 .pe_barread = pci_nvme_read 3341 }; 3342 PCI_EMUL_SET(pci_de_nvme); 3343
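/*
 * Usage sketch (illustrative values): a legacy option string such as
 *	"ram=1024,ser=NVME0001"
 * is split by pci_nvme_legacy_config() into a "ram" config node of "1024",
 * with the remainder handed to pci_parse_legacy_config(). At init time,
 * pci_nvme_parse_config() then creates a 1024 MiB RAM-backed namespace with
 * a 4096-byte sector size and space-pads the supplied serial number for the
 * Identify Controller data. Option strings that do not start with "ram=" are
 * treated as a blockif backend via blockif_legacy_config().
 */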