/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */
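/*
 * Example invocations (illustrative only; the slot number, backing device
 * path, and option values below are arbitrary):
 *
 *  bhyve ... -s 4,nvme,/dev/zvol/pool/nvmedisk,maxq=8,qsz=256,ioslots=16 ...
 *  bhyve ... -s 4,nvme,ram=1024,ser=NVME0001 ...
 */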
/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */

	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
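/*
 * Sketch of the arithmetic implied by the values above (not additional
 * limits): MDTS = 9 with MPSMIN = 4 KiB gives a maximum transfer of
 * 2^9 * 4 KiB = 2 MiB per command, described by at most
 * NVME_MAX_IOVEC = 513 page descriptors.
 */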
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define	PCI_NVME_AEI_NOTICE_SHIFT	8
#define	PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *     SMART / Health Critical Warnings
 *     Namespace Attribute Notices
 */
#define	PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};

static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
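/*
 * Illustrative use of the status helpers (both patterns appear in the
 * command handlers below):
 *   pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
 *   pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
 *       NVME_SC_INVALID_QUEUE_IDENTIFIER);
 */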
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
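/*
 * Note: crc16() is used below in pci_nvme_init_nsdata() to derive a
 * pseudo-unique EUI-64 for the namespace when none is configured.
 */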
static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}
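/*
 * Overview of the AER/AEN machinery below (descriptive summary of this code):
 * the host queues Asynchronous Event Requests (pci_nvme_aer_add), internal
 * events are posted per type with pci_nvme_aen_post(), and the aen_thr
 * worker, woken via pci_nvme_aen_notify(), matches posted events against
 * queued AERs and writes the completion to the Admin CQ in
 * pci_nvme_aen_process().
 */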
static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t	atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}
static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype,	/* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
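/*
 * Note: nvme_prp_memcpy() handles only PRP1 plus a single PRP2 page (it does
 * not walk PRP lists), hence the 8 KiB limit above. In this file it is used
 * for small Admin transfers such as log pages and Identify data.
 */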
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}
static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define	NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define	NVME_TEMP_THRESH_OVER	0
#define	NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel; /* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	if (set_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);


	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}
	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define	NVME_FEATURES_SEL_SUPPORTED	0x3
#define	NVME_FEATURES_NS_SPECIFIC	(1 << 1)
DPRINTF("%s invalid feature 0x%x", __func__, fid); 1903 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1904 return (1); 1905 } 1906 1907 compl->cdw0 = 0; 1908 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1909 1910 feat = &sc->feat[fid]; 1911 if (feat->get) { 1912 feat->get(sc, feat, command, compl); 1913 } 1914 1915 if (compl->status == NVME_SC_SUCCESS) { 1916 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1917 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1918 else 1919 compl->cdw0 = feat->cdw11; 1920 } 1921 1922 return (0); 1923 } 1924 1925 static int 1926 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1927 struct nvme_completion* compl) 1928 { 1929 uint8_t ses, lbaf, pi; 1930 1931 /* Only supports Secure Erase Setting - User Data Erase */ 1932 ses = (command->cdw10 >> 9) & 0x7; 1933 if (ses > 0x1) { 1934 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1935 return (1); 1936 } 1937 1938 /* Only supports a single LBA Format */ 1939 lbaf = command->cdw10 & 0xf; 1940 if (lbaf != 0) { 1941 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1942 NVME_SC_INVALID_FORMAT); 1943 return (1); 1944 } 1945 1946 /* Doesn't support Protection Infomation */ 1947 pi = (command->cdw10 >> 5) & 0x7; 1948 if (pi != 0) { 1949 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1950 return (1); 1951 } 1952 1953 if (sc->nvstore.type == NVME_STOR_RAM) { 1954 if (sc->nvstore.ctx) 1955 free(sc->nvstore.ctx); 1956 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1957 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1958 } else { 1959 struct pci_nvme_ioreq *req; 1960 int err; 1961 1962 req = pci_nvme_get_ioreq(sc); 1963 if (req == NULL) { 1964 pci_nvme_status_genc(&compl->status, 1965 NVME_SC_INTERNAL_DEVICE_ERROR); 1966 WPRINTF("%s: unable to allocate IO req", __func__); 1967 return (1); 1968 } 1969 req->nvme_sq = &sc->submit_queues[0]; 1970 req->sqid = 0; 1971 req->opc = command->opc; 1972 req->cid = command->cid; 1973 req->nsid = command->nsid; 1974 1975 req->io_req.br_offset = 0; 1976 req->io_req.br_resid = sc->nvstore.size; 1977 req->io_req.br_callback = pci_nvme_io_done; 1978 1979 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1980 if (err) { 1981 pci_nvme_status_genc(&compl->status, 1982 NVME_SC_INTERNAL_DEVICE_ERROR); 1983 pci_nvme_release_ioreq(sc, req); 1984 } else 1985 compl->status = NVME_NO_STATUS; 1986 } 1987 1988 return (1); 1989 } 1990 1991 static int 1992 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 1993 struct nvme_completion *compl) 1994 { 1995 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1996 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1997 1998 /* TODO: search for the command ID and abort it */ 1999 2000 compl->cdw0 = 1; 2001 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2002 return (1); 2003 } 2004 2005 static int 2006 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2007 struct nvme_command* command, struct nvme_completion* compl) 2008 { 2009 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2010 sc->aer_count, sc->ctrldata.aerl, command->cid); 2011 2012 /* Don't exceed the Async Event Request Limit (AERL). 
*/ 2013 if (pci_nvme_aer_limit_reached(sc)) { 2014 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2015 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2016 return (1); 2017 } 2018 2019 if (pci_nvme_aer_add(sc, command->cid)) { 2020 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2021 NVME_SC_INTERNAL_DEVICE_ERROR); 2022 return (1); 2023 } 2024 2025 /* 2026 * Raise events when they happen based on the Set Features cmd. 2027 * These events happen async, so only set completion successful if 2028 * there is an event reflective of the request to get event. 2029 */ 2030 compl->status = NVME_NO_STATUS; 2031 pci_nvme_aen_notify(sc); 2032 2033 return (0); 2034 } 2035 2036 static void 2037 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2038 { 2039 struct nvme_completion compl; 2040 struct nvme_command *cmd; 2041 struct nvme_submission_queue *sq; 2042 struct nvme_completion_queue *cq; 2043 uint16_t sqhead; 2044 2045 DPRINTF("%s index %u", __func__, (uint32_t)value); 2046 2047 sq = &sc->submit_queues[0]; 2048 cq = &sc->compl_queues[0]; 2049 2050 pthread_mutex_lock(&sq->mtx); 2051 2052 sqhead = sq->head; 2053 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2054 2055 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2056 cmd = &(sq->qbase)[sqhead]; 2057 compl.cdw0 = 0; 2058 compl.status = 0; 2059 2060 switch (cmd->opc) { 2061 case NVME_OPC_DELETE_IO_SQ: 2062 DPRINTF("%s command DELETE_IO_SQ", __func__); 2063 nvme_opc_delete_io_sq(sc, cmd, &compl); 2064 break; 2065 case NVME_OPC_CREATE_IO_SQ: 2066 DPRINTF("%s command CREATE_IO_SQ", __func__); 2067 nvme_opc_create_io_sq(sc, cmd, &compl); 2068 break; 2069 case NVME_OPC_DELETE_IO_CQ: 2070 DPRINTF("%s command DELETE_IO_CQ", __func__); 2071 nvme_opc_delete_io_cq(sc, cmd, &compl); 2072 break; 2073 case NVME_OPC_CREATE_IO_CQ: 2074 DPRINTF("%s command CREATE_IO_CQ", __func__); 2075 nvme_opc_create_io_cq(sc, cmd, &compl); 2076 break; 2077 case NVME_OPC_GET_LOG_PAGE: 2078 DPRINTF("%s command GET_LOG_PAGE", __func__); 2079 nvme_opc_get_log_page(sc, cmd, &compl); 2080 break; 2081 case NVME_OPC_IDENTIFY: 2082 DPRINTF("%s command IDENTIFY", __func__); 2083 nvme_opc_identify(sc, cmd, &compl); 2084 break; 2085 case NVME_OPC_ABORT: 2086 DPRINTF("%s command ABORT", __func__); 2087 nvme_opc_abort(sc, cmd, &compl); 2088 break; 2089 case NVME_OPC_SET_FEATURES: 2090 DPRINTF("%s command SET_FEATURES", __func__); 2091 nvme_opc_set_features(sc, cmd, &compl); 2092 break; 2093 case NVME_OPC_GET_FEATURES: 2094 DPRINTF("%s command GET_FEATURES", __func__); 2095 nvme_opc_get_features(sc, cmd, &compl); 2096 break; 2097 case NVME_OPC_FIRMWARE_ACTIVATE: 2098 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2099 pci_nvme_status_tc(&compl.status, 2100 NVME_SCT_COMMAND_SPECIFIC, 2101 NVME_SC_INVALID_FIRMWARE_SLOT); 2102 break; 2103 case NVME_OPC_ASYNC_EVENT_REQUEST: 2104 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2105 nvme_opc_async_event_req(sc, cmd, &compl); 2106 break; 2107 case NVME_OPC_FORMAT_NVM: 2108 DPRINTF("%s command FORMAT_NVM", __func__); 2109 if ((sc->ctrldata.oacs & 2110 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2111 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2112 break; 2113 } 2114 nvme_opc_format_nvm(sc, cmd, &compl); 2115 break; 2116 case NVME_OPC_SECURITY_SEND: 2117 case NVME_OPC_SECURITY_RECEIVE: 2118 case NVME_OPC_SANITIZE: 2119 case NVME_OPC_GET_LBA_STATUS: 2120 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2121 cmd->opc); 2122 /* Valid but unsupported opcodes */ 2123 
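/* Complete with Invalid Field in Command rather than Invalid Opcode, since the opcode itself is defined by the specification. */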
pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2124 break; 2125 default: 2126 DPRINTF("%s command OPC=%#X (not implemented)", 2127 __func__, 2128 cmd->opc); 2129 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2130 } 2131 sqhead = (sqhead + 1) % sq->size; 2132 2133 if (NVME_COMPLETION_VALID(compl)) { 2134 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2135 compl.cdw0, 2136 cmd->cid, 2137 0, /* SQID */ 2138 compl.status); 2139 } 2140 } 2141 2142 DPRINTF("setting sqhead %u", sqhead); 2143 sq->head = sqhead; 2144 2145 if (cq->head != cq->tail) 2146 pci_generate_msix(sc->nsc_pi, 0); 2147 2148 pthread_mutex_unlock(&sq->mtx); 2149 } 2150 2151 /* 2152 * Update the Write and Read statistics reported in SMART data 2153 * 2154 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up. 2155 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2156 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2157 */ 2158 static void 2159 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2160 size_t bytes, uint16_t status) 2161 { 2162 2163 pthread_mutex_lock(&sc->mtx); 2164 switch (opc) { 2165 case NVME_OPC_WRITE: 2166 sc->write_commands++; 2167 if (status != NVME_SC_SUCCESS) 2168 break; 2169 sc->write_dunits_remainder += (bytes / 512); 2170 while (sc->write_dunits_remainder >= 1000) { 2171 sc->write_data_units++; 2172 sc->write_dunits_remainder -= 1000; 2173 } 2174 break; 2175 case NVME_OPC_READ: 2176 sc->read_commands++; 2177 if (status != NVME_SC_SUCCESS) 2178 break; 2179 sc->read_dunits_remainder += (bytes / 512); 2180 while (sc->read_dunits_remainder >= 1000) { 2181 sc->read_data_units++; 2182 sc->read_dunits_remainder -= 1000; 2183 } 2184 break; 2185 default: 2186 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2187 break; 2188 } 2189 pthread_mutex_unlock(&sc->mtx); 2190 } 2191 2192 /* 2193 * Check if the combination of Starting LBA (slba) and number of blocks 2194 * exceeds the range of the underlying storage. 2195 * 2196 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2197 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2198 * overflow. 2199 */ 2200 static bool 2201 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2202 uint32_t nblocks) 2203 { 2204 size_t offset, bytes; 2205 2206 /* Overflow check of multiplying Starting LBA by the sector size */ 2207 if (slba >> (64 - nvstore->sectsz_bits)) 2208 return (true); 2209 2210 offset = slba << nvstore->sectsz_bits; 2211 bytes = nblocks << nvstore->sectsz_bits; 2212 2213 /* Overflow check of Number of Logical Blocks */ 2214 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2215 return (true); 2216 2217 return (false); 2218 } 2219 2220 static int 2221 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2222 uint64_t gpaddr, size_t size, int do_write, uint64_t offset) 2223 { 2224 int iovidx; 2225 bool range_is_contiguous; 2226 2227 if (req == NULL) 2228 return (-1); 2229 2230 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2231 return (-1); 2232 } 2233 2234 /* 2235 * Minimize the number of IOVs by concatenating contiguous address 2236 * ranges. If the IOV count is zero, there is no previous range to 2237 * concatenate. 
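* For example, two 4 KiB PRP entries at guest-physical addresses 0x10000 and 0x11000 collapse into a single 8 KiB iovec, while non-adjacent entries each consume a new iovec.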
2238 */ 2239 if (req->io_req.br_iovcnt == 0) 2240 range_is_contiguous = false; 2241 else 2242 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2243 2244 if (range_is_contiguous) { 2245 iovidx = req->io_req.br_iovcnt - 1; 2246 2247 req->io_req.br_iov[iovidx].iov_base = 2248 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2249 req->prev_gpaddr, size); 2250 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2251 return (-1); 2252 2253 req->prev_size += size; 2254 req->io_req.br_resid += size; 2255 2256 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2257 } else { 2258 iovidx = req->io_req.br_iovcnt; 2259 if (iovidx == 0) { 2260 req->io_req.br_offset = offset; 2261 req->io_req.br_resid = 0; 2262 req->io_req.br_param = req; 2263 } 2264 2265 req->io_req.br_iov[iovidx].iov_base = 2266 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2267 gpaddr, size); 2268 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2269 return (-1); 2270 2271 req->io_req.br_iov[iovidx].iov_len = size; 2272 2273 req->prev_gpaddr = gpaddr; 2274 req->prev_size = size; 2275 req->io_req.br_resid += size; 2276 2277 req->io_req.br_iovcnt++; 2278 } 2279 2280 return (0); 2281 } 2282 2283 static void 2284 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2285 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2286 { 2287 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2288 2289 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2290 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2291 NVME_STATUS_GET_SC(status)); 2292 2293 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2294 2295 if (cq->head != cq->tail) { 2296 if (cq->intr_en & NVME_CQ_INTEN) { 2297 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2298 } else { 2299 DPRINTF("%s: CQ%u interrupt disabled", 2300 __func__, sq->cqid); 2301 } 2302 } 2303 } 2304 2305 static void 2306 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2307 { 2308 req->sc = NULL; 2309 req->nvme_sq = NULL; 2310 req->sqid = 0; 2311 2312 pthread_mutex_lock(&sc->mtx); 2313 2314 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2315 sc->pending_ios--; 2316 2317 /* when no more IO pending, can set to ready if device reset/enabled */ 2318 if (sc->pending_ios == 0 && 2319 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2320 sc->regs.csts |= NVME_CSTS_RDY; 2321 2322 pthread_mutex_unlock(&sc->mtx); 2323 2324 sem_post(&sc->iosemlock); 2325 } 2326 2327 static struct pci_nvme_ioreq * 2328 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2329 { 2330 struct pci_nvme_ioreq *req = NULL; 2331 2332 sem_wait(&sc->iosemlock); 2333 pthread_mutex_lock(&sc->mtx); 2334 2335 req = STAILQ_FIRST(&sc->ioreqs_free); 2336 assert(req != NULL); 2337 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2338 2339 req->sc = sc; 2340 2341 sc->pending_ios++; 2342 2343 pthread_mutex_unlock(&sc->mtx); 2344 2345 req->io_req.br_iovcnt = 0; 2346 req->io_req.br_offset = 0; 2347 req->io_req.br_resid = 0; 2348 req->io_req.br_param = req; 2349 req->prev_gpaddr = 0; 2350 req->prev_size = 0; 2351 2352 return req; 2353 } 2354 2355 static void 2356 pci_nvme_io_done(struct blockif_req *br, int err) 2357 { 2358 struct pci_nvme_ioreq *req = br->br_param; 2359 struct nvme_submission_queue *sq = req->nvme_sq; 2360 uint16_t code, status; 2361 2362 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2363 2364 /* TODO return correct error */ 2365 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2366 pci_nvme_status_genc(&status, code); 2367 2368 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2369 pci_nvme_stats_write_read_update(req->sc, req->opc, 2370 req->bytes, status); 2371 pci_nvme_release_ioreq(req->sc, req); 2372 } 2373 2374 /* 2375 * Implements the Flush command. The specification states: 2376 * If a volatile write cache is not present, Flush commands complete 2377 * successfully and have no effect 2378 * in the description of the Volatile Write Cache (VWC) field of the Identify 2379 * Controller data. Therefore, set status to Success if the command is 2380 * not supported (i.e. RAM or as indicated by the blockif). 2381 */ 2382 static bool 2383 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2384 struct nvme_command *cmd __unused, 2385 struct pci_nvme_blockstore *nvstore, 2386 struct pci_nvme_ioreq *req, 2387 uint16_t *status) 2388 { 2389 bool pending = false; 2390 2391 if (nvstore->type == NVME_STOR_RAM) { 2392 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2393 } else { 2394 int err; 2395 2396 req->io_req.br_callback = pci_nvme_io_done; 2397 2398 err = blockif_flush(nvstore->ctx, &req->io_req); 2399 switch (err) { 2400 case 0: 2401 pending = true; 2402 break; 2403 case EOPNOTSUPP: 2404 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2405 break; 2406 default: 2407 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2408 } 2409 } 2410 2411 return (pending); 2412 } 2413 2414 static uint16_t 2415 nvme_write_read_ram(struct pci_nvme_softc *sc, 2416 struct pci_nvme_blockstore *nvstore, 2417 uint64_t prp1, uint64_t prp2, 2418 size_t offset, uint64_t bytes, 2419 bool is_write) 2420 { 2421 uint8_t *buf = nvstore->ctx; 2422 enum nvme_copy_dir dir; 2423 uint16_t status; 2424 2425 if (is_write) 2426 dir = NVME_COPY_TO_PRP; 2427 else 2428 dir = NVME_COPY_FROM_PRP; 2429 2430 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2431 buf + offset, bytes, dir)) 2432 pci_nvme_status_genc(&status, 2433 NVME_SC_DATA_TRANSFER_ERROR); 2434 else 2435 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2436 2437 return (status); 2438 } 2439 2440 static uint16_t 2441 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2442 struct pci_nvme_blockstore *nvstore, 2443 struct pci_nvme_ioreq *req, 2444 uint64_t prp1, uint64_t prp2, 2445 size_t offset, uint64_t bytes, 2446 bool is_write) 2447 { 2448 uint64_t size; 2449 int err; 2450 uint16_t status = NVME_NO_STATUS; 2451 2452 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2453 if (pci_nvme_append_iov_req(sc, req, prp1, 2454 size, is_write, offset)) { 2455 err = -1; 2456 goto out; 2457 } 2458 2459 offset += size; 2460 bytes -= size; 2461 2462 if (bytes == 0) { 2463 ; 2464 } else if (bytes <= PAGE_SIZE) { 2465 size = bytes; 2466 if (pci_nvme_append_iov_req(sc, req, prp2, 2467 size, is_write, offset)) { 2468 err = -1; 2469 goto out; 2470 } 2471 } else { 2472 void *vmctx = sc->nsc_pi->pi_vmctx; 2473 uint64_t *prp_list = &prp2; 2474 uint64_t *last = prp_list; 2475 2476 /* PRP2 is pointer to a physical region page list */ 2477 while (bytes) { 2478 /* Last entry in list points to the next list */ 2479 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2480 uint64_t prp = *prp_list; 2481 2482 prp_list = paddr_guest2host(vmctx, prp, 2483 PAGE_SIZE - (prp % PAGE_SIZE)); 2484 if (prp_list == NULL) { 2485 err = -1; 2486 goto out; 2487 } 2488 last = prp_list + (NVME_PRP2_ITEMS - 1); 2489 } 2490 2491 size = MIN(bytes, PAGE_SIZE); 2492 2493 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
2494 size, is_write, offset)) { 2495 err = -1; 2496 goto out; 2497 } 2498 2499 offset += size; 2500 bytes -= size; 2501 2502 prp_list++; 2503 } 2504 } 2505 req->io_req.br_callback = pci_nvme_io_done; 2506 if (is_write) 2507 err = blockif_write(nvstore->ctx, &req->io_req); 2508 else 2509 err = blockif_read(nvstore->ctx, &req->io_req); 2510 out: 2511 if (err) 2512 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2513 2514 return (status); 2515 } 2516 2517 static bool 2518 nvme_opc_write_read(struct pci_nvme_softc *sc, 2519 struct nvme_command *cmd, 2520 struct pci_nvme_blockstore *nvstore, 2521 struct pci_nvme_ioreq *req, 2522 uint16_t *status) 2523 { 2524 uint64_t lba, nblocks, bytes; 2525 size_t offset; 2526 bool is_write = cmd->opc == NVME_OPC_WRITE; 2527 bool pending = false; 2528 2529 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2530 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2531 bytes = nblocks << nvstore->sectsz_bits; 2532 if (bytes > NVME_MAX_DATA_SIZE) { 2533 WPRINTF("%s command would exceed MDTS", __func__); 2534 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2535 goto out; 2536 } 2537 2538 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2539 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2540 __func__, lba, nblocks); 2541 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2542 goto out; 2543 } 2544 2545 offset = lba << nvstore->sectsz_bits; 2546 2547 req->bytes = bytes; 2548 req->io_req.br_offset = lba; 2549 2550 /* PRP bits 1:0 must be zero */ 2551 cmd->prp1 &= ~0x3UL; 2552 cmd->prp2 &= ~0x3UL; 2553 2554 if (nvstore->type == NVME_STOR_RAM) { 2555 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2556 cmd->prp2, offset, bytes, is_write); 2557 } else { 2558 *status = nvme_write_read_blockif(sc, nvstore, req, 2559 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2560 2561 if (*status == NVME_NO_STATUS) 2562 pending = true; 2563 } 2564 out: 2565 if (!pending) 2566 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2567 2568 return (pending); 2569 } 2570 2571 static void 2572 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2573 { 2574 struct pci_nvme_ioreq *req = br->br_param; 2575 struct pci_nvme_softc *sc = req->sc; 2576 bool done = true; 2577 uint16_t status; 2578 2579 if (err) { 2580 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2581 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2582 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2583 } else { 2584 struct iovec *iov = req->io_req.br_iov; 2585 2586 req->prev_gpaddr++; 2587 iov += req->prev_gpaddr; 2588 2589 /* The iov_* values already include the sector size */ 2590 req->io_req.br_offset = (off_t)iov->iov_base; 2591 req->io_req.br_resid = iov->iov_len; 2592 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2593 pci_nvme_status_genc(&status, 2594 NVME_SC_INTERNAL_DEVICE_ERROR); 2595 } else 2596 done = false; 2597 } 2598 2599 if (done) { 2600 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2601 status); 2602 pci_nvme_release_ioreq(sc, req); 2603 } 2604 } 2605 2606 static bool 2607 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2608 struct nvme_command *cmd, 2609 struct pci_nvme_blockstore *nvstore, 2610 struct pci_nvme_ioreq *req, 2611 uint16_t *status) 2612 { 2613 struct nvme_dsm_range *range = NULL; 2614 uint32_t nr, r, non_zero, dr; 2615 int err; 2616 bool pending = false; 2617 2618 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2619 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2620 goto out; 2621 } 2622 
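/* CDW10 bits 7:0 hold the 0's based Number of Ranges (NR), so the loops below iterate r = 0 .. nr inclusive. */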
2623 nr = cmd->cdw10 & 0xff; 2624 2625 /* copy locally because a range entry could straddle PRPs */ 2626 range = calloc(1, NVME_MAX_DSM_TRIM); 2627 if (range == NULL) { 2628 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2629 goto out; 2630 } 2631 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2632 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2633 2634 /* Check for invalid ranges and the number of non-zero lengths */ 2635 non_zero = 0; 2636 for (r = 0; r <= nr; r++) { 2637 if (pci_nvme_out_of_range(nvstore, 2638 range[r].starting_lba, range[r].length)) { 2639 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2640 goto out; 2641 } 2642 if (range[r].length != 0) 2643 non_zero++; 2644 } 2645 2646 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2647 size_t offset, bytes; 2648 int sectsz_bits = sc->nvstore.sectsz_bits; 2649 2650 /* 2651 * DSM calls are advisory only, and compliant controllers 2652 * may choose to take no actions (i.e. return Success). 2653 */ 2654 if (!nvstore->deallocate) { 2655 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2656 goto out; 2657 } 2658 2659 /* If all ranges have a zero length, return Success */ 2660 if (non_zero == 0) { 2661 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2662 goto out; 2663 } 2664 2665 if (req == NULL) { 2666 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2667 goto out; 2668 } 2669 2670 offset = range[0].starting_lba << sectsz_bits; 2671 bytes = range[0].length << sectsz_bits; 2672 2673 /* 2674 * If the request is for more than a single range, store 2675 * the ranges in the br_iov. Optimize for the common case 2676 * of a single range. 2677 * 2678 * Note that NVMe Number of Ranges is a zero based value 2679 */ 2680 req->io_req.br_iovcnt = 0; 2681 req->io_req.br_offset = offset; 2682 req->io_req.br_resid = bytes; 2683 2684 if (nr == 0) { 2685 req->io_req.br_callback = pci_nvme_io_done; 2686 } else { 2687 struct iovec *iov = req->io_req.br_iov; 2688 2689 for (r = 0, dr = 0; r <= nr; r++) { 2690 offset = range[r].starting_lba << sectsz_bits; 2691 bytes = range[r].length << sectsz_bits; 2692 if (bytes == 0) 2693 continue; 2694 2695 if ((nvstore->size - offset) < bytes) { 2696 pci_nvme_status_genc(status, 2697 NVME_SC_LBA_OUT_OF_RANGE); 2698 goto out; 2699 } 2700 iov[dr].iov_base = (void *)offset; 2701 iov[dr].iov_len = bytes; 2702 dr++; 2703 } 2704 req->io_req.br_callback = pci_nvme_dealloc_sm; 2705 2706 /* 2707 * Use prev_gpaddr to track the current entry and 2708 * prev_size to track the number of entries 2709 */ 2710 req->prev_gpaddr = 0; 2711 req->prev_size = dr; 2712 } 2713 2714 err = blockif_delete(nvstore->ctx, &req->io_req); 2715 if (err) 2716 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2717 else 2718 pending = true; 2719 } 2720 out: 2721 free(range); 2722 return (pending); 2723 } 2724 2725 static void 2726 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2727 { 2728 struct nvme_submission_queue *sq; 2729 uint16_t status; 2730 uint16_t sqhead; 2731 2732 /* handle all submissions up to sq->tail index */ 2733 sq = &sc->submit_queues[idx]; 2734 2735 pthread_mutex_lock(&sq->mtx); 2736 2737 sqhead = sq->head; 2738 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2739 idx, sqhead, sq->tail, sq->qbase); 2740 2741 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2742 struct nvme_command *cmd; 2743 struct pci_nvme_ioreq *req; 2744 uint32_t nsid; 2745 bool pending; 2746 2747 pending = false; 2748 req = NULL; 2749 status = 0; 2750 2751 cmd = 
&sq->qbase[sqhead]; 2752 sqhead = (sqhead + 1) % sq->size; 2753 2754 nsid = le32toh(cmd->nsid); 2755 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2756 pci_nvme_status_genc(&status, 2757 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2758 status |= 2759 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2760 goto complete; 2761 } 2762 2763 req = pci_nvme_get_ioreq(sc); 2764 if (req == NULL) { 2765 pci_nvme_status_genc(&status, 2766 NVME_SC_INTERNAL_DEVICE_ERROR); 2767 WPRINTF("%s: unable to allocate IO req", __func__); 2768 goto complete; 2769 } 2770 req->nvme_sq = sq; 2771 req->sqid = idx; 2772 req->opc = cmd->opc; 2773 req->cid = cmd->cid; 2774 req->nsid = cmd->nsid; 2775 2776 switch (cmd->opc) { 2777 case NVME_OPC_FLUSH: 2778 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2779 req, &status); 2780 break; 2781 case NVME_OPC_WRITE: 2782 case NVME_OPC_READ: 2783 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2784 req, &status); 2785 break; 2786 case NVME_OPC_WRITE_ZEROES: 2787 /* TODO: write zeroes 2788 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2789 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2790 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2791 break; 2792 case NVME_OPC_DATASET_MANAGEMENT: 2793 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2794 req, &status); 2795 break; 2796 default: 2797 WPRINTF("%s unhandled io command 0x%x", 2798 __func__, cmd->opc); 2799 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2800 } 2801 complete: 2802 if (!pending) { 2803 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2804 if (req != NULL) 2805 pci_nvme_release_ioreq(sc, req); 2806 } 2807 } 2808 2809 sq->head = sqhead; 2810 2811 pthread_mutex_unlock(&sq->mtx); 2812 } 2813 2814 static void 2815 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2816 uint64_t idx, int is_sq, uint64_t value) 2817 { 2818 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2819 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2820 2821 if (is_sq) { 2822 if (idx > sc->num_squeues) { 2823 WPRINTF("%s queue index %lu overflow from " 2824 "guest (max %u)", 2825 __func__, idx, sc->num_squeues); 2826 return; 2827 } 2828 2829 atomic_store_short(&sc->submit_queues[idx].tail, 2830 (uint16_t)value); 2831 2832 if (idx == 0) { 2833 pci_nvme_handle_admin_cmd(sc, value); 2834 } else { 2835 /* submission queue; handle new entries in SQ */ 2836 if (idx > sc->num_squeues) { 2837 WPRINTF("%s SQ index %lu overflow from " 2838 "guest (max %u)", 2839 __func__, idx, sc->num_squeues); 2840 return; 2841 } 2842 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2843 } 2844 } else { 2845 if (idx > sc->num_cqueues) { 2846 WPRINTF("%s queue index %lu overflow from " 2847 "guest (max %u)", 2848 __func__, idx, sc->num_cqueues); 2849 return; 2850 } 2851 2852 atomic_store_short(&sc->compl_queues[idx].head, 2853 (uint16_t)value); 2854 } 2855 } 2856 2857 static void 2858 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2859 { 2860 const char *s = iswrite ? 
"WRITE" : "READ"; 2861 2862 switch (offset) { 2863 case NVME_CR_CAP_LOW: 2864 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2865 break; 2866 case NVME_CR_CAP_HI: 2867 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2868 break; 2869 case NVME_CR_VS: 2870 DPRINTF("%s %s NVME_CR_VS", func, s); 2871 break; 2872 case NVME_CR_INTMS: 2873 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2874 break; 2875 case NVME_CR_INTMC: 2876 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2877 break; 2878 case NVME_CR_CC: 2879 DPRINTF("%s %s NVME_CR_CC", func, s); 2880 break; 2881 case NVME_CR_CSTS: 2882 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2883 break; 2884 case NVME_CR_NSSR: 2885 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2886 break; 2887 case NVME_CR_AQA: 2888 DPRINTF("%s %s NVME_CR_AQA", func, s); 2889 break; 2890 case NVME_CR_ASQ_LOW: 2891 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2892 break; 2893 case NVME_CR_ASQ_HI: 2894 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2895 break; 2896 case NVME_CR_ACQ_LOW: 2897 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2898 break; 2899 case NVME_CR_ACQ_HI: 2900 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2901 break; 2902 default: 2903 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2904 } 2905 2906 } 2907 2908 static void 2909 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2910 uint64_t offset, int size, uint64_t value) 2911 { 2912 uint32_t ccreg; 2913 2914 if (offset >= NVME_DOORBELL_OFFSET) { 2915 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2916 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2917 int is_sq = (belloffset % 8) < 4; 2918 2919 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2920 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2921 offset); 2922 return; 2923 } 2924 2925 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2926 WPRINTF("guest attempted an overflow write offset " 2927 "0x%lx, val 0x%lx in %s", 2928 offset, value, __func__); 2929 return; 2930 } 2931 2932 if (is_sq) { 2933 if (sc->submit_queues[idx].qbase == NULL) 2934 return; 2935 } else if (sc->compl_queues[idx].qbase == NULL) 2936 return; 2937 2938 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2939 return; 2940 } 2941 2942 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2943 offset, size, value); 2944 2945 if (size != 4) { 2946 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2947 "val 0x%lx) to bar0 in %s", 2948 size, offset, value, __func__); 2949 /* TODO: shutdown device */ 2950 return; 2951 } 2952 2953 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2954 2955 pthread_mutex_lock(&sc->mtx); 2956 2957 switch (offset) { 2958 case NVME_CR_CAP_LOW: 2959 case NVME_CR_CAP_HI: 2960 /* readonly */ 2961 break; 2962 case NVME_CR_VS: 2963 /* readonly */ 2964 break; 2965 case NVME_CR_INTMS: 2966 /* MSI-X, so ignore */ 2967 break; 2968 case NVME_CR_INTMC: 2969 /* MSI-X, so ignore */ 2970 break; 2971 case NVME_CR_CC: 2972 ccreg = (uint32_t)value; 2973 2974 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2975 "iocqes %u", 2976 __func__, 2977 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2978 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2979 NVME_CC_GET_IOCQES(ccreg)); 2980 2981 if (NVME_CC_GET_SHN(ccreg)) { 2982 /* perform shutdown - flush out data to backend */ 2983 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2984 NVME_CSTS_REG_SHST_SHIFT); 2985 sc->regs.csts |= NVME_SHST_COMPLETE << 2986 NVME_CSTS_REG_SHST_SHIFT; 2987 } 2988 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2989 if (NVME_CC_GET_EN(ccreg) == 0) 2990 /* transition 1-> causes 
controller reset */ 2991 pci_nvme_reset_locked(sc); 2992 else 2993 pci_nvme_init_controller(ctx, sc); 2994 } 2995 2996 /* Insert the iocqes, iosqes and en bits from the write */ 2997 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2998 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2999 if (NVME_CC_GET_EN(ccreg) == 0) { 3000 /* Insert the ams, mps and css bit fields */ 3001 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3002 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3003 sc->regs.csts &= ~NVME_CSTS_RDY; 3004 } else if ((sc->pending_ios == 0) && 3005 !(sc->regs.csts & NVME_CSTS_CFS)) { 3006 sc->regs.csts |= NVME_CSTS_RDY; 3007 } 3008 break; 3009 case NVME_CR_CSTS: 3010 break; 3011 case NVME_CR_NSSR: 3012 /* ignore writes; don't support subsystem reset */ 3013 break; 3014 case NVME_CR_AQA: 3015 sc->regs.aqa = (uint32_t)value; 3016 break; 3017 case NVME_CR_ASQ_LOW: 3018 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3019 (0xFFFFF000 & value); 3020 break; 3021 case NVME_CR_ASQ_HI: 3022 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3023 (value << 32); 3024 break; 3025 case NVME_CR_ACQ_LOW: 3026 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3027 (0xFFFFF000 & value); 3028 break; 3029 case NVME_CR_ACQ_HI: 3030 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3031 (value << 32); 3032 break; 3033 default: 3034 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3035 __func__, offset, value, size); 3036 } 3037 pthread_mutex_unlock(&sc->mtx); 3038 } 3039 3040 static void 3041 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3042 int baridx, uint64_t offset, int size, uint64_t value) 3043 { 3044 struct pci_nvme_softc* sc = pi->pi_arg; 3045 3046 if (baridx == pci_msix_table_bar(pi) || 3047 baridx == pci_msix_pba_bar(pi)) { 3048 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3049 " value 0x%lx", baridx, offset, size, value); 3050 3051 pci_emul_msix_twrite(pi, offset, size, value); 3052 return; 3053 } 3054 3055 switch (baridx) { 3056 case 0: 3057 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3058 break; 3059 3060 default: 3061 DPRINTF("%s unknown baridx %d, val 0x%lx", 3062 __func__, baridx, value); 3063 } 3064 } 3065 3066 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3067 uint64_t offset, int size) 3068 { 3069 uint64_t value; 3070 3071 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3072 3073 if (offset < NVME_DOORBELL_OFFSET) { 3074 void *p = &(sc->regs); 3075 pthread_mutex_lock(&sc->mtx); 3076 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3077 pthread_mutex_unlock(&sc->mtx); 3078 } else { 3079 value = 0; 3080 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3081 } 3082 3083 switch (size) { 3084 case 1: 3085 value &= 0xFF; 3086 break; 3087 case 2: 3088 value &= 0xFFFF; 3089 break; 3090 case 4: 3091 value &= 0xFFFFFFFF; 3092 break; 3093 } 3094 3095 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3096 offset, size, (uint32_t)value); 3097 3098 return (value); 3099 } 3100 3101 3102 3103 static uint64_t 3104 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3105 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3106 { 3107 struct pci_nvme_softc* sc = pi->pi_arg; 3108 3109 if (baridx == pci_msix_table_bar(pi) || 3110 baridx == pci_msix_pba_bar(pi)) { 3111 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3112 baridx, offset, size); 3113 3114 return pci_emul_msix_tread(pi, offset, size); 3115 } 3116 3117 switch (baridx) { 3118 case 0: 3119 return pci_nvme_read_bar_0(sc, 
offset, size); 3120 3121 default: 3122 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3123 } 3124 3125 return (0); 3126 } 3127 3128 static int 3129 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3130 { 3131 char bident[sizeof("XX:X:X")]; 3132 const char *value; 3133 uint32_t sectsz; 3134 3135 sc->max_queues = NVME_QUEUES; 3136 sc->max_qentries = NVME_MAX_QENTRIES; 3137 sc->ioslots = NVME_IOSLOTS; 3138 sc->num_squeues = sc->max_queues; 3139 sc->num_cqueues = sc->max_queues; 3140 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3141 sectsz = 0; 3142 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3143 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3144 3145 value = get_config_value_node(nvl, "maxq"); 3146 if (value != NULL) 3147 sc->max_queues = atoi(value); 3148 value = get_config_value_node(nvl, "qsz"); 3149 if (value != NULL) { 3150 sc->max_qentries = atoi(value); 3151 if (sc->max_qentries <= 0) { 3152 EPRINTLN("nvme: Invalid qsz option %d", 3153 sc->max_qentries); 3154 return (-1); 3155 } 3156 } 3157 value = get_config_value_node(nvl, "ioslots"); 3158 if (value != NULL) { 3159 sc->ioslots = atoi(value); 3160 if (sc->ioslots <= 0) { 3161 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3162 return (-1); 3163 } 3164 } 3165 value = get_config_value_node(nvl, "sectsz"); 3166 if (value != NULL) 3167 sectsz = atoi(value); 3168 value = get_config_value_node(nvl, "ser"); 3169 if (value != NULL) { 3170 /* 3171 * This field indicates the Product Serial Number in 3172 * 7-bit ASCII, unused bytes should be space characters. 3173 * Ref: NVMe v1.3c. 3174 */ 3175 cpywithpad((char *)sc->ctrldata.sn, 3176 sizeof(sc->ctrldata.sn), value, ' '); 3177 } 3178 value = get_config_value_node(nvl, "eui64"); 3179 if (value != NULL) 3180 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3181 value = get_config_value_node(nvl, "dsm"); 3182 if (value != NULL) { 3183 if (strcmp(value, "auto") == 0) 3184 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3185 else if (strcmp(value, "enable") == 0) 3186 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3187 else if (strcmp(value, "disable") == 0) 3188 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3189 } 3190 3191 value = get_config_value_node(nvl, "ram"); 3192 if (value != NULL) { 3193 uint64_t sz = strtoull(value, NULL, 10); 3194 3195 sc->nvstore.type = NVME_STOR_RAM; 3196 sc->nvstore.size = sz * 1024 * 1024; 3197 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3198 sc->nvstore.sectsz = 4096; 3199 sc->nvstore.sectsz_bits = 12; 3200 if (sc->nvstore.ctx == NULL) { 3201 EPRINTLN("nvme: Unable to allocate RAM"); 3202 return (-1); 3203 } 3204 } else { 3205 snprintf(bident, sizeof(bident), "%d:%d", 3206 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3207 sc->nvstore.ctx = blockif_open(nvl, bident); 3208 if (sc->nvstore.ctx == NULL) { 3209 EPRINTLN("nvme: Could not open backing file: %s", 3210 strerror(errno)); 3211 return (-1); 3212 } 3213 sc->nvstore.type = NVME_STOR_BLOCKIF; 3214 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3215 } 3216 3217 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3218 sc->nvstore.sectsz = sectsz; 3219 else if (sc->nvstore.type != NVME_STOR_RAM) 3220 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3221 for (sc->nvstore.sectsz_bits = 9; 3222 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3223 sc->nvstore.sectsz_bits++); 3224 3225 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3226 sc->max_queues = NVME_QUEUES; 3227 3228 return (0); 3229 } 3230 3231 
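/* Blockif resize callback, registered in pci_nvme_init(): update the namespace size, record NSID 1 in the Changed Namespace List log page, and post a Namespace Attribute Changed asynchronous event to the guest. */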
static void 3232 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3233 size_t new_size) 3234 { 3235 struct pci_nvme_softc *sc; 3236 struct pci_nvme_blockstore *nvstore; 3237 struct nvme_namespace_data *nd; 3238 3239 sc = arg; 3240 nvstore = &sc->nvstore; 3241 nd = &sc->nsdata; 3242 3243 nvstore->size = new_size; 3244 pci_nvme_init_nsdata_size(nvstore, nd); 3245 3246 /* Add changed NSID to list */ 3247 sc->ns_log.ns[0] = 1; 3248 sc->ns_log.ns[1] = 0; 3249 3250 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3251 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3252 } 3253 3254 static int 3255 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3256 { 3257 struct pci_nvme_softc *sc; 3258 uint32_t pci_membar_sz; 3259 int error; 3260 3261 error = 0; 3262 3263 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3264 pi->pi_arg = sc; 3265 sc->nsc_pi = pi; 3266 3267 error = pci_nvme_parse_config(sc, nvl); 3268 if (error < 0) 3269 goto done; 3270 else 3271 error = 0; 3272 3273 STAILQ_INIT(&sc->ioreqs_free); 3274 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3275 for (int i = 0; i < sc->ioslots; i++) { 3276 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3277 } 3278 3279 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3280 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3281 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3282 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3283 pci_set_cfgdata8(pi, PCIR_PROGIF, 3284 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3285 3286 /* 3287 * Allocate size of NVMe registers + doorbell space for all queues. 3288 * 3289 * The specification requires a minimum memory I/O window size of 16K. 3290 * The Windows driver will refuse to start a device with a smaller 3291 * window. 3292 */ 3293 pci_membar_sz = sizeof(struct nvme_registers) + 3294 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3295 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3296 3297 DPRINTF("nvme membar size: %u", pci_membar_sz); 3298 3299 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3300 if (error) { 3301 WPRINTF("%s pci alloc mem bar failed", __func__); 3302 goto done; 3303 } 3304 3305 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3306 if (error) { 3307 WPRINTF("%s pci add msixcap failed", __func__); 3308 goto done; 3309 } 3310 3311 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3312 if (error) { 3313 WPRINTF("%s pci add Express capability failed", __func__); 3314 goto done; 3315 } 3316 3317 pthread_mutex_init(&sc->mtx, NULL); 3318 sem_init(&sc->iosemlock, 0, sc->ioslots); 3319 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3320 3321 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3322 /* 3323 * Controller data depends on Namespace data so initialize Namespace 3324 * data first. 
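* For example, whether the DSM (Deallocate) capability can be advertised in ONCS depends on what the namespace backing store supports.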
3325 */ 3326 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3327 pci_nvme_init_ctrldata(sc); 3328 pci_nvme_init_logpages(sc); 3329 pci_nvme_init_features(sc); 3330 3331 pci_nvme_aer_init(sc); 3332 pci_nvme_aen_init(sc); 3333 3334 pci_nvme_reset(sc); 3335 3336 pci_lintr_request(pi); 3337 3338 done: 3339 return (error); 3340 } 3341 3342 static int 3343 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3344 { 3345 char *cp, *ram; 3346 3347 if (opts == NULL) 3348 return (0); 3349 3350 if (strncmp(opts, "ram=", 4) == 0) { 3351 cp = strchr(opts, ','); 3352 if (cp == NULL) { 3353 set_config_value_node(nvl, "ram", opts + 4); 3354 return (0); 3355 } 3356 ram = strndup(opts + 4, cp - opts - 4); 3357 set_config_value_node(nvl, "ram", ram); 3358 free(ram); 3359 return (pci_parse_legacy_config(nvl, cp + 1)); 3360 } else 3361 return (blockif_legacy_config(nvl, opts)); 3362 } 3363 3364 struct pci_devemu pci_de_nvme = { 3365 .pe_emu = "nvme", 3366 .pe_init = pci_nvme_init, 3367 .pe_legacy_config = pci_nvme_legacy_config, 3368 .pe_barwrite = pci_nvme_write, 3369 .pe_barread = pci_nvme_read 3370 }; 3371 PCI_EMUL_SET(pci_de_nvme); 3372
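/* PCI_EMUL_SET() adds this pci_devemu to the linker set scanned by the bhyve PCI emulation framework, so a device configured with the "nvme" emulation name is bound to the callbacks above. */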