1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <semaphore.h> 70 #include <stdbool.h> 71 #include <stddef.h> 72 #include <stdint.h> 73 #include <stdio.h> 74 #include <stdlib.h> 75 #include <string.h> 76 77 #include <machine/atomic.h> 78 #include <machine/vmm.h> 79 #include <vmmapi.h> 80 81 #include <dev/nvme/nvme.h> 82 83 #include "bhyverun.h" 84 #include "block_if.h" 85 #include "config.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 89 90 static int nvme_debug = 0; 91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 92 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 93 94 /* defaults; can be overridden */ 95 #define NVME_MSIX_BAR 4 96 97 #define NVME_IOSLOTS 8 98 99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 100 #define NVME_MMIO_SPACE_MIN (1 << 14) 101 102 #define NVME_QUEUES 16 103 #define NVME_MAX_QENTRIES 2048 104 /* Memory Page size Minimum reported in CAP register */ 105 #define NVME_MPSMIN 0 106 /* MPSMIN converted to bytes */ 107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 108 109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 110 #define NVME_MDTS 9 111 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 114 115 /* This is a synthetic status code to indicate there is no status */ 116 #define NVME_NO_STATUS 0xffff 117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 118 119 /* Reported temperature in Kelvin (i.e. room temperature) */ 120 #define NVME_TEMPERATURE 296 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 /** Asynchronous Event Information - Notice */ 273 typedef enum { 274 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 275 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 276 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 277 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 278 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 279 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 280 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 281 PCI_NVME_AEI_NOTICE_MAX, 282 } pci_nvme_async_event_info_notice; 283 284 #define PCI_NVME_AEI_NOTICE_SHIFT 8 285 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 286 287 /* Asynchronous Event Notifications */ 288 struct pci_nvme_aen { 289 pci_nvme_async_type atype; 290 uint32_t event_data; 291 bool posted; 292 }; 293 294 /* 295 * By default, enable all Asynchrnous Event Notifications: 296 * SMART / Health Critical Warnings 297 * Namespace Attribute Notices 298 */ 299 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 300 301 typedef enum { 302 NVME_CNTRLTYPE_IO = 1, 303 NVME_CNTRLTYPE_DISCOVERY = 2, 304 NVME_CNTRLTYPE_ADMIN = 3, 305 } pci_nvme_cntrl_type; 306 307 struct pci_nvme_softc { 308 struct pci_devinst *nsc_pi; 309 310 pthread_mutex_t mtx; 311 312 struct nvme_registers regs; 313 314 struct nvme_namespace_data nsdata; 315 struct nvme_controller_data ctrldata; 316 struct nvme_error_information_entry err_log; 317 struct nvme_health_information_page health_log; 318 struct nvme_firmware_page fw_log; 319 struct nvme_ns_list ns_log; 320 321 struct pci_nvme_blockstore nvstore; 322 323 uint16_t max_qentries; /* max entries per queue */ 324 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 325 uint32_t num_cqueues; 326 uint32_t num_squeues; 327 bool num_q_is_set; /* Has host set Number of Queues */ 328 329 struct pci_nvme_ioreq *ioreqs; 330 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 331 uint32_t pending_ios; 332 uint32_t ioslots; 333 sem_t 
iosemlock; 334 335 /* 336 * Memory mapped Submission and Completion queues 337 * Each array includes both Admin and IO queues 338 */ 339 struct nvme_completion_queue *compl_queues; 340 struct nvme_submission_queue *submit_queues; 341 342 struct nvme_feature_obj feat[NVME_FID_MAX]; 343 344 enum nvme_dsm_type dataset_management; 345 346 /* Accounting for SMART data */ 347 __uint128_t read_data_units; 348 __uint128_t write_data_units; 349 __uint128_t read_commands; 350 __uint128_t write_commands; 351 uint32_t read_dunits_remainder; 352 uint32_t write_dunits_remainder; 353 354 STAILQ_HEAD(, pci_nvme_aer) aer_list; 355 pthread_mutex_t aer_mtx; 356 uint32_t aer_count; 357 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 358 pthread_t aen_tid; 359 pthread_mutex_t aen_mtx; 360 pthread_cond_t aen_cond; 361 }; 362 363 364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 365 struct nvme_completion_queue *cq, 366 uint32_t cdw0, 367 uint16_t cid, 368 uint16_t sqid, 369 uint16_t status); 370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 372 static void pci_nvme_io_done(struct blockif_req *, int); 373 374 /* Controller Configuration utils */ 375 #define NVME_CC_GET_EN(cc) \ 376 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 377 #define NVME_CC_GET_CSS(cc) \ 378 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 379 #define NVME_CC_GET_SHN(cc) \ 380 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 381 #define NVME_CC_GET_IOSQES(cc) \ 382 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 383 #define NVME_CC_GET_IOCQES(cc) \ 384 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 385 386 #define NVME_CC_WRITE_MASK \ 387 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 388 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 389 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 390 391 #define NVME_CC_NEN_WRITE_MASK \ 392 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 393 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 394 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 395 396 /* Controller Status utils */ 397 #define NVME_CSTS_GET_RDY(sts) \ 398 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 399 400 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 401 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) 402 403 /* Completion Queue status word utils */ 404 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 405 #define NVME_STATUS_MASK \ 406 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 407 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 408 409 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 410 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 411 412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 413 struct nvme_feature_obj *, 414 struct nvme_command *, 415 struct nvme_completion *); 416 static void nvme_feature_temperature(struct pci_nvme_softc *, 417 struct nvme_feature_obj *, 418 struct nvme_command *, 419 struct nvme_completion *); 420 static void nvme_feature_num_queues(struct pci_nvme_softc *, 421 struct nvme_feature_obj *, 422 struct nvme_command *, 423 struct nvme_completion *); 424 static void nvme_feature_iv_config(struct pci_nvme_softc *, 425 struct nvme_feature_obj *, 426 struct nvme_command *, 427 struct nvme_completion *); 428 static void nvme_feature_async_event(struct pci_nvme_softc *, 429 struct nvme_feature_obj *, 430 struct nvme_command *, 431 struct nvme_completion *); 432 433 
static void *aen_thr(void *arg); 434 435 static __inline void 436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 437 { 438 size_t len; 439 440 len = strnlen(src, dst_size); 441 memset(dst, pad, dst_size); 442 memcpy(dst, src, len); 443 } 444 445 static __inline void 446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 447 { 448 449 *status &= ~NVME_STATUS_MASK; 450 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 451 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 452 } 453 454 static __inline void 455 pci_nvme_status_genc(uint16_t *status, uint16_t code) 456 { 457 458 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 459 } 460 461 /* 462 * Initialize the requested number or IO Submission and Completion Queues. 463 * Admin queues are allocated implicitly. 464 */ 465 static void 466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 467 { 468 uint32_t i; 469 470 /* 471 * Allocate and initialize the Submission Queues 472 */ 473 if (nsq > NVME_QUEUES) { 474 WPRINTF("%s: clamping number of SQ from %u to %u", 475 __func__, nsq, NVME_QUEUES); 476 nsq = NVME_QUEUES; 477 } 478 479 sc->num_squeues = nsq; 480 481 sc->submit_queues = calloc(sc->num_squeues + 1, 482 sizeof(struct nvme_submission_queue)); 483 if (sc->submit_queues == NULL) { 484 WPRINTF("%s: SQ allocation failed", __func__); 485 sc->num_squeues = 0; 486 } else { 487 struct nvme_submission_queue *sq = sc->submit_queues; 488 489 for (i = 0; i < sc->num_squeues + 1; i++) 490 pthread_mutex_init(&sq[i].mtx, NULL); 491 } 492 493 /* 494 * Allocate and initialize the Completion Queues 495 */ 496 if (ncq > NVME_QUEUES) { 497 WPRINTF("%s: clamping number of CQ from %u to %u", 498 __func__, ncq, NVME_QUEUES); 499 ncq = NVME_QUEUES; 500 } 501 502 sc->num_cqueues = ncq; 503 504 sc->compl_queues = calloc(sc->num_cqueues + 1, 505 sizeof(struct nvme_completion_queue)); 506 if (sc->compl_queues == NULL) { 507 WPRINTF("%s: CQ allocation failed", __func__); 508 sc->num_cqueues = 0; 509 } else { 510 struct nvme_completion_queue *cq = sc->compl_queues; 511 512 for (i = 0; i < sc->num_cqueues + 1; i++) 513 pthread_mutex_init(&cq[i].mtx, NULL); 514 } 515 } 516 517 static void 518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 519 { 520 struct nvme_controller_data *cd = &sc->ctrldata; 521 522 cd->vid = 0xFB5D; 523 cd->ssvid = 0x0000; 524 525 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 526 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 527 528 /* Num of submission commands that we can handle at a time (2^rab) */ 529 cd->rab = 4; 530 531 /* FreeBSD OUI */ 532 cd->ieee[0] = 0x58; 533 cd->ieee[1] = 0x9c; 534 cd->ieee[2] = 0xfc; 535 536 cd->mic = 0; 537 538 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 539 540 cd->ver = NVME_REV(1,4); 541 542 cd->cntrltype = NVME_CNTRLTYPE_IO; 543 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 544 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 545 cd->acl = 2; 546 cd->aerl = 4; 547 548 /* Advertise 1, Read-only firmware slot */ 549 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 550 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 551 cd->lpa = 0; /* TODO: support some simple things like SMART */ 552 cd->elpe = 0; /* max error log page entries */ 553 /* 554 * Report a single power state (zero-based value) 555 * power_state[] values are left as zero to indicate "Not reported" 556 */ 557 cd->npss = 0; 558 559 /* Warning Composite Temperature Threshold */ 560 cd->wctemp = 0x0157; 561 
cd->cctemp = 0x0157; 562 563 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 564 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << 565 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); 566 567 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 568 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 569 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 570 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 571 cd->nn = 1; /* number of namespaces */ 572 573 cd->oncs = 0; 574 switch (sc->dataset_management) { 575 case NVME_DATASET_MANAGEMENT_AUTO: 576 if (sc->nvstore.deallocate) 577 cd->oncs |= NVME_ONCS_DSM; 578 break; 579 case NVME_DATASET_MANAGEMENT_ENABLE: 580 cd->oncs |= NVME_ONCS_DSM; 581 break; 582 default: 583 break; 584 } 585 586 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 587 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 588 589 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 590 } 591 592 /* 593 * Calculate the CRC-16 of the given buffer 594 * See copyright attribution at top of file 595 */ 596 static uint16_t 597 crc16(uint16_t crc, const void *buffer, unsigned int len) 598 { 599 const unsigned char *cp = buffer; 600 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 601 static uint16_t const crc16_table[256] = { 602 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 603 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 604 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 605 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 606 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 607 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 608 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 609 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 610 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 611 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 612 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 613 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 614 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 615 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 616 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 617 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 618 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 619 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 620 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 621 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 622 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 623 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 624 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 625 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 626 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 627 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 628 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 629 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 630 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 631 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 632 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 633 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 634 }; 635 636 while (len--) 637 crc = (((crc >> 8) & 0xffU) ^ 638 crc16_table[(crc ^ *cp++) & 0xffU]) 
& 0x0000ffffU; 639 return crc; 640 } 641 642 static void 643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 644 struct nvme_namespace_data *nd) 645 { 646 647 /* Get capacity and block size information from backing store */ 648 nd->nsze = nvstore->size / nvstore->sectsz; 649 nd->ncap = nd->nsze; 650 nd->nuse = nd->nsze; 651 } 652 653 static void 654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 655 struct nvme_namespace_data *nd, uint32_t nsid, 656 struct pci_nvme_blockstore *nvstore) 657 { 658 659 pci_nvme_init_nsdata_size(nvstore, nd); 660 661 if (nvstore->type == NVME_STOR_BLOCKIF) 662 nvstore->deallocate = blockif_candelete(nvstore->ctx); 663 664 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 665 nd->flbas = 0; 666 667 /* Create an EUI-64 if user did not provide one */ 668 if (nvstore->eui64 == 0) { 669 char *data = NULL; 670 uint64_t eui64 = nvstore->eui64; 671 672 asprintf(&data, "%s%u%u%u", get_config_value("name"), 673 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 674 sc->nsc_pi->pi_func); 675 676 if (data != NULL) { 677 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 678 free(data); 679 } 680 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 681 } 682 be64enc(nd->eui64, nvstore->eui64); 683 684 /* LBA data-sz = 2^lbads */ 685 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 686 } 687 688 static void 689 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 690 { 691 __uint128_t power_cycles = 1; 692 693 memset(&sc->err_log, 0, sizeof(sc->err_log)); 694 memset(&sc->health_log, 0, sizeof(sc->health_log)); 695 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 696 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 697 698 /* Set read/write remainder to round up according to spec */ 699 sc->read_dunits_remainder = 999; 700 sc->write_dunits_remainder = 999; 701 702 /* Set nominal Health values checked by implementations */ 703 sc->health_log.temperature = NVME_TEMPERATURE; 704 sc->health_log.available_spare = 100; 705 sc->health_log.available_spare_threshold = 10; 706 707 /* Set Active Firmware Info to slot 1 */ 708 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); 709 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 710 sizeof(sc->fw_log.revision[0])); 711 712 memcpy(&sc->health_log.power_cycles, &power_cycles, 713 sizeof(sc->health_log.power_cycles)); 714 } 715 716 static void 717 pci_nvme_init_features(struct pci_nvme_softc *sc) 718 { 719 enum nvme_feature fid; 720 721 for (fid = 0; fid < NVME_FID_MAX; fid++) { 722 switch (fid) { 723 case NVME_FEAT_ARBITRATION: 724 case NVME_FEAT_POWER_MANAGEMENT: 725 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 726 case NVME_FEAT_WRITE_ATOMICITY: 727 /* Mandatory but no special handling required */ 728 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 729 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 730 // this returns a data buffer 731 break; 732 case NVME_FEAT_TEMPERATURE_THRESHOLD: 733 sc->feat[fid].set = nvme_feature_temperature; 734 break; 735 case NVME_FEAT_ERROR_RECOVERY: 736 sc->feat[fid].namespace_specific = true; 737 break; 738 case NVME_FEAT_NUMBER_OF_QUEUES: 739 sc->feat[fid].set = nvme_feature_num_queues; 740 break; 741 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 742 sc->feat[fid].set = nvme_feature_iv_config; 743 break; 744 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 745 sc->feat[fid].set = nvme_feature_async_event; 746 /* Enable all AENs by default */ 747 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 748 break; 749 default: 750 sc->feat[fid].set = 
nvme_feature_invalid_cb; 751 sc->feat[fid].get = nvme_feature_invalid_cb; 752 } 753 } 754 } 755 756 static void 757 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 758 { 759 760 STAILQ_INIT(&sc->aer_list); 761 sc->aer_count = 0; 762 } 763 764 static void 765 pci_nvme_aer_init(struct pci_nvme_softc *sc) 766 { 767 768 pthread_mutex_init(&sc->aer_mtx, NULL); 769 pci_nvme_aer_reset(sc); 770 } 771 772 static void 773 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 774 { 775 struct pci_nvme_aer *aer = NULL; 776 777 pthread_mutex_lock(&sc->aer_mtx); 778 while (!STAILQ_EMPTY(&sc->aer_list)) { 779 aer = STAILQ_FIRST(&sc->aer_list); 780 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 781 free(aer); 782 } 783 pthread_mutex_unlock(&sc->aer_mtx); 784 785 pci_nvme_aer_reset(sc); 786 } 787 788 static bool 789 pci_nvme_aer_available(struct pci_nvme_softc *sc) 790 { 791 792 return (sc->aer_count != 0); 793 } 794 795 static bool 796 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 797 { 798 struct nvme_controller_data *cd = &sc->ctrldata; 799 800 /* AERL is a zero based value while aer_count is one's based */ 801 return (sc->aer_count == (cd->aerl + 1)); 802 } 803 804 /* 805 * Add an Async Event Request 806 * 807 * Stores an AER to be returned later if the Controller needs to notify the 808 * host of an event. 809 * Note that while the NVMe spec doesn't require Controllers to return AER's 810 * in order, this implementation does preserve the order. 811 */ 812 static int 813 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 814 { 815 struct pci_nvme_aer *aer = NULL; 816 817 aer = calloc(1, sizeof(struct pci_nvme_aer)); 818 if (aer == NULL) 819 return (-1); 820 821 /* Save the Command ID for use in the completion message */ 822 aer->cid = cid; 823 824 pthread_mutex_lock(&sc->aer_mtx); 825 sc->aer_count++; 826 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 827 pthread_mutex_unlock(&sc->aer_mtx); 828 829 return (0); 830 } 831 832 /* 833 * Get an Async Event Request structure 834 * 835 * Returns a pointer to an AER previously submitted by the host or NULL if 836 * no AER's exist. Caller is responsible for freeing the returned struct. 
837 */ 838 static struct pci_nvme_aer * 839 pci_nvme_aer_get(struct pci_nvme_softc *sc) 840 { 841 struct pci_nvme_aer *aer = NULL; 842 843 pthread_mutex_lock(&sc->aer_mtx); 844 aer = STAILQ_FIRST(&sc->aer_list); 845 if (aer != NULL) { 846 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 847 sc->aer_count--; 848 } 849 pthread_mutex_unlock(&sc->aer_mtx); 850 851 return (aer); 852 } 853 854 static void 855 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 856 { 857 uint32_t atype; 858 859 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 860 861 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 862 sc->aen[atype].atype = atype; 863 } 864 } 865 866 static void 867 pci_nvme_aen_init(struct pci_nvme_softc *sc) 868 { 869 char nstr[80]; 870 871 pci_nvme_aen_reset(sc); 872 873 pthread_mutex_init(&sc->aen_mtx, NULL); 874 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 875 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 876 sc->nsc_pi->pi_func); 877 pthread_set_name_np(sc->aen_tid, nstr); 878 } 879 880 static void 881 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 882 { 883 884 pci_nvme_aen_reset(sc); 885 } 886 887 /* Notify the AEN thread of pending work */ 888 static void 889 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 890 { 891 892 pthread_cond_signal(&sc->aen_cond); 893 } 894 895 /* 896 * Post an Asynchronous Event Notification 897 */ 898 static int32_t 899 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 900 uint32_t event_data) 901 { 902 struct pci_nvme_aen *aen; 903 904 if (atype >= PCI_NVME_AE_TYPE_MAX) { 905 return(EINVAL); 906 } 907 908 pthread_mutex_lock(&sc->aen_mtx); 909 aen = &sc->aen[atype]; 910 911 /* Has the controller already posted an event of this type? */ 912 if (aen->posted) { 913 pthread_mutex_unlock(&sc->aen_mtx); 914 return(EALREADY); 915 } 916 917 aen->event_data = event_data; 918 aen->posted = true; 919 pthread_mutex_unlock(&sc->aen_mtx); 920 921 pci_nvme_aen_notify(sc); 922 923 return(0); 924 } 925 926 static void 927 pci_nvme_aen_process(struct pci_nvme_softc *sc) 928 { 929 struct pci_nvme_aer *aer; 930 struct pci_nvme_aen *aen; 931 pci_nvme_async_type atype; 932 uint32_t mask; 933 uint16_t status; 934 uint8_t lid; 935 936 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 937 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 938 aen = &sc->aen[atype]; 939 /* Previous iterations may have depleted the available AER's */ 940 if (!pci_nvme_aer_available(sc)) { 941 DPRINTF("%s: no AER", __func__); 942 break; 943 } 944 945 if (!aen->posted) { 946 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 947 continue; 948 } 949 950 status = NVME_SC_SUCCESS; 951 952 /* Is the event masked? 
*/ 953 mask = 954 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 955 956 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 957 switch (atype) { 958 case PCI_NVME_AE_TYPE_ERROR: 959 lid = NVME_LOG_ERROR; 960 break; 961 case PCI_NVME_AE_TYPE_SMART: 962 mask &= 0xff; 963 if ((mask & aen->event_data) == 0) 964 continue; 965 lid = NVME_LOG_HEALTH_INFORMATION; 966 break; 967 case PCI_NVME_AE_TYPE_NOTICE: 968 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 969 EPRINTLN("%s unknown AEN notice type %u", 970 __func__, aen->event_data); 971 status = NVME_SC_INTERNAL_DEVICE_ERROR; 972 break; 973 } 974 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 975 continue; 976 switch (aen->event_data) { 977 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 978 lid = NVME_LOG_CHANGED_NAMESPACE; 979 break; 980 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 981 lid = NVME_LOG_FIRMWARE_SLOT; 982 break; 983 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 984 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 985 break; 986 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 987 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 988 break; 989 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 990 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 991 break; 992 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 993 lid = NVME_LOG_LBA_STATUS_INFORMATION; 994 break; 995 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 996 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 997 break; 998 default: 999 lid = 0; 1000 } 1001 break; 1002 default: 1003 /* bad type?!? */ 1004 EPRINTLN("%s unknown AEN type %u", __func__, atype); 1005 status = NVME_SC_INTERNAL_DEVICE_ERROR; 1006 break; 1007 } 1008 1009 aer = pci_nvme_aer_get(sc); 1010 assert(aer != NULL); 1011 1012 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 1013 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1014 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 1015 aer->cid, 1016 0, /* SQID */ 1017 status); 1018 1019 aen->event_data = 0; 1020 aen->posted = false; 1021 1022 pci_generate_msix(sc->nsc_pi, 0); 1023 } 1024 } 1025 1026 static void * 1027 aen_thr(void *arg) 1028 { 1029 struct pci_nvme_softc *sc; 1030 1031 sc = arg; 1032 1033 pthread_mutex_lock(&sc->aen_mtx); 1034 for (;;) { 1035 pci_nvme_aen_process(sc); 1036 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1037 } 1038 pthread_mutex_unlock(&sc->aen_mtx); 1039 1040 pthread_exit(NULL); 1041 return (NULL); 1042 } 1043 1044 static void 1045 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1046 { 1047 uint32_t i; 1048 1049 DPRINTF("%s", __func__); 1050 1051 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1052 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1053 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1054 1055 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1056 1057 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1058 1059 sc->regs.cc = 0; 1060 1061 assert(sc->submit_queues != NULL); 1062 1063 for (i = 0; i < sc->num_squeues + 1; i++) { 1064 sc->submit_queues[i].qbase = NULL; 1065 sc->submit_queues[i].size = 0; 1066 sc->submit_queues[i].cqid = 0; 1067 sc->submit_queues[i].tail = 0; 1068 sc->submit_queues[i].head = 0; 1069 } 1070 1071 assert(sc->compl_queues != NULL); 1072 1073 for (i = 0; i < sc->num_cqueues + 1; i++) { 1074 sc->compl_queues[i].qbase = NULL; 1075 sc->compl_queues[i].size = 0; 1076 sc->compl_queues[i].tail = 0; 1077 sc->compl_queues[i].head = 0; 1078 } 1079 1080 sc->num_q_is_set = false; 1081 1082 pci_nvme_aer_destroy(sc); 1083 
pci_nvme_aen_destroy(sc); 1084 1085 /* 1086 * Clear CSTS.RDY last to prevent the host from enabling Controller 1087 * before cleanup completes 1088 */ 1089 sc->regs.csts = 0; 1090 } 1091 1092 static void 1093 pci_nvme_reset(struct pci_nvme_softc *sc) 1094 { 1095 pthread_mutex_lock(&sc->mtx); 1096 pci_nvme_reset_locked(sc); 1097 pthread_mutex_unlock(&sc->mtx); 1098 } 1099 1100 static int 1101 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1102 { 1103 uint16_t acqs, asqs; 1104 1105 DPRINTF("%s", __func__); 1106 1107 /* 1108 * NVMe 2.0 states that "enabling a controller while this field is 1109 * cleared to 0h produces undefined results" for both ACQS and 1110 * ASQS. If zero, set CFS and do not become ready. 1111 */ 1112 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); 1113 if (asqs < 2) { 1114 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1115 asqs - 1, sc->regs.aqa); 1116 sc->regs.csts |= NVME_CSTS_CFS; 1117 return (-1); 1118 } 1119 sc->submit_queues[0].size = asqs; 1120 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1121 sizeof(struct nvme_command) * asqs); 1122 if (sc->submit_queues[0].qbase == NULL) { 1123 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1124 sc->regs.asq); 1125 sc->regs.csts |= NVME_CSTS_CFS; 1126 return (-1); 1127 } 1128 1129 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1130 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1131 1132 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1133 NVME_AQA_REG_ACQS_MASK); 1134 if (acqs < 2) { 1135 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1136 acqs - 1, sc->regs.aqa); 1137 sc->regs.csts |= NVME_CSTS_CFS; 1138 return (-1); 1139 } 1140 sc->compl_queues[0].size = acqs; 1141 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1142 sizeof(struct nvme_completion) * acqs); 1143 if (sc->compl_queues[0].qbase == NULL) { 1144 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1145 sc->regs.acq); 1146 sc->regs.csts |= NVME_CSTS_CFS; 1147 return (-1); 1148 } 1149 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1150 1151 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1152 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1153 1154 return (0); 1155 } 1156 1157 static int 1158 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1159 size_t len, enum nvme_copy_dir dir) 1160 { 1161 uint8_t *p; 1162 size_t bytes; 1163 1164 if (len > (8 * 1024)) { 1165 return (-1); 1166 } 1167 1168 /* Copy from the start of prp1 to the end of the physical page */ 1169 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1170 bytes = MIN(bytes, len); 1171 1172 p = vm_map_gpa(ctx, prp1, bytes); 1173 if (p == NULL) { 1174 return (-1); 1175 } 1176 1177 if (dir == NVME_COPY_TO_PRP) 1178 memcpy(p, b, bytes); 1179 else 1180 memcpy(b, p, bytes); 1181 1182 b += bytes; 1183 1184 len -= bytes; 1185 if (len == 0) { 1186 return (0); 1187 } 1188 1189 len = MIN(len, PAGE_SIZE); 1190 1191 p = vm_map_gpa(ctx, prp2, len); 1192 if (p == NULL) { 1193 return (-1); 1194 } 1195 1196 if (dir == NVME_COPY_TO_PRP) 1197 memcpy(p, b, len); 1198 else 1199 memcpy(b, p, len); 1200 1201 return (0); 1202 } 1203 1204 /* 1205 * Write a Completion Queue Entry update 1206 * 1207 * Write the completion and update the doorbell value 1208 */ 1209 static void 1210 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1211 struct nvme_completion_queue *cq, 1212 uint32_t cdw0, 1213 uint16_t cid, 1214 uint16_t sqid, 1215 uint16_t status) 1216 { 1217 struct nvme_submission_queue *sq = 
&sc->submit_queues[sqid]; 1218 struct nvme_completion *cqe; 1219 1220 assert(cq->qbase != NULL); 1221 1222 pthread_mutex_lock(&cq->mtx); 1223 1224 cqe = &cq->qbase[cq->tail]; 1225 1226 /* Flip the phase bit */ 1227 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1228 1229 cqe->cdw0 = cdw0; 1230 cqe->sqhd = sq->head; 1231 cqe->sqid = sqid; 1232 cqe->cid = cid; 1233 cqe->status = status; 1234 1235 cq->tail++; 1236 if (cq->tail >= cq->size) { 1237 cq->tail = 0; 1238 } 1239 1240 pthread_mutex_unlock(&cq->mtx); 1241 } 1242 1243 static int 1244 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1245 struct nvme_completion* compl) 1246 { 1247 uint16_t qid = command->cdw10 & 0xffff; 1248 1249 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1250 if (qid == 0 || qid > sc->num_squeues || 1251 (sc->submit_queues[qid].qbase == NULL)) { 1252 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1253 __func__, qid, sc->num_squeues); 1254 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1255 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1256 return (1); 1257 } 1258 1259 sc->submit_queues[qid].qbase = NULL; 1260 sc->submit_queues[qid].cqid = 0; 1261 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1262 return (1); 1263 } 1264 1265 static int 1266 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1267 struct nvme_completion* compl) 1268 { 1269 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1270 uint16_t qid = command->cdw10 & 0xffff; 1271 struct nvme_submission_queue *nsq; 1272 1273 if ((qid == 0) || (qid > sc->num_squeues) || 1274 (sc->submit_queues[qid].qbase != NULL)) { 1275 WPRINTF("%s queue index %u > num_squeues %u", 1276 __func__, qid, sc->num_squeues); 1277 pci_nvme_status_tc(&compl->status, 1278 NVME_SCT_COMMAND_SPECIFIC, 1279 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1280 return (1); 1281 } 1282 1283 nsq = &sc->submit_queues[qid]; 1284 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1285 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1286 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1287 /* 1288 * Queues must specify at least two entries 1289 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1290 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1291 */ 1292 pci_nvme_status_tc(&compl->status, 1293 NVME_SCT_COMMAND_SPECIFIC, 1294 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1295 return (1); 1296 } 1297 nsq->head = nsq->tail = 0; 1298 1299 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1300 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1301 pci_nvme_status_tc(&compl->status, 1302 NVME_SCT_COMMAND_SPECIFIC, 1303 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1304 return (1); 1305 } 1306 1307 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1308 pci_nvme_status_tc(&compl->status, 1309 NVME_SCT_COMMAND_SPECIFIC, 1310 NVME_SC_COMPLETION_QUEUE_INVALID); 1311 return (1); 1312 } 1313 1314 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1315 1316 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1317 sizeof(struct nvme_command) * (size_t)nsq->size); 1318 1319 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1320 qid, nsq->size, nsq->qbase, nsq->cqid); 1321 1322 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1323 1324 DPRINTF("%s completed creating IOSQ qid %u", 1325 __func__, qid); 1326 } else { 1327 /* 1328 * Guest sent non-cont submission queue request. 1329 * This setting is unsupported by this emulation. 
1330 */ 1331 WPRINTF("%s unsupported non-contig (list-based) " 1332 "create i/o submission queue", __func__); 1333 1334 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1335 } 1336 return (1); 1337 } 1338 1339 static int 1340 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1341 struct nvme_completion* compl) 1342 { 1343 uint16_t qid = command->cdw10 & 0xffff; 1344 uint16_t sqid; 1345 1346 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1347 if (qid == 0 || qid > sc->num_cqueues || 1348 (sc->compl_queues[qid].qbase == NULL)) { 1349 WPRINTF("%s queue index %u / num_cqueues %u", 1350 __func__, qid, sc->num_cqueues); 1351 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1352 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1353 return (1); 1354 } 1355 1356 /* Deleting an Active CQ is an error */ 1357 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1358 if (sc->submit_queues[sqid].cqid == qid) { 1359 pci_nvme_status_tc(&compl->status, 1360 NVME_SCT_COMMAND_SPECIFIC, 1361 NVME_SC_INVALID_QUEUE_DELETION); 1362 return (1); 1363 } 1364 1365 sc->compl_queues[qid].qbase = NULL; 1366 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1367 return (1); 1368 } 1369 1370 static int 1371 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1372 struct nvme_completion* compl) 1373 { 1374 struct nvme_completion_queue *ncq; 1375 uint16_t qid = command->cdw10 & 0xffff; 1376 1377 /* Only support Physically Contiguous queues */ 1378 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1379 WPRINTF("%s unsupported non-contig (list-based) " 1380 "create i/o completion queue", 1381 __func__); 1382 1383 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1384 return (1); 1385 } 1386 1387 if ((qid == 0) || (qid > sc->num_cqueues) || 1388 (sc->compl_queues[qid].qbase != NULL)) { 1389 WPRINTF("%s queue index %u > num_cqueues %u", 1390 __func__, qid, sc->num_cqueues); 1391 pci_nvme_status_tc(&compl->status, 1392 NVME_SCT_COMMAND_SPECIFIC, 1393 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1394 return (1); 1395 } 1396 1397 ncq = &sc->compl_queues[qid]; 1398 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1399 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1400 if (ncq->intr_vec > (sc->max_queues + 1)) { 1401 pci_nvme_status_tc(&compl->status, 1402 NVME_SCT_COMMAND_SPECIFIC, 1403 NVME_SC_INVALID_INTERRUPT_VECTOR); 1404 return (1); 1405 } 1406 1407 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1408 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1409 /* 1410 * Queues must specify at least two entries 1411 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1412 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1413 */ 1414 pci_nvme_status_tc(&compl->status, 1415 NVME_SCT_COMMAND_SPECIFIC, 1416 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1417 return (1); 1418 } 1419 ncq->head = ncq->tail = 0; 1420 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1421 command->prp1, 1422 sizeof(struct nvme_command) * (size_t)ncq->size); 1423 1424 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1425 1426 1427 return (1); 1428 } 1429 1430 static int 1431 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1432 struct nvme_completion* compl) 1433 { 1434 uint64_t logoff; 1435 uint32_t logsize; 1436 uint8_t logpage; 1437 1438 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1439 1440 /* 1441 * Command specifies the number of dwords to return in fields NUMDU 1442 * and NUMDL. This is a zero-based value. 
1443 */ 1444 logpage = command->cdw10 & 0xFF; 1445 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1446 logsize *= sizeof(uint32_t); 1447 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1448 1449 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1450 1451 switch (logpage) { 1452 case NVME_LOG_ERROR: 1453 if (logoff >= sizeof(sc->err_log)) { 1454 pci_nvme_status_genc(&compl->status, 1455 NVME_SC_INVALID_FIELD); 1456 break; 1457 } 1458 1459 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1460 command->prp2, (uint8_t *)&sc->err_log + logoff, 1461 MIN(logsize - logoff, sizeof(sc->err_log)), 1462 NVME_COPY_TO_PRP); 1463 break; 1464 case NVME_LOG_HEALTH_INFORMATION: 1465 if (logoff >= sizeof(sc->health_log)) { 1466 pci_nvme_status_genc(&compl->status, 1467 NVME_SC_INVALID_FIELD); 1468 break; 1469 } 1470 1471 pthread_mutex_lock(&sc->mtx); 1472 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1473 sizeof(sc->health_log.data_units_read)); 1474 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1475 sizeof(sc->health_log.data_units_written)); 1476 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1477 sizeof(sc->health_log.host_read_commands)); 1478 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1479 sizeof(sc->health_log.host_write_commands)); 1480 pthread_mutex_unlock(&sc->mtx); 1481 1482 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1483 command->prp2, (uint8_t *)&sc->health_log + logoff, 1484 MIN(logsize - logoff, sizeof(sc->health_log)), 1485 NVME_COPY_TO_PRP); 1486 break; 1487 case NVME_LOG_FIRMWARE_SLOT: 1488 if (logoff >= sizeof(sc->fw_log)) { 1489 pci_nvme_status_genc(&compl->status, 1490 NVME_SC_INVALID_FIELD); 1491 break; 1492 } 1493 1494 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1495 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1496 MIN(logsize - logoff, sizeof(sc->fw_log)), 1497 NVME_COPY_TO_PRP); 1498 break; 1499 case NVME_LOG_CHANGED_NAMESPACE: 1500 if (logoff >= sizeof(sc->ns_log)) { 1501 pci_nvme_status_genc(&compl->status, 1502 NVME_SC_INVALID_FIELD); 1503 break; 1504 } 1505 1506 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1507 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1508 MIN(logsize - logoff, sizeof(sc->ns_log)), 1509 NVME_COPY_TO_PRP); 1510 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1511 break; 1512 default: 1513 DPRINTF("%s get log page %x command not supported", 1514 __func__, logpage); 1515 1516 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1517 NVME_SC_INVALID_LOG_PAGE); 1518 } 1519 1520 return (1); 1521 } 1522 1523 static int 1524 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1525 struct nvme_completion* compl) 1526 { 1527 void *dest; 1528 uint16_t status; 1529 1530 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1531 command->cdw10 & 0xFF, command->nsid); 1532 1533 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1534 1535 switch (command->cdw10 & 0xFF) { 1536 case 0x00: /* return Identify Namespace data structure */ 1537 /* Global NS only valid with NS Management */ 1538 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1539 pci_nvme_status_genc(&status, 1540 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1541 break; 1542 } 1543 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1544 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1545 NVME_COPY_TO_PRP); 1546 break; 1547 case 0x01: /* return Identify Controller data structure */ 1548 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, 
command->prp1, 1549 command->prp2, (uint8_t *)&sc->ctrldata, 1550 sizeof(sc->ctrldata), 1551 NVME_COPY_TO_PRP); 1552 break; 1553 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1554 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1555 sizeof(uint32_t) * 1024); 1556 /* All unused entries shall be zero */ 1557 memset(dest, 0, sizeof(uint32_t) * 1024); 1558 ((uint32_t *)dest)[0] = 1; 1559 break; 1560 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1561 if (command->nsid != 1) { 1562 pci_nvme_status_genc(&status, 1563 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1564 break; 1565 } 1566 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1567 sizeof(uint32_t) * 1024); 1568 /* All bytes after the descriptor shall be zero */ 1569 memset(dest, 0, sizeof(uint32_t) * 1024); 1570 1571 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1572 ((uint8_t *)dest)[0] = 1; 1573 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1574 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1575 break; 1576 case 0x13: 1577 /* 1578 * Controller list is optional but used by UNH tests. Return 1579 * a valid but empty list. 1580 */ 1581 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1582 sizeof(uint16_t) * 2048); 1583 memset(dest, 0, sizeof(uint16_t) * 2048); 1584 break; 1585 default: 1586 DPRINTF("%s unsupported identify command requested 0x%x", 1587 __func__, command->cdw10 & 0xFF); 1588 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1589 break; 1590 } 1591 1592 compl->status = status; 1593 return (1); 1594 } 1595 1596 static const char * 1597 nvme_fid_to_name(uint8_t fid) 1598 { 1599 const char *name; 1600 1601 switch (fid) { 1602 case NVME_FEAT_ARBITRATION: 1603 name = "Arbitration"; 1604 break; 1605 case NVME_FEAT_POWER_MANAGEMENT: 1606 name = "Power Management"; 1607 break; 1608 case NVME_FEAT_LBA_RANGE_TYPE: 1609 name = "LBA Range Type"; 1610 break; 1611 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1612 name = "Temperature Threshold"; 1613 break; 1614 case NVME_FEAT_ERROR_RECOVERY: 1615 name = "Error Recovery"; 1616 break; 1617 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1618 name = "Volatile Write Cache"; 1619 break; 1620 case NVME_FEAT_NUMBER_OF_QUEUES: 1621 name = "Number of Queues"; 1622 break; 1623 case NVME_FEAT_INTERRUPT_COALESCING: 1624 name = "Interrupt Coalescing"; 1625 break; 1626 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1627 name = "Interrupt Vector Configuration"; 1628 break; 1629 case NVME_FEAT_WRITE_ATOMICITY: 1630 name = "Write Atomicity Normal"; 1631 break; 1632 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1633 name = "Asynchronous Event Configuration"; 1634 break; 1635 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1636 name = "Autonomous Power State Transition"; 1637 break; 1638 case NVME_FEAT_HOST_MEMORY_BUFFER: 1639 name = "Host Memory Buffer"; 1640 break; 1641 case NVME_FEAT_TIMESTAMP: 1642 name = "Timestamp"; 1643 break; 1644 case NVME_FEAT_KEEP_ALIVE_TIMER: 1645 name = "Keep Alive Timer"; 1646 break; 1647 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1648 name = "Host Controlled Thermal Management"; 1649 break; 1650 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1651 name = "Non-Operation Power State Config"; 1652 break; 1653 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1654 name = "Read Recovery Level Config"; 1655 break; 1656 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1657 name = "Predictable Latency Mode Config"; 1658 break; 1659 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1660 name = "Predictable Latency Mode Window"; 1661 break; 1662 case 
NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1663 name = "LBA Status Information Report Interval"; 1664 break; 1665 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1666 name = "Host Behavior Support"; 1667 break; 1668 case NVME_FEAT_SANITIZE_CONFIG: 1669 name = "Sanitize Config"; 1670 break; 1671 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1672 name = "Endurance Group Event Configuration"; 1673 break; 1674 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1675 name = "Software Progress Marker"; 1676 break; 1677 case NVME_FEAT_HOST_IDENTIFIER: 1678 name = "Host Identifier"; 1679 break; 1680 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1681 name = "Reservation Notification Mask"; 1682 break; 1683 case NVME_FEAT_RESERVATION_PERSISTENCE: 1684 name = "Reservation Persistence"; 1685 break; 1686 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1687 name = "Namespace Write Protection Config"; 1688 break; 1689 default: 1690 name = "Unknown"; 1691 break; 1692 } 1693 1694 return (name); 1695 } 1696 1697 static void 1698 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1699 struct nvme_feature_obj *feat __unused, 1700 struct nvme_command *command __unused, 1701 struct nvme_completion *compl) 1702 { 1703 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1704 } 1705 1706 static void 1707 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1708 struct nvme_feature_obj *feat __unused, 1709 struct nvme_command *command, 1710 struct nvme_completion *compl) 1711 { 1712 uint32_t i; 1713 uint32_t cdw11 = command->cdw11; 1714 uint16_t iv; 1715 bool cd; 1716 1717 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1718 1719 iv = cdw11 & 0xffff; 1720 cd = cdw11 & (1 << 16); 1721 1722 if (iv > (sc->max_queues + 1)) { 1723 return; 1724 } 1725 1726 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1727 if ((iv == 0) && !cd) 1728 return; 1729 1730 /* Requested Interrupt Vector must be used by a CQ */ 1731 for (i = 0; i < sc->num_cqueues + 1; i++) { 1732 if (sc->compl_queues[i].intr_vec == iv) { 1733 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1734 } 1735 } 1736 } 1737 1738 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1739 static void 1740 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1741 struct nvme_feature_obj *feat __unused, 1742 struct nvme_command *command, 1743 struct nvme_completion *compl) 1744 { 1745 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1746 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1747 } 1748 1749 #define NVME_TEMP_THRESH_OVER 0 1750 #define NVME_TEMP_THRESH_UNDER 1 1751 static void 1752 nvme_feature_temperature(struct pci_nvme_softc *sc, 1753 struct nvme_feature_obj *feat __unused, 1754 struct nvme_command *command, 1755 struct nvme_completion *compl) 1756 { 1757 uint16_t tmpth; /* Temperature Threshold */ 1758 uint8_t tmpsel; /* Threshold Temperature Select */ 1759 uint8_t thsel; /* Threshold Type Select */ 1760 bool set_crit = false; 1761 bool report_crit; 1762 1763 tmpth = command->cdw11 & 0xffff; 1764 tmpsel = (command->cdw11 >> 16) & 0xf; 1765 thsel = (command->cdw11 >> 20) & 0x3; 1766 1767 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1768 1769 /* Check for unsupported values */ 1770 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1771 (thsel > NVME_TEMP_THRESH_UNDER)) { 1772 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1773 return; 1774 } 1775 1776 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1777 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1778 set_crit = true; 1779 1780 pthread_mutex_lock(&sc->mtx); 1781 if (set_crit) 1782 sc->health_log.critical_warning |= 1783 NVME_CRIT_WARN_ST_TEMPERATURE; 1784 else 1785 sc->health_log.critical_warning &= 1786 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1787 pthread_mutex_unlock(&sc->mtx); 1788 1789 report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & 1790 NVME_CRIT_WARN_ST_TEMPERATURE; 1791 1792 if (set_crit && report_crit) 1793 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1794 sc->health_log.critical_warning); 1795 1796 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1797 } 1798 1799 static void 1800 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1801 struct nvme_feature_obj *feat __unused, 1802 struct nvme_command *command, 1803 struct nvme_completion *compl) 1804 { 1805 uint16_t nqr; /* Number of Queues Requested */ 1806 1807 if (sc->num_q_is_set) { 1808 WPRINTF("%s: Number of Queues already set", __func__); 1809 pci_nvme_status_genc(&compl->status, 1810 NVME_SC_COMMAND_SEQUENCE_ERROR); 1811 return; 1812 } 1813 1814 nqr = command->cdw11 & 0xFFFF; 1815 if (nqr == 0xffff) { 1816 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1817 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1818 return; 1819 } 1820 1821 sc->num_squeues = ONE_BASED(nqr); 1822 if (sc->num_squeues > sc->max_queues) { 1823 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1824 sc->max_queues); 1825 sc->num_squeues = sc->max_queues; 1826 } 1827 1828 nqr = (command->cdw11 >> 16) & 0xFFFF; 1829 if (nqr == 0xffff) { 1830 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1831 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1832 return; 1833 } 1834 1835 sc->num_cqueues = ONE_BASED(nqr); 1836 if (sc->num_cqueues > sc->max_queues) { 1837 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1838 sc->max_queues); 1839 sc->num_cqueues = sc->max_queues; 1840 } 1841 1842 /* Patch the command value which will be saved on callback's return */ 1843 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1844 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1845 1846 sc->num_q_is_set = true; 1847 } 1848 1849 static int 1850 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1851 struct nvme_completion *compl) 1852 { 1853 struct nvme_feature_obj *feat; 1854 uint32_t nsid = command->nsid; 1855 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); 1856 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); 1857 1858 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1859 1860 if (fid >= NVME_FID_MAX) { 1861 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1862 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1863 return (1); 1864 } 1865 1866 if (sv) { 1867 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1868 NVME_SC_FEATURE_NOT_SAVEABLE); 1869 return (1); 1870 } 1871 1872 feat = &sc->feat[fid]; 1873 1874 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1875 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1876 return (1); 1877 } 1878 1879 if (!feat->namespace_specific && 1880 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1881 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1882 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1883 return (1); 1884 } 1885 1886 compl->cdw0 = 0; 1887 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1888 1889 if (feat->set) 1890 feat->set(sc, feat, command, compl); 1891 else { 1892 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1893 NVME_SC_FEATURE_NOT_CHANGEABLE); 1894 return (1); 1895 } 1896 1897 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1898 if (compl->status == NVME_SC_SUCCESS) { 1899 feat->cdw11 = command->cdw11; 1900 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1901 (command->cdw11 != 0)) 1902 pci_nvme_aen_notify(sc); 1903 } 1904 1905 return (0); 1906 } 1907 1908 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1909 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1910 1911 static int 1912 
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1913 struct nvme_completion* compl)
1914 {
1915 struct nvme_feature_obj *feat;
1916 uint8_t fid = command->cdw10 & 0xFF;
1917 uint8_t sel = (command->cdw10 >> 8) & 0x7;
1918
1919 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1920
1921 if (fid >= NVME_FID_MAX) {
1922 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1923 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1924 return (1);
1925 }
1926
1927 compl->cdw0 = 0;
1928 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1929
1930 feat = &sc->feat[fid];
1931 if (feat->get) {
1932 feat->get(sc, feat, command, compl);
1933 }
1934
1935 if (compl->status == NVME_SC_SUCCESS) {
1936 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1937 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1938 else
1939 compl->cdw0 = feat->cdw11;
1940 }
1941
1942 return (0);
1943 }
1944
1945 static int
1946 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1947 struct nvme_completion* compl)
1948 {
1949 uint8_t ses, lbaf, pi;
1950
1951 /* Only supports Secure Erase Setting - User Data Erase */
1952 ses = (command->cdw10 >> 9) & 0x7;
1953 if (ses > 0x1) {
1954 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1955 return (1);
1956 }
1957
1958 /* Only supports a single LBA Format */
1959 lbaf = command->cdw10 & 0xf;
1960 if (lbaf != 0) {
1961 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1962 NVME_SC_INVALID_FORMAT);
1963 return (1);
1964 }
1965
1966 /* Doesn't support Protection Information */
1967 pi = (command->cdw10 >> 5) & 0x7;
1968 if (pi != 0) {
1969 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1970 return (1);
1971 }
1972
1973 if (sc->nvstore.type == NVME_STOR_RAM) {
1974 if (sc->nvstore.ctx)
1975 free(sc->nvstore.ctx);
1976 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1977 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1978 } else {
1979 struct pci_nvme_ioreq *req;
1980 int err;
1981
1982 req = pci_nvme_get_ioreq(sc);
1983 if (req == NULL) {
1984 pci_nvme_status_genc(&compl->status,
1985 NVME_SC_INTERNAL_DEVICE_ERROR);
1986 WPRINTF("%s: unable to allocate IO req", __func__);
1987 return (1);
1988 }
1989 req->nvme_sq = &sc->submit_queues[0];
1990 req->sqid = 0;
1991 req->opc = command->opc;
1992 req->cid = command->cid;
1993 req->nsid = command->nsid;
1994
1995 req->io_req.br_offset = 0;
1996 req->io_req.br_resid = sc->nvstore.size;
1997 req->io_req.br_callback = pci_nvme_io_done;
1998
1999 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2000 if (err) {
2001 pci_nvme_status_genc(&compl->status,
2002 NVME_SC_INTERNAL_DEVICE_ERROR);
2003 pci_nvme_release_ioreq(sc, req);
2004 } else
2005 compl->status = NVME_NO_STATUS;
2006 }
2007
2008 return (1);
2009 }
2010
2011 static int
2012 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2013 struct nvme_completion *compl)
2014 {
2015 DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2016 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2017
2018 /* TODO: search for the command ID and abort it */
2019
2020 compl->cdw0 = 1;
2021 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2022 return (1);
2023 }
2024
2025 static int
2026 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2027 struct nvme_command* command, struct nvme_completion* compl)
2028 {
2029 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2030 sc->aer_count,
sc->ctrldata.aerl, command->cid); 2031 2032 /* Don't exceed the Async Event Request Limit (AERL). */ 2033 if (pci_nvme_aer_limit_reached(sc)) { 2034 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2035 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2036 return (1); 2037 } 2038 2039 if (pci_nvme_aer_add(sc, command->cid)) { 2040 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2041 NVME_SC_INTERNAL_DEVICE_ERROR); 2042 return (1); 2043 } 2044 2045 /* 2046 * Raise events when they happen based on the Set Features cmd. 2047 * These events happen async, so only set completion successful if 2048 * there is an event reflective of the request to get event. 2049 */ 2050 compl->status = NVME_NO_STATUS; 2051 pci_nvme_aen_notify(sc); 2052 2053 return (0); 2054 } 2055 2056 static void 2057 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2058 { 2059 struct nvme_completion compl; 2060 struct nvme_command *cmd; 2061 struct nvme_submission_queue *sq; 2062 struct nvme_completion_queue *cq; 2063 uint16_t sqhead; 2064 2065 DPRINTF("%s index %u", __func__, (uint32_t)value); 2066 2067 sq = &sc->submit_queues[0]; 2068 cq = &sc->compl_queues[0]; 2069 2070 pthread_mutex_lock(&sq->mtx); 2071 2072 sqhead = sq->head; 2073 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2074 2075 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2076 cmd = &(sq->qbase)[sqhead]; 2077 compl.cdw0 = 0; 2078 compl.status = 0; 2079 2080 switch (cmd->opc) { 2081 case NVME_OPC_DELETE_IO_SQ: 2082 DPRINTF("%s command DELETE_IO_SQ", __func__); 2083 nvme_opc_delete_io_sq(sc, cmd, &compl); 2084 break; 2085 case NVME_OPC_CREATE_IO_SQ: 2086 DPRINTF("%s command CREATE_IO_SQ", __func__); 2087 nvme_opc_create_io_sq(sc, cmd, &compl); 2088 break; 2089 case NVME_OPC_DELETE_IO_CQ: 2090 DPRINTF("%s command DELETE_IO_CQ", __func__); 2091 nvme_opc_delete_io_cq(sc, cmd, &compl); 2092 break; 2093 case NVME_OPC_CREATE_IO_CQ: 2094 DPRINTF("%s command CREATE_IO_CQ", __func__); 2095 nvme_opc_create_io_cq(sc, cmd, &compl); 2096 break; 2097 case NVME_OPC_GET_LOG_PAGE: 2098 DPRINTF("%s command GET_LOG_PAGE", __func__); 2099 nvme_opc_get_log_page(sc, cmd, &compl); 2100 break; 2101 case NVME_OPC_IDENTIFY: 2102 DPRINTF("%s command IDENTIFY", __func__); 2103 nvme_opc_identify(sc, cmd, &compl); 2104 break; 2105 case NVME_OPC_ABORT: 2106 DPRINTF("%s command ABORT", __func__); 2107 nvme_opc_abort(sc, cmd, &compl); 2108 break; 2109 case NVME_OPC_SET_FEATURES: 2110 DPRINTF("%s command SET_FEATURES", __func__); 2111 nvme_opc_set_features(sc, cmd, &compl); 2112 break; 2113 case NVME_OPC_GET_FEATURES: 2114 DPRINTF("%s command GET_FEATURES", __func__); 2115 nvme_opc_get_features(sc, cmd, &compl); 2116 break; 2117 case NVME_OPC_FIRMWARE_ACTIVATE: 2118 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2119 pci_nvme_status_tc(&compl.status, 2120 NVME_SCT_COMMAND_SPECIFIC, 2121 NVME_SC_INVALID_FIRMWARE_SLOT); 2122 break; 2123 case NVME_OPC_ASYNC_EVENT_REQUEST: 2124 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2125 nvme_opc_async_event_req(sc, cmd, &compl); 2126 break; 2127 case NVME_OPC_FORMAT_NVM: 2128 DPRINTF("%s command FORMAT_NVM", __func__); 2129 if ((sc->ctrldata.oacs & 2130 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2131 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2132 break; 2133 } 2134 nvme_opc_format_nvm(sc, cmd, &compl); 2135 break; 2136 case NVME_OPC_SECURITY_SEND: 2137 case NVME_OPC_SECURITY_RECEIVE: 2138 case NVME_OPC_SANITIZE: 2139 case NVME_OPC_GET_LBA_STATUS: 2140 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__,
2141 cmd->opc);
2142 /* Valid but unsupported opcodes */
2143 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2144 break;
2145 default:
2146 DPRINTF("%s command OPC=%#X (not implemented)",
2147 __func__,
2148 cmd->opc);
2149 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2150 }
2151 sqhead = (sqhead + 1) % sq->size;
2152
2153 if (NVME_COMPLETION_VALID(compl)) {
2154 pci_nvme_cq_update(sc, &sc->compl_queues[0],
2155 compl.cdw0,
2156 cmd->cid,
2157 0, /* SQID */
2158 compl.status);
2159 }
2160 }
2161
2162 DPRINTF("setting sqhead %u", sqhead);
2163 sq->head = sqhead;
2164
2165 if (cq->head != cq->tail)
2166 pci_generate_msix(sc->nsc_pi, 0);
2167
2168 pthread_mutex_unlock(&sq->mtx);
2169 }
2170
2171 /*
2172 * Update the Write and Read statistics reported in SMART data
2173 *
2174 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2175 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2176 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2177 */
2178 static void
2179 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2180 size_t bytes, uint16_t status)
2181 {
2182
2183 pthread_mutex_lock(&sc->mtx);
2184 switch (opc) {
2185 case NVME_OPC_WRITE:
2186 sc->write_commands++;
2187 if (status != NVME_SC_SUCCESS)
2188 break;
2189 sc->write_dunits_remainder += (bytes / 512);
2190 while (sc->write_dunits_remainder >= 1000) {
2191 sc->write_data_units++;
2192 sc->write_dunits_remainder -= 1000;
2193 }
2194 break;
2195 case NVME_OPC_READ:
2196 sc->read_commands++;
2197 if (status != NVME_SC_SUCCESS)
2198 break;
2199 sc->read_dunits_remainder += (bytes / 512);
2200 while (sc->read_dunits_remainder >= 1000) {
2201 sc->read_data_units++;
2202 sc->read_dunits_remainder -= 1000;
2203 }
2204 break;
2205 default:
2206 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2207 break;
2208 }
2209 pthread_mutex_unlock(&sc->mtx);
2210 }
2211
2212 /*
2213 * Check if the combination of Starting LBA (slba) and number of blocks
2214 * exceeds the range of the underlying storage.
2215 *
2216 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2217 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2218 * overflow.
2219 */
2220 static bool
2221 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2222 uint32_t nblocks)
2223 {
2224 size_t offset, bytes;
2225
2226 /* Overflow check of multiplying Starting LBA by the sector size */
2227 if (slba >> (64 - nvstore->sectsz_bits))
2228 return (true);
2229
2230 offset = slba << nvstore->sectsz_bits;
2231 bytes = nblocks << nvstore->sectsz_bits;
2232
2233 /* Overflow check of Number of Logical Blocks */
2234 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2235 return (true);
2236
2237 return (false);
2238 }
2239
2240 static int
2241 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2242 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2243 {
2244 int iovidx;
2245 bool range_is_contiguous;
2246
2247 if (req == NULL)
2248 return (-1);
2249
2250 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2251 return (-1);
2252 }
2253
2254 /*
2255 * Minimize the number of IOVs by concatenating contiguous address
2256 * ranges. If the IOV count is zero, there is no previous range to
2257 * concatenate.
2258 */ 2259 if (req->io_req.br_iovcnt == 0) 2260 range_is_contiguous = false; 2261 else 2262 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2263 2264 if (range_is_contiguous) { 2265 iovidx = req->io_req.br_iovcnt - 1; 2266 2267 req->io_req.br_iov[iovidx].iov_base = 2268 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2269 req->prev_gpaddr, size); 2270 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2271 return (-1); 2272 2273 req->prev_size += size; 2274 req->io_req.br_resid += size; 2275 2276 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2277 } else { 2278 iovidx = req->io_req.br_iovcnt; 2279 if (iovidx == 0) { 2280 req->io_req.br_offset = offset; 2281 req->io_req.br_resid = 0; 2282 req->io_req.br_param = req; 2283 } 2284 2285 req->io_req.br_iov[iovidx].iov_base = 2286 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2287 gpaddr, size); 2288 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2289 return (-1); 2290 2291 req->io_req.br_iov[iovidx].iov_len = size; 2292 2293 req->prev_gpaddr = gpaddr; 2294 req->prev_size = size; 2295 req->io_req.br_resid += size; 2296 2297 req->io_req.br_iovcnt++; 2298 } 2299 2300 return (0); 2301 } 2302 2303 static void 2304 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2305 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2306 { 2307 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2308 2309 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2310 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2311 NVME_STATUS_GET_SC(status)); 2312 2313 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2314 2315 if (cq->head != cq->tail) { 2316 if (cq->intr_en & NVME_CQ_INTEN) { 2317 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2318 } else { 2319 DPRINTF("%s: CQ%u interrupt disabled", 2320 __func__, sq->cqid); 2321 } 2322 } 2323 } 2324 2325 static void 2326 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2327 { 2328 req->sc = NULL; 2329 req->nvme_sq = NULL; 2330 req->sqid = 0; 2331 2332 pthread_mutex_lock(&sc->mtx); 2333 2334 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2335 sc->pending_ios--; 2336 2337 /* when no more IO pending, can set to ready if device reset/enabled */ 2338 if (sc->pending_ios == 0 && 2339 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2340 sc->regs.csts |= NVME_CSTS_RDY; 2341 2342 pthread_mutex_unlock(&sc->mtx); 2343 2344 sem_post(&sc->iosemlock); 2345 } 2346 2347 static struct pci_nvme_ioreq * 2348 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2349 { 2350 struct pci_nvme_ioreq *req = NULL; 2351 2352 sem_wait(&sc->iosemlock); 2353 pthread_mutex_lock(&sc->mtx); 2354 2355 req = STAILQ_FIRST(&sc->ioreqs_free); 2356 assert(req != NULL); 2357 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2358 2359 req->sc = sc; 2360 2361 sc->pending_ios++; 2362 2363 pthread_mutex_unlock(&sc->mtx); 2364 2365 req->io_req.br_iovcnt = 0; 2366 req->io_req.br_offset = 0; 2367 req->io_req.br_resid = 0; 2368 req->io_req.br_param = req; 2369 req->prev_gpaddr = 0; 2370 req->prev_size = 0; 2371 2372 return req; 2373 } 2374 2375 static void 2376 pci_nvme_io_done(struct blockif_req *br, int err) 2377 { 2378 struct pci_nvme_ioreq *req = br->br_param; 2379 struct nvme_submission_queue *sq = req->nvme_sq; 2380 uint16_t code, status; 2381 2382 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2383 2384 /* TODO return correct error */ 2385 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2386 pci_nvme_status_genc(&status, code); 2387 2388 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2389 pci_nvme_stats_write_read_update(req->sc, req->opc, 2390 req->bytes, status); 2391 pci_nvme_release_ioreq(req->sc, req); 2392 } 2393 2394 /* 2395 * Implements the Flush command. The specification states: 2396 * If a volatile write cache is not present, Flush commands complete 2397 * successfully and have no effect 2398 * in the description of the Volatile Write Cache (VWC) field of the Identify 2399 * Controller data. Therefore, set status to Success if the command is 2400 * not supported (i.e. RAM or as indicated by the blockif). 2401 */ 2402 static bool 2403 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2404 struct nvme_command *cmd __unused, 2405 struct pci_nvme_blockstore *nvstore, 2406 struct pci_nvme_ioreq *req, 2407 uint16_t *status) 2408 { 2409 bool pending = false; 2410 2411 if (nvstore->type == NVME_STOR_RAM) { 2412 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2413 } else { 2414 int err; 2415 2416 req->io_req.br_callback = pci_nvme_io_done; 2417 2418 err = blockif_flush(nvstore->ctx, &req->io_req); 2419 switch (err) { 2420 case 0: 2421 pending = true; 2422 break; 2423 case EOPNOTSUPP: 2424 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2425 break; 2426 default: 2427 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2428 } 2429 } 2430 2431 return (pending); 2432 } 2433 2434 static uint16_t 2435 nvme_write_read_ram(struct pci_nvme_softc *sc, 2436 struct pci_nvme_blockstore *nvstore, 2437 uint64_t prp1, uint64_t prp2, 2438 size_t offset, uint64_t bytes, 2439 bool is_write) 2440 { 2441 uint8_t *buf = nvstore->ctx; 2442 enum nvme_copy_dir dir; 2443 uint16_t status; 2444 2445 if (is_write) 2446 dir = NVME_COPY_TO_PRP; 2447 else 2448 dir = NVME_COPY_FROM_PRP; 2449 2450 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2451 buf + offset, bytes, dir)) 2452 pci_nvme_status_genc(&status, 2453 NVME_SC_DATA_TRANSFER_ERROR); 2454 else 2455 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2456 2457 return (status); 2458 } 2459 2460 static uint16_t 2461 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2462 struct pci_nvme_blockstore *nvstore, 2463 struct pci_nvme_ioreq *req, 2464 uint64_t prp1, uint64_t prp2, 2465 size_t offset, uint64_t bytes, 2466 bool is_write) 2467 { 2468 uint64_t size; 2469 int err; 2470 uint16_t status = NVME_NO_STATUS; 2471 2472 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2473 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2474 err = -1; 2475 goto out; 2476 } 2477 2478 offset += size; 2479 bytes -= size; 2480 2481 if (bytes == 0) { 2482 ; 2483 } else if (bytes <= PAGE_SIZE) { 2484 size = bytes; 2485 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2486 err = -1; 2487 goto out; 2488 } 2489 } else { 2490 void *vmctx = sc->nsc_pi->pi_vmctx; 2491 uint64_t *prp_list = &prp2; 2492 uint64_t *last = prp_list; 2493 2494 /* PRP2 is pointer to a physical region page list */ 2495 while (bytes) { 2496 /* Last entry in list points to the next list */ 2497 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2498 uint64_t prp = *prp_list; 2499 2500 prp_list = paddr_guest2host(vmctx, prp, 2501 PAGE_SIZE - (prp % PAGE_SIZE)); 2502 if (prp_list == NULL) { 2503 err = -1; 2504 goto out; 2505 } 2506 last = prp_list + (NVME_PRP2_ITEMS - 1); 2507 } 2508 2509 size = MIN(bytes, PAGE_SIZE); 2510 2511 if (pci_nvme_append_iov_req(sc, req, *prp_list, size, 2512 offset)) { 2513 err = 
-1; 2514 goto out; 2515 } 2516 2517 offset += size; 2518 bytes -= size; 2519 2520 prp_list++; 2521 } 2522 } 2523 req->io_req.br_callback = pci_nvme_io_done; 2524 if (is_write) 2525 err = blockif_write(nvstore->ctx, &req->io_req); 2526 else 2527 err = blockif_read(nvstore->ctx, &req->io_req); 2528 out: 2529 if (err) 2530 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2531 2532 return (status); 2533 } 2534 2535 static bool 2536 nvme_opc_write_read(struct pci_nvme_softc *sc, 2537 struct nvme_command *cmd, 2538 struct pci_nvme_blockstore *nvstore, 2539 struct pci_nvme_ioreq *req, 2540 uint16_t *status) 2541 { 2542 uint64_t lba, nblocks, bytes; 2543 size_t offset; 2544 bool is_write = cmd->opc == NVME_OPC_WRITE; 2545 bool pending = false; 2546 2547 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2548 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2549 bytes = nblocks << nvstore->sectsz_bits; 2550 if (bytes > NVME_MAX_DATA_SIZE) { 2551 WPRINTF("%s command would exceed MDTS", __func__); 2552 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2553 goto out; 2554 } 2555 2556 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2557 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2558 __func__, lba, nblocks); 2559 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2560 goto out; 2561 } 2562 2563 offset = lba << nvstore->sectsz_bits; 2564 2565 req->bytes = bytes; 2566 req->io_req.br_offset = lba; 2567 2568 /* PRP bits 1:0 must be zero */ 2569 cmd->prp1 &= ~0x3UL; 2570 cmd->prp2 &= ~0x3UL; 2571 2572 if (nvstore->type == NVME_STOR_RAM) { 2573 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2574 cmd->prp2, offset, bytes, is_write); 2575 } else { 2576 *status = nvme_write_read_blockif(sc, nvstore, req, 2577 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2578 2579 if (*status == NVME_NO_STATUS) 2580 pending = true; 2581 } 2582 out: 2583 if (!pending) 2584 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2585 2586 return (pending); 2587 } 2588 2589 static void 2590 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2591 { 2592 struct pci_nvme_ioreq *req = br->br_param; 2593 struct pci_nvme_softc *sc = req->sc; 2594 bool done = true; 2595 uint16_t status; 2596 2597 if (err) { 2598 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2599 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2600 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2601 } else { 2602 struct iovec *iov = req->io_req.br_iov; 2603 2604 req->prev_gpaddr++; 2605 iov += req->prev_gpaddr; 2606 2607 /* The iov_* values already include the sector size */ 2608 req->io_req.br_offset = (off_t)iov->iov_base; 2609 req->io_req.br_resid = iov->iov_len; 2610 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2611 pci_nvme_status_genc(&status, 2612 NVME_SC_INTERNAL_DEVICE_ERROR); 2613 } else 2614 done = false; 2615 } 2616 2617 if (done) { 2618 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2619 status); 2620 pci_nvme_release_ioreq(sc, req); 2621 } 2622 } 2623 2624 static bool 2625 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2626 struct nvme_command *cmd, 2627 struct pci_nvme_blockstore *nvstore, 2628 struct pci_nvme_ioreq *req, 2629 uint16_t *status) 2630 { 2631 struct nvme_dsm_range *range = NULL; 2632 uint32_t nr, r, non_zero, dr; 2633 int err; 2634 bool pending = false; 2635 2636 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2637 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2638 goto out; 2639 } 2640 2641 nr = cmd->cdw10 & 0xff; 2642 2643 /* 
copy locally because a range entry could straddle PRPs */ 2644 range = calloc(1, NVME_MAX_DSM_TRIM); 2645 if (range == NULL) { 2646 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2647 goto out; 2648 } 2649 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2650 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2651 2652 /* Check for invalid ranges and the number of non-zero lengths */ 2653 non_zero = 0; 2654 for (r = 0; r <= nr; r++) { 2655 if (pci_nvme_out_of_range(nvstore, 2656 range[r].starting_lba, range[r].length)) { 2657 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2658 goto out; 2659 } 2660 if (range[r].length != 0) 2661 non_zero++; 2662 } 2663 2664 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2665 size_t offset, bytes; 2666 int sectsz_bits = sc->nvstore.sectsz_bits; 2667 2668 /* 2669 * DSM calls are advisory only, and compliant controllers 2670 * may choose to take no actions (i.e. return Success). 2671 */ 2672 if (!nvstore->deallocate) { 2673 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2674 goto out; 2675 } 2676 2677 /* If all ranges have a zero length, return Success */ 2678 if (non_zero == 0) { 2679 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2680 goto out; 2681 } 2682 2683 if (req == NULL) { 2684 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2685 goto out; 2686 } 2687 2688 offset = range[0].starting_lba << sectsz_bits; 2689 bytes = range[0].length << sectsz_bits; 2690 2691 /* 2692 * If the request is for more than a single range, store 2693 * the ranges in the br_iov. Optimize for the common case 2694 * of a single range. 2695 * 2696 * Note that NVMe Number of Ranges is a zero based value 2697 */ 2698 req->io_req.br_iovcnt = 0; 2699 req->io_req.br_offset = offset; 2700 req->io_req.br_resid = bytes; 2701 2702 if (nr == 0) { 2703 req->io_req.br_callback = pci_nvme_io_done; 2704 } else { 2705 struct iovec *iov = req->io_req.br_iov; 2706 2707 for (r = 0, dr = 0; r <= nr; r++) { 2708 offset = range[r].starting_lba << sectsz_bits; 2709 bytes = range[r].length << sectsz_bits; 2710 if (bytes == 0) 2711 continue; 2712 2713 if ((nvstore->size - offset) < bytes) { 2714 pci_nvme_status_genc(status, 2715 NVME_SC_LBA_OUT_OF_RANGE); 2716 goto out; 2717 } 2718 iov[dr].iov_base = (void *)offset; 2719 iov[dr].iov_len = bytes; 2720 dr++; 2721 } 2722 req->io_req.br_callback = pci_nvme_dealloc_sm; 2723 2724 /* 2725 * Use prev_gpaddr to track the current entry and 2726 * prev_size to track the number of entries 2727 */ 2728 req->prev_gpaddr = 0; 2729 req->prev_size = dr; 2730 } 2731 2732 err = blockif_delete(nvstore->ctx, &req->io_req); 2733 if (err) 2734 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2735 else 2736 pending = true; 2737 } 2738 out: 2739 free(range); 2740 return (pending); 2741 } 2742 2743 static void 2744 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2745 { 2746 struct nvme_submission_queue *sq; 2747 uint16_t status; 2748 uint16_t sqhead; 2749 2750 /* handle all submissions up to sq->tail index */ 2751 sq = &sc->submit_queues[idx]; 2752 2753 pthread_mutex_lock(&sq->mtx); 2754 2755 sqhead = sq->head; 2756 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2757 idx, sqhead, sq->tail, sq->qbase); 2758 2759 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2760 struct nvme_command *cmd; 2761 struct pci_nvme_ioreq *req; 2762 uint32_t nsid; 2763 bool pending; 2764 2765 pending = false; 2766 req = NULL; 2767 status = 0; 2768 2769 cmd = &sq->qbase[sqhead]; 2770 sqhead = (sqhead + 1) % 
sq->size; 2771 2772 nsid = le32toh(cmd->nsid); 2773 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2774 pci_nvme_status_genc(&status, 2775 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2776 status |= 2777 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2778 goto complete; 2779 } 2780 2781 req = pci_nvme_get_ioreq(sc); 2782 if (req == NULL) { 2783 pci_nvme_status_genc(&status, 2784 NVME_SC_INTERNAL_DEVICE_ERROR); 2785 WPRINTF("%s: unable to allocate IO req", __func__); 2786 goto complete; 2787 } 2788 req->nvme_sq = sq; 2789 req->sqid = idx; 2790 req->opc = cmd->opc; 2791 req->cid = cmd->cid; 2792 req->nsid = cmd->nsid; 2793 2794 switch (cmd->opc) { 2795 case NVME_OPC_FLUSH: 2796 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2797 req, &status); 2798 break; 2799 case NVME_OPC_WRITE: 2800 case NVME_OPC_READ: 2801 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2802 req, &status); 2803 break; 2804 case NVME_OPC_WRITE_ZEROES: 2805 /* TODO: write zeroes 2806 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2807 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2808 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2809 break; 2810 case NVME_OPC_DATASET_MANAGEMENT: 2811 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2812 req, &status); 2813 break; 2814 default: 2815 WPRINTF("%s unhandled io command 0x%x", 2816 __func__, cmd->opc); 2817 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2818 } 2819 complete: 2820 if (!pending) { 2821 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2822 if (req != NULL) 2823 pci_nvme_release_ioreq(sc, req); 2824 } 2825 } 2826 2827 sq->head = sqhead; 2828 2829 pthread_mutex_unlock(&sq->mtx); 2830 } 2831 2832 static void 2833 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2834 uint64_t idx, int is_sq, uint64_t value) 2835 { 2836 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2837 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2838 2839 if (is_sq) { 2840 if (idx > sc->num_squeues) { 2841 WPRINTF("%s queue index %lu overflow from " 2842 "guest (max %u)", 2843 __func__, idx, sc->num_squeues); 2844 return; 2845 } 2846 2847 atomic_store_short(&sc->submit_queues[idx].tail, 2848 (uint16_t)value); 2849 2850 if (idx == 0) { 2851 pci_nvme_handle_admin_cmd(sc, value); 2852 } else { 2853 /* submission queue; handle new entries in SQ */ 2854 if (idx > sc->num_squeues) { 2855 WPRINTF("%s SQ index %lu overflow from " 2856 "guest (max %u)", 2857 __func__, idx, sc->num_squeues); 2858 return; 2859 } 2860 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2861 } 2862 } else { 2863 if (idx > sc->num_cqueues) { 2864 WPRINTF("%s queue index %lu overflow from " 2865 "guest (max %u)", 2866 __func__, idx, sc->num_cqueues); 2867 return; 2868 } 2869 2870 atomic_store_short(&sc->compl_queues[idx].head, 2871 (uint16_t)value); 2872 } 2873 } 2874 2875 static void 2876 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2877 { 2878 const char *s = iswrite ? 
"WRITE" : "READ"; 2879 2880 switch (offset) { 2881 case NVME_CR_CAP_LOW: 2882 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2883 break; 2884 case NVME_CR_CAP_HI: 2885 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2886 break; 2887 case NVME_CR_VS: 2888 DPRINTF("%s %s NVME_CR_VS", func, s); 2889 break; 2890 case NVME_CR_INTMS: 2891 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2892 break; 2893 case NVME_CR_INTMC: 2894 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2895 break; 2896 case NVME_CR_CC: 2897 DPRINTF("%s %s NVME_CR_CC", func, s); 2898 break; 2899 case NVME_CR_CSTS: 2900 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2901 break; 2902 case NVME_CR_NSSR: 2903 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2904 break; 2905 case NVME_CR_AQA: 2906 DPRINTF("%s %s NVME_CR_AQA", func, s); 2907 break; 2908 case NVME_CR_ASQ_LOW: 2909 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2910 break; 2911 case NVME_CR_ASQ_HI: 2912 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2913 break; 2914 case NVME_CR_ACQ_LOW: 2915 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2916 break; 2917 case NVME_CR_ACQ_HI: 2918 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2919 break; 2920 default: 2921 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2922 } 2923 2924 } 2925 2926 static void 2927 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2928 uint64_t offset, int size, uint64_t value) 2929 { 2930 uint32_t ccreg; 2931 2932 if (offset >= NVME_DOORBELL_OFFSET) { 2933 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2934 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2935 int is_sq = (belloffset % 8) < 4; 2936 2937 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2938 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2939 offset); 2940 return; 2941 } 2942 2943 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2944 WPRINTF("guest attempted an overflow write offset " 2945 "0x%lx, val 0x%lx in %s", 2946 offset, value, __func__); 2947 return; 2948 } 2949 2950 if (is_sq) { 2951 if (sc->submit_queues[idx].qbase == NULL) 2952 return; 2953 } else if (sc->compl_queues[idx].qbase == NULL) 2954 return; 2955 2956 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2957 return; 2958 } 2959 2960 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2961 offset, size, value); 2962 2963 if (size != 4) { 2964 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2965 "val 0x%lx) to bar0 in %s", 2966 size, offset, value, __func__); 2967 /* TODO: shutdown device */ 2968 return; 2969 } 2970 2971 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2972 2973 pthread_mutex_lock(&sc->mtx); 2974 2975 switch (offset) { 2976 case NVME_CR_CAP_LOW: 2977 case NVME_CR_CAP_HI: 2978 /* readonly */ 2979 break; 2980 case NVME_CR_VS: 2981 /* readonly */ 2982 break; 2983 case NVME_CR_INTMS: 2984 /* MSI-X, so ignore */ 2985 break; 2986 case NVME_CR_INTMC: 2987 /* MSI-X, so ignore */ 2988 break; 2989 case NVME_CR_CC: 2990 ccreg = (uint32_t)value; 2991 2992 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2993 "iocqes %u", 2994 __func__, 2995 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2996 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2997 NVME_CC_GET_IOCQES(ccreg)); 2998 2999 if (NVME_CC_GET_SHN(ccreg)) { 3000 /* perform shutdown - flush out data to backend */ 3001 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 3002 NVME_CSTS_REG_SHST_SHIFT); 3003 sc->regs.csts |= NVME_SHST_COMPLETE << 3004 NVME_CSTS_REG_SHST_SHIFT; 3005 } 3006 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3007 if (NVME_CC_GET_EN(ccreg) == 0) 3008 /* transition 1-> causes 
controller reset */ 3009 pci_nvme_reset_locked(sc); 3010 else 3011 pci_nvme_init_controller(ctx, sc); 3012 } 3013 3014 /* Insert the iocqes, iosqes and en bits from the write */ 3015 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3016 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3017 if (NVME_CC_GET_EN(ccreg) == 0) { 3018 /* Insert the ams, mps and css bit fields */ 3019 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3020 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3021 sc->regs.csts &= ~NVME_CSTS_RDY; 3022 } else if ((sc->pending_ios == 0) && 3023 !(sc->regs.csts & NVME_CSTS_CFS)) { 3024 sc->regs.csts |= NVME_CSTS_RDY; 3025 } 3026 break; 3027 case NVME_CR_CSTS: 3028 break; 3029 case NVME_CR_NSSR: 3030 /* ignore writes; don't support subsystem reset */ 3031 break; 3032 case NVME_CR_AQA: 3033 sc->regs.aqa = (uint32_t)value; 3034 break; 3035 case NVME_CR_ASQ_LOW: 3036 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3037 (0xFFFFF000 & value); 3038 break; 3039 case NVME_CR_ASQ_HI: 3040 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3041 (value << 32); 3042 break; 3043 case NVME_CR_ACQ_LOW: 3044 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3045 (0xFFFFF000 & value); 3046 break; 3047 case NVME_CR_ACQ_HI: 3048 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3049 (value << 32); 3050 break; 3051 default: 3052 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3053 __func__, offset, value, size); 3054 } 3055 pthread_mutex_unlock(&sc->mtx); 3056 } 3057 3058 static void 3059 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3060 int baridx, uint64_t offset, int size, uint64_t value) 3061 { 3062 struct pci_nvme_softc* sc = pi->pi_arg; 3063 3064 if (baridx == pci_msix_table_bar(pi) || 3065 baridx == pci_msix_pba_bar(pi)) { 3066 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3067 " value 0x%lx", baridx, offset, size, value); 3068 3069 pci_emul_msix_twrite(pi, offset, size, value); 3070 return; 3071 } 3072 3073 switch (baridx) { 3074 case 0: 3075 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3076 break; 3077 3078 default: 3079 DPRINTF("%s unknown baridx %d, val 0x%lx", 3080 __func__, baridx, value); 3081 } 3082 } 3083 3084 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3085 uint64_t offset, int size) 3086 { 3087 uint64_t value; 3088 3089 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3090 3091 if (offset < NVME_DOORBELL_OFFSET) { 3092 void *p = &(sc->regs); 3093 pthread_mutex_lock(&sc->mtx); 3094 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3095 pthread_mutex_unlock(&sc->mtx); 3096 } else { 3097 value = 0; 3098 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3099 } 3100 3101 switch (size) { 3102 case 1: 3103 value &= 0xFF; 3104 break; 3105 case 2: 3106 value &= 0xFFFF; 3107 break; 3108 case 4: 3109 value &= 0xFFFFFFFF; 3110 break; 3111 } 3112 3113 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3114 offset, size, (uint32_t)value); 3115 3116 return (value); 3117 } 3118 3119 3120 3121 static uint64_t 3122 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3123 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3124 { 3125 struct pci_nvme_softc* sc = pi->pi_arg; 3126 3127 if (baridx == pci_msix_table_bar(pi) || 3128 baridx == pci_msix_pba_bar(pi)) { 3129 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3130 baridx, offset, size); 3131 3132 return pci_emul_msix_tread(pi, offset, size); 3133 } 3134 3135 switch (baridx) { 3136 case 0: 3137 return pci_nvme_read_bar_0(sc, 
offset, size); 3138 3139 default: 3140 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3141 } 3142 3143 return (0); 3144 } 3145 3146 static int 3147 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3148 { 3149 char bident[sizeof("XX:X:X")]; 3150 const char *value; 3151 uint32_t sectsz; 3152 3153 sc->max_queues = NVME_QUEUES; 3154 sc->max_qentries = NVME_MAX_QENTRIES; 3155 sc->ioslots = NVME_IOSLOTS; 3156 sc->num_squeues = sc->max_queues; 3157 sc->num_cqueues = sc->max_queues; 3158 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3159 sectsz = 0; 3160 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3161 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3162 3163 value = get_config_value_node(nvl, "maxq"); 3164 if (value != NULL) 3165 sc->max_queues = atoi(value); 3166 value = get_config_value_node(nvl, "qsz"); 3167 if (value != NULL) { 3168 sc->max_qentries = atoi(value); 3169 if (sc->max_qentries <= 0) { 3170 EPRINTLN("nvme: Invalid qsz option %d", 3171 sc->max_qentries); 3172 return (-1); 3173 } 3174 } 3175 value = get_config_value_node(nvl, "ioslots"); 3176 if (value != NULL) { 3177 sc->ioslots = atoi(value); 3178 if (sc->ioslots <= 0) { 3179 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3180 return (-1); 3181 } 3182 } 3183 value = get_config_value_node(nvl, "sectsz"); 3184 if (value != NULL) 3185 sectsz = atoi(value); 3186 value = get_config_value_node(nvl, "ser"); 3187 if (value != NULL) { 3188 /* 3189 * This field indicates the Product Serial Number in 3190 * 7-bit ASCII, unused bytes should be space characters. 3191 * Ref: NVMe v1.3c. 3192 */ 3193 cpywithpad((char *)sc->ctrldata.sn, 3194 sizeof(sc->ctrldata.sn), value, ' '); 3195 } 3196 value = get_config_value_node(nvl, "eui64"); 3197 if (value != NULL) 3198 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3199 value = get_config_value_node(nvl, "dsm"); 3200 if (value != NULL) { 3201 if (strcmp(value, "auto") == 0) 3202 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3203 else if (strcmp(value, "enable") == 0) 3204 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3205 else if (strcmp(value, "disable") == 0) 3206 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3207 } 3208 3209 value = get_config_value_node(nvl, "ram"); 3210 if (value != NULL) { 3211 uint64_t sz = strtoull(value, NULL, 10); 3212 3213 sc->nvstore.type = NVME_STOR_RAM; 3214 sc->nvstore.size = sz * 1024 * 1024; 3215 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3216 sc->nvstore.sectsz = 4096; 3217 sc->nvstore.sectsz_bits = 12; 3218 if (sc->nvstore.ctx == NULL) { 3219 EPRINTLN("nvme: Unable to allocate RAM"); 3220 return (-1); 3221 } 3222 } else { 3223 snprintf(bident, sizeof(bident), "%d:%d", 3224 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3225 sc->nvstore.ctx = blockif_open(nvl, bident); 3226 if (sc->nvstore.ctx == NULL) { 3227 EPRINTLN("nvme: Could not open backing file: %s", 3228 strerror(errno)); 3229 return (-1); 3230 } 3231 sc->nvstore.type = NVME_STOR_BLOCKIF; 3232 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3233 } 3234 3235 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3236 sc->nvstore.sectsz = sectsz; 3237 else if (sc->nvstore.type != NVME_STOR_RAM) 3238 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3239 for (sc->nvstore.sectsz_bits = 9; 3240 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3241 sc->nvstore.sectsz_bits++); 3242 3243 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3244 sc->max_queues = NVME_QUEUES; 3245 3246 return (0); 3247 } 3248 3249 
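/*
 * Illustrative configurations accepted by pci_nvme_parse_config() above.
 * The slot number and backing paths are hypothetical examples, not defaults:
 *
 *   -s 4,nvme,/dev/zvol/tank/vm0-disk0,maxq=8,qsz=1024,ioslots=16,sectsz=4096
 *   -s 4,nvme,ram=1024,ser=NVME0001,dsm=enable
 *
 * A "ram=<MiB>" value selects the NVME_STOR_RAM backend with a fixed 4096
 * byte sector size; any other devpath is opened through blockif_open() and
 * uses the blockif sector size unless "sectsz" overrides it.
 */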
static void 3250 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3251 size_t new_size) 3252 { 3253 struct pci_nvme_softc *sc; 3254 struct pci_nvme_blockstore *nvstore; 3255 struct nvme_namespace_data *nd; 3256 3257 sc = arg; 3258 nvstore = &sc->nvstore; 3259 nd = &sc->nsdata; 3260 3261 nvstore->size = new_size; 3262 pci_nvme_init_nsdata_size(nvstore, nd); 3263 3264 /* Add changed NSID to list */ 3265 sc->ns_log.ns[0] = 1; 3266 sc->ns_log.ns[1] = 0; 3267 3268 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3269 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3270 } 3271 3272 static int 3273 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3274 { 3275 struct pci_nvme_softc *sc; 3276 uint32_t pci_membar_sz; 3277 int error; 3278 3279 error = 0; 3280 3281 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3282 pi->pi_arg = sc; 3283 sc->nsc_pi = pi; 3284 3285 error = pci_nvme_parse_config(sc, nvl); 3286 if (error < 0) 3287 goto done; 3288 else 3289 error = 0; 3290 3291 STAILQ_INIT(&sc->ioreqs_free); 3292 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3293 for (uint32_t i = 0; i < sc->ioslots; i++) { 3294 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3295 } 3296 3297 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3298 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3299 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3300 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3301 pci_set_cfgdata8(pi, PCIR_PROGIF, 3302 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3303 3304 /* 3305 * Allocate size of NVMe registers + doorbell space for all queues. 3306 * 3307 * The specification requires a minimum memory I/O window size of 16K. 3308 * The Windows driver will refuse to start a device with a smaller 3309 * window. 3310 */ 3311 pci_membar_sz = sizeof(struct nvme_registers) + 3312 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3313 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3314 3315 DPRINTF("nvme membar size: %u", pci_membar_sz); 3316 3317 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3318 if (error) { 3319 WPRINTF("%s pci alloc mem bar failed", __func__); 3320 goto done; 3321 } 3322 3323 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3324 if (error) { 3325 WPRINTF("%s pci add msixcap failed", __func__); 3326 goto done; 3327 } 3328 3329 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3330 if (error) { 3331 WPRINTF("%s pci add Express capability failed", __func__); 3332 goto done; 3333 } 3334 3335 pthread_mutex_init(&sc->mtx, NULL); 3336 sem_init(&sc->iosemlock, 0, sc->ioslots); 3337 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3338 3339 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3340 /* 3341 * Controller data depends on Namespace data so initialize Namespace 3342 * data first. 
3343 */ 3344 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3345 pci_nvme_init_ctrldata(sc); 3346 pci_nvme_init_logpages(sc); 3347 pci_nvme_init_features(sc); 3348 3349 pci_nvme_aer_init(sc); 3350 pci_nvme_aen_init(sc); 3351 3352 pci_nvme_reset(sc); 3353 3354 pci_lintr_request(pi); 3355 3356 done: 3357 return (error); 3358 } 3359 3360 static int 3361 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3362 { 3363 char *cp, *ram; 3364 3365 if (opts == NULL) 3366 return (0); 3367 3368 if (strncmp(opts, "ram=", 4) == 0) { 3369 cp = strchr(opts, ','); 3370 if (cp == NULL) { 3371 set_config_value_node(nvl, "ram", opts + 4); 3372 return (0); 3373 } 3374 ram = strndup(opts + 4, cp - opts - 4); 3375 set_config_value_node(nvl, "ram", ram); 3376 free(ram); 3377 return (pci_parse_legacy_config(nvl, cp + 1)); 3378 } else 3379 return (blockif_legacy_config(nvl, opts)); 3380 } 3381 3382 static const struct pci_devemu pci_de_nvme = { 3383 .pe_emu = "nvme", 3384 .pe_init = pci_nvme_init, 3385 .pe_legacy_config = pci_nvme_legacy_config, 3386 .pe_barwrite = pci_nvme_write, 3387 .pe_barread = pci_nvme_read 3388 }; 3389 PCI_EMUL_SET(pci_de_nvme); 3390
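/*
 * Registration note: PCI_EMUL_SET() places pci_de_nvme in bhyve's table of
 * PCI device emulations, so a "-s <slot>,nvme,<opts>" argument resolves to
 * this model by its pe_emu name, pci_nvme_init() receives the parsed nvlist,
 * and guest BAR accesses are dispatched to pci_nvme_write()/pci_nvme_read().
 *
 * Sketch of the legacy option handling in pci_nvme_legacy_config() (the
 * option values below are hypothetical):
 *
 *   "ram=512,ser=FOO"      -> nvlist nodes ram="512", ser="FOO"
 *   "/dev/nvd0,sectsz=512" -> forwarded to blockif_legacy_config()
 */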