/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable,disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

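/*
 * Illustrative example only (not part of the original source): using the
 * option string documented above, a guest disk backed by this emulation
 * might be configured with something like
 *
 *	-s 4,nvme,/dev/zvol/pool/guestdisk,maxq=8,qsz=256,ioslots=16,ser=NVME0001
 *
 * or, for a RAM-backed namespace,
 *
 *	-s 4,nvme,ram=512
 *
 * where the slot number, device path, and tuning values are hypothetical.
 */
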
/* defaults; can be overridden */
#define NVME_MSIX_BAR		4

#define NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define NVME_QUEUES		16
#define NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;

#define NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define NVME_CQ_INTEN	0x01
#define NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )

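/*
 * Worked example (illustrative): with NVME_MDTS = 9 and NVME_MPSMIN = 0,
 * NVME_MAX_DATA_SIZE is (1 << 9) * 4096 bytes = 2 MiB and NVME_MAX_IOVEC is
 * 513 descriptors.  If BLOCKIF_IOV_MAX (from block_if.h) were 128, the macro
 * above would reserve 385 additional iovec entries of padding per request.
 */
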
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define PCI_NVME_AEI_NOTICE_SHIFT	8
#define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *	SMART / Health Critical Warnings
 *	Namespace Attribute Notices
 */
#define PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

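/*
 * Usage note (illustrative, added for clarity): the status helpers defined
 * below are how command handlers in this file fill in a completion, e.g.
 *
 *	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
 *
 * which encodes SCT = NVME_SCT_GENERIC and SC = NVME_SC_INVALID_FIELD into
 * the completion status word while leaving the phase bit untouched.
 */
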

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
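	/* Critical Composite Temperature Threshold (0x157 is 343 K, ~70 C) */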
	cd->cctemp = 0x0157;

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero based value while aer_count is one's based */
	return (sc->aer_count == (cd->aerl + 1));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define NVME_TEMP_THRESH_OVER	0
#define NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

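/*
 * Note (descriptive, added for clarity): in nvme_opc_get_features() below, a
 * Select value of NVME_FEATURES_SEL_SUPPORTED (0x3) on a namespace specific
 * feature reports NVME_FEATURES_NS_SPECIFIC in CDW0; any other Select value
 * returns the feature's current value (feat->cdw11).
 */
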
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1914 struct nvme_completion* compl) 1915 { 1916 struct nvme_feature_obj *feat; 1917 uint8_t fid = command->cdw10 & 0xFF; 1918 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1919 1920 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1921 1922 if (fid >= NVME_FID_MAX) { 1923 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1924 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1925 return (1); 1926 } 1927 1928 compl->cdw0 = 0; 1929 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1930 1931 feat = &sc->feat[fid]; 1932 if (feat->get) { 1933 feat->get(sc, feat, command, compl); 1934 } 1935 1936 if (compl->status == NVME_SC_SUCCESS) { 1937 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1938 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1939 else 1940 compl->cdw0 = feat->cdw11; 1941 } 1942 1943 return (0); 1944 } 1945 1946 static int 1947 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1948 struct nvme_completion* compl) 1949 { 1950 uint8_t ses, lbaf, pi; 1951 1952 /* Only supports Secure Erase Setting - User Data Erase */ 1953 ses = (command->cdw10 >> 9) & 0x7; 1954 if (ses > 0x1) { 1955 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1956 return (1); 1957 } 1958 1959 /* Only supports a single LBA Format */ 1960 lbaf = command->cdw10 & 0xf; 1961 if (lbaf != 0) { 1962 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1963 NVME_SC_INVALID_FORMAT); 1964 return (1); 1965 } 1966 1967 /* Doesn't support Protection Infomation */ 1968 pi = (command->cdw10 >> 5) & 0x7; 1969 if (pi != 0) { 1970 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1971 return (1); 1972 } 1973 1974 if (sc->nvstore.type == NVME_STOR_RAM) { 1975 if (sc->nvstore.ctx) 1976 free(sc->nvstore.ctx); 1977 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1978 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1979 } else { 1980 struct pci_nvme_ioreq *req; 1981 int err; 1982 1983 req = pci_nvme_get_ioreq(sc); 1984 if (req == NULL) { 1985 pci_nvme_status_genc(&compl->status, 1986 NVME_SC_INTERNAL_DEVICE_ERROR); 1987 WPRINTF("%s: unable to allocate IO req", __func__); 1988 return (1); 1989 } 1990 req->nvme_sq = &sc->submit_queues[0]; 1991 req->sqid = 0; 1992 req->opc = command->opc; 1993 req->cid = command->cid; 1994 req->nsid = command->nsid; 1995 1996 req->io_req.br_offset = 0; 1997 req->io_req.br_resid = sc->nvstore.size; 1998 req->io_req.br_callback = pci_nvme_io_done; 1999 2000 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 2001 if (err) { 2002 pci_nvme_status_genc(&compl->status, 2003 NVME_SC_INTERNAL_DEVICE_ERROR); 2004 pci_nvme_release_ioreq(sc, req); 2005 } else 2006 compl->status = NVME_NO_STATUS; 2007 } 2008 2009 return (1); 2010 } 2011 2012 static int 2013 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 2014 struct nvme_completion *compl) 2015 { 2016 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 2017 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 2018 2019 /* TODO: search for the command ID and abort it */ 2020 2021 compl->cdw0 = 1; 2022 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2023 return (1); 2024 } 2025 2026 static int 2027 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2028 struct nvme_command* command, struct nvme_completion* compl) 2029 { 2030 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2031 sc->aer_count, 
sc->ctrldata.aerl, command->cid); 2032 2033 /* Don't exceed the Async Event Request Limit (AERL). */ 2034 if (pci_nvme_aer_limit_reached(sc)) { 2035 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2036 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2037 return (1); 2038 } 2039 2040 if (pci_nvme_aer_add(sc, command->cid)) { 2041 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2042 NVME_SC_INTERNAL_DEVICE_ERROR); 2043 return (1); 2044 } 2045 2046 /* 2047 * Raise events when they happen based on the Set Features cmd. 2048 * These events happen async, so only set completion successful if 2049 * there is an event reflective of the request to get event. 2050 */ 2051 compl->status = NVME_NO_STATUS; 2052 pci_nvme_aen_notify(sc); 2053 2054 return (0); 2055 } 2056 2057 static void 2058 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2059 { 2060 struct nvme_completion compl; 2061 struct nvme_command *cmd; 2062 struct nvme_submission_queue *sq; 2063 struct nvme_completion_queue *cq; 2064 uint16_t sqhead; 2065 2066 DPRINTF("%s index %u", __func__, (uint32_t)value); 2067 2068 sq = &sc->submit_queues[0]; 2069 cq = &sc->compl_queues[0]; 2070 2071 pthread_mutex_lock(&sq->mtx); 2072 2073 sqhead = sq->head; 2074 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2075 2076 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2077 cmd = &(sq->qbase)[sqhead]; 2078 compl.cdw0 = 0; 2079 compl.status = 0; 2080 2081 switch (cmd->opc) { 2082 case NVME_OPC_DELETE_IO_SQ: 2083 DPRINTF("%s command DELETE_IO_SQ", __func__); 2084 nvme_opc_delete_io_sq(sc, cmd, &compl); 2085 break; 2086 case NVME_OPC_CREATE_IO_SQ: 2087 DPRINTF("%s command CREATE_IO_SQ", __func__); 2088 nvme_opc_create_io_sq(sc, cmd, &compl); 2089 break; 2090 case NVME_OPC_DELETE_IO_CQ: 2091 DPRINTF("%s command DELETE_IO_CQ", __func__); 2092 nvme_opc_delete_io_cq(sc, cmd, &compl); 2093 break; 2094 case NVME_OPC_CREATE_IO_CQ: 2095 DPRINTF("%s command CREATE_IO_CQ", __func__); 2096 nvme_opc_create_io_cq(sc, cmd, &compl); 2097 break; 2098 case NVME_OPC_GET_LOG_PAGE: 2099 DPRINTF("%s command GET_LOG_PAGE", __func__); 2100 nvme_opc_get_log_page(sc, cmd, &compl); 2101 break; 2102 case NVME_OPC_IDENTIFY: 2103 DPRINTF("%s command IDENTIFY", __func__); 2104 nvme_opc_identify(sc, cmd, &compl); 2105 break; 2106 case NVME_OPC_ABORT: 2107 DPRINTF("%s command ABORT", __func__); 2108 nvme_opc_abort(sc, cmd, &compl); 2109 break; 2110 case NVME_OPC_SET_FEATURES: 2111 DPRINTF("%s command SET_FEATURES", __func__); 2112 nvme_opc_set_features(sc, cmd, &compl); 2113 break; 2114 case NVME_OPC_GET_FEATURES: 2115 DPRINTF("%s command GET_FEATURES", __func__); 2116 nvme_opc_get_features(sc, cmd, &compl); 2117 break; 2118 case NVME_OPC_FIRMWARE_ACTIVATE: 2119 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2120 pci_nvme_status_tc(&compl.status, 2121 NVME_SCT_COMMAND_SPECIFIC, 2122 NVME_SC_INVALID_FIRMWARE_SLOT); 2123 break; 2124 case NVME_OPC_ASYNC_EVENT_REQUEST: 2125 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2126 nvme_opc_async_event_req(sc, cmd, &compl); 2127 break; 2128 case NVME_OPC_FORMAT_NVM: 2129 DPRINTF("%s command FORMAT_NVM", __func__); 2130 if ((sc->ctrldata.oacs & 2131 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2132 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2133 break; 2134 } 2135 nvme_opc_format_nvm(sc, cmd, &compl); 2136 break; 2137 case NVME_OPC_SECURITY_SEND: 2138 case NVME_OPC_SECURITY_RECEIVE: 2139 case NVME_OPC_SANITIZE: 2140 case NVME_OPC_GET_LBA_STATUS: 2141 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2142 cmd->opc); 2143 /* Valid but unsupported opcodes */ 2144 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2145 break; 2146 default: 2147 DPRINTF("%s command OPC=%#X (not implemented)", 2148 __func__, 2149 cmd->opc); 2150 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2151 } 2152 sqhead = (sqhead + 1) % sq->size; 2153 2154 if (NVME_COMPLETION_VALID(compl)) { 2155 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2156 compl.cdw0, 2157 cmd->cid, 2158 0, /* SQID */ 2159 compl.status); 2160 } 2161 } 2162 2163 DPRINTF("setting sqhead %u", sqhead); 2164 sq->head = sqhead; 2165 2166 if (cq->head != cq->tail) 2167 pci_generate_msix(sc->nsc_pi, 0); 2168 2169 pthread_mutex_unlock(&sq->mtx); 2170 } 2171 2172 /* 2173 * Update the Write and Read statistics reported in SMART data 2174 * 2175 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2176 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2177 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2178 */ 2179 static void 2180 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2181 size_t bytes, uint16_t status) 2182 { 2183 2184 pthread_mutex_lock(&sc->mtx); 2185 switch (opc) { 2186 case NVME_OPC_WRITE: 2187 sc->write_commands++; 2188 if (status != NVME_SC_SUCCESS) 2189 break; 2190 sc->write_dunits_remainder += (bytes / 512); 2191 while (sc->write_dunits_remainder >= 1000) { 2192 sc->write_data_units++; 2193 sc->write_dunits_remainder -= 1000; 2194 } 2195 break; 2196 case NVME_OPC_READ: 2197 sc->read_commands++; 2198 if (status != NVME_SC_SUCCESS) 2199 break; 2200 sc->read_dunits_remainder += (bytes / 512); 2201 while (sc->read_dunits_remainder >= 1000) { 2202 sc->read_data_units++; 2203 sc->read_dunits_remainder -= 1000; 2204 } 2205 break; 2206 default: 2207 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2208 break; 2209 } 2210 pthread_mutex_unlock(&sc->mtx); 2211 } 2212 2213 /* 2214 * Check if the combination of Starting LBA (slba) and number of blocks 2215 * exceeds the range of the underlying storage. 2216 * 2217 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2218 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2219 * overflow. 2220 */ 2221 static bool 2222 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2223 uint32_t nblocks) 2224 { 2225 size_t offset, bytes; 2226 2227 /* Overflow check of multiplying Starting LBA by the sector size */ 2228 if (slba >> (64 - nvstore->sectsz_bits)) 2229 return (true); 2230 2231 offset = slba << nvstore->sectsz_bits; 2232 bytes = nblocks << nvstore->sectsz_bits; 2233 2234 /* Overflow check of Number of Logical Blocks */ 2235 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2236 return (true); 2237 2238 return (false); 2239 } 2240 2241 static int 2242 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2243 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2244 { 2245 int iovidx; 2246 bool range_is_contiguous; 2247 2248 if (req == NULL) 2249 return (-1); 2250 2251 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2252 return (-1); 2253 } 2254 2255 /* 2256 * Minimize the number of IOVs by concatenating contiguous address 2257 * ranges. If the IOV count is zero, there is no previous range to 2258 * concatenate. 
2259 */ 2260 if (req->io_req.br_iovcnt == 0) 2261 range_is_contiguous = false; 2262 else 2263 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2264 2265 if (range_is_contiguous) { 2266 iovidx = req->io_req.br_iovcnt - 1; 2267 2268 req->io_req.br_iov[iovidx].iov_base = 2269 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2270 req->prev_gpaddr, size); 2271 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2272 return (-1); 2273 2274 req->prev_size += size; 2275 req->io_req.br_resid += size; 2276 2277 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2278 } else { 2279 iovidx = req->io_req.br_iovcnt; 2280 if (iovidx == 0) { 2281 req->io_req.br_offset = offset; 2282 req->io_req.br_resid = 0; 2283 req->io_req.br_param = req; 2284 } 2285 2286 req->io_req.br_iov[iovidx].iov_base = 2287 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2288 gpaddr, size); 2289 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2290 return (-1); 2291 2292 req->io_req.br_iov[iovidx].iov_len = size; 2293 2294 req->prev_gpaddr = gpaddr; 2295 req->prev_size = size; 2296 req->io_req.br_resid += size; 2297 2298 req->io_req.br_iovcnt++; 2299 } 2300 2301 return (0); 2302 } 2303 2304 static void 2305 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2306 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2307 { 2308 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2309 2310 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2311 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2312 NVME_STATUS_GET_SC(status)); 2313 2314 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2315 2316 if (cq->head != cq->tail) { 2317 if (cq->intr_en & NVME_CQ_INTEN) { 2318 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2319 } else { 2320 DPRINTF("%s: CQ%u interrupt disabled", 2321 __func__, sq->cqid); 2322 } 2323 } 2324 } 2325 2326 static void 2327 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2328 { 2329 req->sc = NULL; 2330 req->nvme_sq = NULL; 2331 req->sqid = 0; 2332 2333 pthread_mutex_lock(&sc->mtx); 2334 2335 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2336 sc->pending_ios--; 2337 2338 /* when no more IO pending, can set to ready if device reset/enabled */ 2339 if (sc->pending_ios == 0 && 2340 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2341 sc->regs.csts |= NVME_CSTS_RDY; 2342 2343 pthread_mutex_unlock(&sc->mtx); 2344 2345 sem_post(&sc->iosemlock); 2346 } 2347 2348 static struct pci_nvme_ioreq * 2349 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2350 { 2351 struct pci_nvme_ioreq *req = NULL; 2352 2353 sem_wait(&sc->iosemlock); 2354 pthread_mutex_lock(&sc->mtx); 2355 2356 req = STAILQ_FIRST(&sc->ioreqs_free); 2357 assert(req != NULL); 2358 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2359 2360 req->sc = sc; 2361 2362 sc->pending_ios++; 2363 2364 pthread_mutex_unlock(&sc->mtx); 2365 2366 req->io_req.br_iovcnt = 0; 2367 req->io_req.br_offset = 0; 2368 req->io_req.br_resid = 0; 2369 req->io_req.br_param = req; 2370 req->prev_gpaddr = 0; 2371 req->prev_size = 0; 2372 2373 return req; 2374 } 2375 2376 static void 2377 pci_nvme_io_done(struct blockif_req *br, int err) 2378 { 2379 struct pci_nvme_ioreq *req = br->br_param; 2380 struct nvme_submission_queue *sq = req->nvme_sq; 2381 uint16_t code, status; 2382 2383 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2384 2385 /* TODO return correct error */ 2386 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2387 status = 0; 2388 pci_nvme_status_genc(&status, code); 2389 2390 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2391 pci_nvme_stats_write_read_update(req->sc, req->opc, 2392 req->bytes, status); 2393 pci_nvme_release_ioreq(req->sc, req); 2394 } 2395 2396 /* 2397 * Implements the Flush command. The specification states: 2398 * If a volatile write cache is not present, Flush commands complete 2399 * successfully and have no effect 2400 * in the description of the Volatile Write Cache (VWC) field of the Identify 2401 * Controller data. Therefore, set status to Success if the command is 2402 * not supported (i.e. RAM or as indicated by the blockif). 2403 */ 2404 static bool 2405 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2406 struct nvme_command *cmd __unused, 2407 struct pci_nvme_blockstore *nvstore, 2408 struct pci_nvme_ioreq *req, 2409 uint16_t *status) 2410 { 2411 bool pending = false; 2412 2413 if (nvstore->type == NVME_STOR_RAM) { 2414 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2415 } else { 2416 int err; 2417 2418 req->io_req.br_callback = pci_nvme_io_done; 2419 2420 err = blockif_flush(nvstore->ctx, &req->io_req); 2421 switch (err) { 2422 case 0: 2423 pending = true; 2424 break; 2425 case EOPNOTSUPP: 2426 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2427 break; 2428 default: 2429 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2430 } 2431 } 2432 2433 return (pending); 2434 } 2435 2436 static uint16_t 2437 nvme_write_read_ram(struct pci_nvme_softc *sc, 2438 struct pci_nvme_blockstore *nvstore, 2439 uint64_t prp1, uint64_t prp2, 2440 size_t offset, uint64_t bytes, 2441 bool is_write) 2442 { 2443 uint8_t *buf = nvstore->ctx; 2444 enum nvme_copy_dir dir; 2445 uint16_t status; 2446 2447 if (is_write) 2448 dir = NVME_COPY_TO_PRP; 2449 else 2450 dir = NVME_COPY_FROM_PRP; 2451 2452 status = 0; 2453 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2454 buf + offset, bytes, dir)) 2455 pci_nvme_status_genc(&status, 2456 NVME_SC_DATA_TRANSFER_ERROR); 2457 else 2458 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2459 2460 return (status); 2461 } 2462 2463 static uint16_t 2464 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2465 struct pci_nvme_blockstore *nvstore, 2466 struct pci_nvme_ioreq *req, 2467 uint64_t prp1, uint64_t prp2, 2468 size_t offset, uint64_t bytes, 2469 bool is_write) 2470 { 2471 uint64_t size; 2472 int err; 2473 uint16_t status = NVME_NO_STATUS; 2474 2475 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2476 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2477 err = -1; 2478 goto out; 2479 } 2480 2481 offset += size; 2482 bytes -= size; 2483 2484 if (bytes == 0) { 2485 ; 2486 } else if (bytes <= PAGE_SIZE) { 2487 size = bytes; 2488 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2489 err = -1; 2490 goto out; 2491 } 2492 } else { 2493 void *vmctx = sc->nsc_pi->pi_vmctx; 2494 uint64_t *prp_list = &prp2; 2495 uint64_t *last = prp_list; 2496 2497 /* PRP2 is pointer to a physical region page list */ 2498 while (bytes) { 2499 /* Last entry in list points to the next list */ 2500 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2501 uint64_t prp = *prp_list; 2502 2503 prp_list = paddr_guest2host(vmctx, prp, 2504 PAGE_SIZE - (prp % PAGE_SIZE)); 2505 if (prp_list == NULL) { 2506 err = -1; 2507 goto out; 2508 } 2509 last = prp_list + (NVME_PRP2_ITEMS - 1); 2510 } 2511 2512 size = MIN(bytes, PAGE_SIZE); 2513 2514 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2515 offset)) { 2516 err = -1; 2517 goto out; 2518 } 2519 2520 offset += size; 2521 bytes -= size; 2522 2523 prp_list++; 2524 } 2525 } 2526 req->io_req.br_callback = pci_nvme_io_done; 2527 if (is_write) 2528 err = blockif_write(nvstore->ctx, &req->io_req); 2529 else 2530 err = blockif_read(nvstore->ctx, &req->io_req); 2531 out: 2532 if (err) 2533 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2534 2535 return (status); 2536 } 2537 2538 static bool 2539 nvme_opc_write_read(struct pci_nvme_softc *sc, 2540 struct nvme_command *cmd, 2541 struct pci_nvme_blockstore *nvstore, 2542 struct pci_nvme_ioreq *req, 2543 uint16_t *status) 2544 { 2545 uint64_t lba, nblocks, bytes; 2546 size_t offset; 2547 bool is_write = cmd->opc == NVME_OPC_WRITE; 2548 bool pending = false; 2549 2550 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2551 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2552 bytes = nblocks << nvstore->sectsz_bits; 2553 if (bytes > NVME_MAX_DATA_SIZE) { 2554 WPRINTF("%s command would exceed MDTS", __func__); 2555 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2556 goto out; 2557 } 2558 2559 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2560 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2561 __func__, lba, nblocks); 2562 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2563 goto out; 2564 } 2565 2566 offset = lba << nvstore->sectsz_bits; 2567 2568 req->bytes = bytes; 2569 req->io_req.br_offset = lba; 2570 2571 /* PRP bits 1:0 must be zero */ 2572 cmd->prp1 &= ~0x3UL; 2573 cmd->prp2 &= ~0x3UL; 2574 2575 if (nvstore->type == NVME_STOR_RAM) { 2576 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2577 cmd->prp2, offset, bytes, is_write); 2578 } else { 2579 *status = nvme_write_read_blockif(sc, nvstore, req, 2580 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2581 2582 if (*status == NVME_NO_STATUS) 2583 pending = true; 2584 } 2585 out: 2586 if (!pending) 2587 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2588 2589 return (pending); 2590 } 2591 2592 static void 2593 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2594 { 2595 struct pci_nvme_ioreq *req = br->br_param; 2596 struct pci_nvme_softc *sc = req->sc; 2597 bool done = true; 2598 uint16_t status; 2599 2600 status = 0; 2601 if (err) { 2602 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2603 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2604 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2605 } else { 2606 struct iovec *iov = req->io_req.br_iov; 2607 2608 req->prev_gpaddr++; 2609 iov += req->prev_gpaddr; 2610 2611 /* The iov_* values already include the sector size */ 2612 req->io_req.br_offset = (off_t)iov->iov_base; 2613 req->io_req.br_resid = iov->iov_len; 2614 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2615 pci_nvme_status_genc(&status, 2616 NVME_SC_INTERNAL_DEVICE_ERROR); 2617 } else 2618 done = false; 2619 } 2620 2621 if (done) { 2622 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2623 status); 2624 pci_nvme_release_ioreq(sc, req); 2625 } 2626 } 2627 2628 static bool 2629 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2630 struct nvme_command *cmd, 2631 struct pci_nvme_blockstore *nvstore, 2632 struct pci_nvme_ioreq *req, 2633 uint16_t *status) 2634 { 2635 struct nvme_dsm_range *range = NULL; 2636 uint32_t nr, r, non_zero, dr; 2637 int err; 2638 bool pending = false; 2639 2640 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2641 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2642 goto out; 2643 } 
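	/*
	 * Layout note (informational): CDW10 bits 7:0 carry the zero-based
	 * Number of Ranges (NR), and the buffer described by PRP1/PRP2 holds
	 * NR + 1 sixteen-byte struct nvme_dsm_range entries, each a
	 * (context attributes, length in LBAs, starting LBA) tuple.
	 */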
2644 2645 nr = cmd->cdw10 & 0xff; 2646 2647 /* copy locally because a range entry could straddle PRPs */ 2648 range = calloc(1, NVME_MAX_DSM_TRIM); 2649 if (range == NULL) { 2650 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2651 goto out; 2652 } 2653 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2654 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2655 2656 /* Check for invalid ranges and the number of non-zero lengths */ 2657 non_zero = 0; 2658 for (r = 0; r <= nr; r++) { 2659 if (pci_nvme_out_of_range(nvstore, 2660 range[r].starting_lba, range[r].length)) { 2661 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2662 goto out; 2663 } 2664 if (range[r].length != 0) 2665 non_zero++; 2666 } 2667 2668 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2669 size_t offset, bytes; 2670 int sectsz_bits = sc->nvstore.sectsz_bits; 2671 2672 /* 2673 * DSM calls are advisory only, and compliant controllers 2674 * may choose to take no actions (i.e. return Success). 2675 */ 2676 if (!nvstore->deallocate) { 2677 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2678 goto out; 2679 } 2680 2681 /* If all ranges have a zero length, return Success */ 2682 if (non_zero == 0) { 2683 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2684 goto out; 2685 } 2686 2687 if (req == NULL) { 2688 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2689 goto out; 2690 } 2691 2692 offset = range[0].starting_lba << sectsz_bits; 2693 bytes = range[0].length << sectsz_bits; 2694 2695 /* 2696 * If the request is for more than a single range, store 2697 * the ranges in the br_iov. Optimize for the common case 2698 * of a single range. 2699 * 2700 * Note that NVMe Number of Ranges is a zero based value 2701 */ 2702 req->io_req.br_iovcnt = 0; 2703 req->io_req.br_offset = offset; 2704 req->io_req.br_resid = bytes; 2705 2706 if (nr == 0) { 2707 req->io_req.br_callback = pci_nvme_io_done; 2708 } else { 2709 struct iovec *iov = req->io_req.br_iov; 2710 2711 for (r = 0, dr = 0; r <= nr; r++) { 2712 offset = range[r].starting_lba << sectsz_bits; 2713 bytes = range[r].length << sectsz_bits; 2714 if (bytes == 0) 2715 continue; 2716 2717 if ((nvstore->size - offset) < bytes) { 2718 pci_nvme_status_genc(status, 2719 NVME_SC_LBA_OUT_OF_RANGE); 2720 goto out; 2721 } 2722 iov[dr].iov_base = (void *)offset; 2723 iov[dr].iov_len = bytes; 2724 dr++; 2725 } 2726 req->io_req.br_callback = pci_nvme_dealloc_sm; 2727 2728 /* 2729 * Use prev_gpaddr to track the current entry and 2730 * prev_size to track the number of entries 2731 */ 2732 req->prev_gpaddr = 0; 2733 req->prev_size = dr; 2734 } 2735 2736 err = blockif_delete(nvstore->ctx, &req->io_req); 2737 if (err) 2738 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2739 else 2740 pending = true; 2741 } 2742 out: 2743 free(range); 2744 return (pending); 2745 } 2746 2747 static void 2748 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2749 { 2750 struct nvme_submission_queue *sq; 2751 uint16_t status; 2752 uint16_t sqhead; 2753 2754 /* handle all submissions up to sq->tail index */ 2755 sq = &sc->submit_queues[idx]; 2756 2757 pthread_mutex_lock(&sq->mtx); 2758 2759 sqhead = sq->head; 2760 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2761 idx, sqhead, sq->tail, sq->qbase); 2762 2763 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2764 struct nvme_command *cmd; 2765 struct pci_nvme_ioreq *req; 2766 uint32_t nsid; 2767 bool pending; 2768 2769 pending = false; 2770 req = NULL; 2771 status = 0; 2772 2773 cmd = 
&sq->qbase[sqhead]; 2774 sqhead = (sqhead + 1) % sq->size; 2775 2776 nsid = le32toh(cmd->nsid); 2777 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2778 pci_nvme_status_genc(&status, 2779 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2780 status |= 2781 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2782 goto complete; 2783 } 2784 2785 req = pci_nvme_get_ioreq(sc); 2786 if (req == NULL) { 2787 pci_nvme_status_genc(&status, 2788 NVME_SC_INTERNAL_DEVICE_ERROR); 2789 WPRINTF("%s: unable to allocate IO req", __func__); 2790 goto complete; 2791 } 2792 req->nvme_sq = sq; 2793 req->sqid = idx; 2794 req->opc = cmd->opc; 2795 req->cid = cmd->cid; 2796 req->nsid = cmd->nsid; 2797 2798 switch (cmd->opc) { 2799 case NVME_OPC_FLUSH: 2800 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2801 req, &status); 2802 break; 2803 case NVME_OPC_WRITE: 2804 case NVME_OPC_READ: 2805 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2806 req, &status); 2807 break; 2808 case NVME_OPC_WRITE_ZEROES: 2809 /* TODO: write zeroes 2810 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2811 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2812 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2813 break; 2814 case NVME_OPC_DATASET_MANAGEMENT: 2815 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2816 req, &status); 2817 break; 2818 default: 2819 WPRINTF("%s unhandled io command 0x%x", 2820 __func__, cmd->opc); 2821 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2822 } 2823 complete: 2824 if (!pending) { 2825 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2826 if (req != NULL) 2827 pci_nvme_release_ioreq(sc, req); 2828 } 2829 } 2830 2831 sq->head = sqhead; 2832 2833 pthread_mutex_unlock(&sq->mtx); 2834 } 2835 2836 static void 2837 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2838 uint64_t idx, int is_sq, uint64_t value) 2839 { 2840 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2841 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2842 2843 if (is_sq) { 2844 if (idx > sc->num_squeues) { 2845 WPRINTF("%s queue index %lu overflow from " 2846 "guest (max %u)", 2847 __func__, idx, sc->num_squeues); 2848 return; 2849 } 2850 2851 atomic_store_short(&sc->submit_queues[idx].tail, 2852 (uint16_t)value); 2853 2854 if (idx == 0) { 2855 pci_nvme_handle_admin_cmd(sc, value); 2856 } else { 2857 /* submission queue; handle new entries in SQ */ 2858 if (idx > sc->num_squeues) { 2859 WPRINTF("%s SQ index %lu overflow from " 2860 "guest (max %u)", 2861 __func__, idx, sc->num_squeues); 2862 return; 2863 } 2864 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2865 } 2866 } else { 2867 if (idx > sc->num_cqueues) { 2868 WPRINTF("%s queue index %lu overflow from " 2869 "guest (max %u)", 2870 __func__, idx, sc->num_cqueues); 2871 return; 2872 } 2873 2874 atomic_store_short(&sc->compl_queues[idx].head, 2875 (uint16_t)value); 2876 } 2877 } 2878 2879 static void 2880 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2881 { 2882 const char *s = iswrite ? 
"WRITE" : "READ"; 2883 2884 switch (offset) { 2885 case NVME_CR_CAP_LOW: 2886 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2887 break; 2888 case NVME_CR_CAP_HI: 2889 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2890 break; 2891 case NVME_CR_VS: 2892 DPRINTF("%s %s NVME_CR_VS", func, s); 2893 break; 2894 case NVME_CR_INTMS: 2895 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2896 break; 2897 case NVME_CR_INTMC: 2898 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2899 break; 2900 case NVME_CR_CC: 2901 DPRINTF("%s %s NVME_CR_CC", func, s); 2902 break; 2903 case NVME_CR_CSTS: 2904 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2905 break; 2906 case NVME_CR_NSSR: 2907 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2908 break; 2909 case NVME_CR_AQA: 2910 DPRINTF("%s %s NVME_CR_AQA", func, s); 2911 break; 2912 case NVME_CR_ASQ_LOW: 2913 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2914 break; 2915 case NVME_CR_ASQ_HI: 2916 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2917 break; 2918 case NVME_CR_ACQ_LOW: 2919 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2920 break; 2921 case NVME_CR_ACQ_HI: 2922 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2923 break; 2924 default: 2925 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2926 } 2927 2928 } 2929 2930 static void 2931 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2932 uint64_t offset, int size, uint64_t value) 2933 { 2934 uint32_t ccreg; 2935 2936 if (offset >= NVME_DOORBELL_OFFSET) { 2937 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2938 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2939 int is_sq = (belloffset % 8) < 4; 2940 2941 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2942 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2943 offset); 2944 return; 2945 } 2946 2947 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2948 WPRINTF("guest attempted an overflow write offset " 2949 "0x%lx, val 0x%lx in %s", 2950 offset, value, __func__); 2951 return; 2952 } 2953 2954 if (is_sq) { 2955 if (sc->submit_queues[idx].qbase == NULL) 2956 return; 2957 } else if (sc->compl_queues[idx].qbase == NULL) 2958 return; 2959 2960 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2961 return; 2962 } 2963 2964 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2965 offset, size, value); 2966 2967 if (size != 4) { 2968 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2969 "val 0x%lx) to bar0 in %s", 2970 size, offset, value, __func__); 2971 /* TODO: shutdown device */ 2972 return; 2973 } 2974 2975 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2976 2977 pthread_mutex_lock(&sc->mtx); 2978 2979 switch (offset) { 2980 case NVME_CR_CAP_LOW: 2981 case NVME_CR_CAP_HI: 2982 /* readonly */ 2983 break; 2984 case NVME_CR_VS: 2985 /* readonly */ 2986 break; 2987 case NVME_CR_INTMS: 2988 /* MSI-X, so ignore */ 2989 break; 2990 case NVME_CR_INTMC: 2991 /* MSI-X, so ignore */ 2992 break; 2993 case NVME_CR_CC: 2994 ccreg = (uint32_t)value; 2995 2996 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2997 "iocqes %u", 2998 __func__, 2999 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 3000 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 3001 NVME_CC_GET_IOCQES(ccreg)); 3002 3003 if (NVME_CC_GET_SHN(ccreg)) { 3004 /* perform shutdown - flush out data to backend */ 3005 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 3006 NVME_CSTS_REG_SHST_SHIFT); 3007 sc->regs.csts |= NVME_SHST_COMPLETE << 3008 NVME_CSTS_REG_SHST_SHIFT; 3009 } 3010 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 3011 if (NVME_CC_GET_EN(ccreg) == 0) 3012 /* transition 1-> causes 
controller reset */ 3013 pci_nvme_reset_locked(sc); 3014 else 3015 pci_nvme_init_controller(ctx, sc); 3016 } 3017 3018 /* Insert the iocqes, iosqes and en bits from the write */ 3019 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3020 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3021 if (NVME_CC_GET_EN(ccreg) == 0) { 3022 /* Insert the ams, mps and css bit fields */ 3023 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3024 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3025 sc->regs.csts &= ~NVME_CSTS_RDY; 3026 } else if ((sc->pending_ios == 0) && 3027 !(sc->regs.csts & NVME_CSTS_CFS)) { 3028 sc->regs.csts |= NVME_CSTS_RDY; 3029 } 3030 break; 3031 case NVME_CR_CSTS: 3032 break; 3033 case NVME_CR_NSSR: 3034 /* ignore writes; don't support subsystem reset */ 3035 break; 3036 case NVME_CR_AQA: 3037 sc->regs.aqa = (uint32_t)value; 3038 break; 3039 case NVME_CR_ASQ_LOW: 3040 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3041 (0xFFFFF000 & value); 3042 break; 3043 case NVME_CR_ASQ_HI: 3044 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3045 (value << 32); 3046 break; 3047 case NVME_CR_ACQ_LOW: 3048 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3049 (0xFFFFF000 & value); 3050 break; 3051 case NVME_CR_ACQ_HI: 3052 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3053 (value << 32); 3054 break; 3055 default: 3056 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3057 __func__, offset, value, size); 3058 } 3059 pthread_mutex_unlock(&sc->mtx); 3060 } 3061 3062 static void 3063 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3064 int baridx, uint64_t offset, int size, uint64_t value) 3065 { 3066 struct pci_nvme_softc* sc = pi->pi_arg; 3067 3068 if (baridx == pci_msix_table_bar(pi) || 3069 baridx == pci_msix_pba_bar(pi)) { 3070 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3071 " value 0x%lx", baridx, offset, size, value); 3072 3073 pci_emul_msix_twrite(pi, offset, size, value); 3074 return; 3075 } 3076 3077 switch (baridx) { 3078 case 0: 3079 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3080 break; 3081 3082 default: 3083 DPRINTF("%s unknown baridx %d, val 0x%lx", 3084 __func__, baridx, value); 3085 } 3086 } 3087 3088 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3089 uint64_t offset, int size) 3090 { 3091 uint64_t value; 3092 3093 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3094 3095 if (offset < NVME_DOORBELL_OFFSET) { 3096 void *p = &(sc->regs); 3097 pthread_mutex_lock(&sc->mtx); 3098 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3099 pthread_mutex_unlock(&sc->mtx); 3100 } else { 3101 value = 0; 3102 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3103 } 3104 3105 switch (size) { 3106 case 1: 3107 value &= 0xFF; 3108 break; 3109 case 2: 3110 value &= 0xFFFF; 3111 break; 3112 case 4: 3113 value &= 0xFFFFFFFF; 3114 break; 3115 } 3116 3117 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3118 offset, size, (uint32_t)value); 3119 3120 return (value); 3121 } 3122 3123 3124 3125 static uint64_t 3126 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3127 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3128 { 3129 struct pci_nvme_softc* sc = pi->pi_arg; 3130 3131 if (baridx == pci_msix_table_bar(pi) || 3132 baridx == pci_msix_pba_bar(pi)) { 3133 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3134 baridx, offset, size); 3135 3136 return pci_emul_msix_tread(pi, offset, size); 3137 } 3138 3139 switch (baridx) { 3140 case 0: 3141 return pci_nvme_read_bar_0(sc, 
offset, size); 3142 3143 default: 3144 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3145 } 3146 3147 return (0); 3148 } 3149 3150 static int 3151 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3152 { 3153 char bident[sizeof("XX:X:X")]; 3154 const char *value; 3155 uint32_t sectsz; 3156 3157 sc->max_queues = NVME_QUEUES; 3158 sc->max_qentries = NVME_MAX_QENTRIES; 3159 sc->ioslots = NVME_IOSLOTS; 3160 sc->num_squeues = sc->max_queues; 3161 sc->num_cqueues = sc->max_queues; 3162 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3163 sectsz = 0; 3164 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3165 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3166 3167 value = get_config_value_node(nvl, "maxq"); 3168 if (value != NULL) 3169 sc->max_queues = atoi(value); 3170 value = get_config_value_node(nvl, "qsz"); 3171 if (value != NULL) { 3172 sc->max_qentries = atoi(value); 3173 if (sc->max_qentries <= 0) { 3174 EPRINTLN("nvme: Invalid qsz option %d", 3175 sc->max_qentries); 3176 return (-1); 3177 } 3178 } 3179 value = get_config_value_node(nvl, "ioslots"); 3180 if (value != NULL) { 3181 sc->ioslots = atoi(value); 3182 if (sc->ioslots <= 0) { 3183 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3184 return (-1); 3185 } 3186 } 3187 value = get_config_value_node(nvl, "sectsz"); 3188 if (value != NULL) 3189 sectsz = atoi(value); 3190 value = get_config_value_node(nvl, "ser"); 3191 if (value != NULL) { 3192 /* 3193 * This field indicates the Product Serial Number in 3194 * 7-bit ASCII, unused bytes should be space characters. 3195 * Ref: NVMe v1.3c. 3196 */ 3197 cpywithpad((char *)sc->ctrldata.sn, 3198 sizeof(sc->ctrldata.sn), value, ' '); 3199 } 3200 value = get_config_value_node(nvl, "eui64"); 3201 if (value != NULL) 3202 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3203 value = get_config_value_node(nvl, "dsm"); 3204 if (value != NULL) { 3205 if (strcmp(value, "auto") == 0) 3206 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3207 else if (strcmp(value, "enable") == 0) 3208 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3209 else if (strcmp(value, "disable") == 0) 3210 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3211 } 3212 3213 value = get_config_value_node(nvl, "ram"); 3214 if (value != NULL) { 3215 uint64_t sz = strtoull(value, NULL, 10); 3216 3217 sc->nvstore.type = NVME_STOR_RAM; 3218 sc->nvstore.size = sz * 1024 * 1024; 3219 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3220 sc->nvstore.sectsz = 4096; 3221 sc->nvstore.sectsz_bits = 12; 3222 if (sc->nvstore.ctx == NULL) { 3223 EPRINTLN("nvme: Unable to allocate RAM"); 3224 return (-1); 3225 } 3226 } else { 3227 snprintf(bident, sizeof(bident), "%d:%d", 3228 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3229 sc->nvstore.ctx = blockif_open(nvl, bident); 3230 if (sc->nvstore.ctx == NULL) { 3231 EPRINTLN("nvme: Could not open backing file: %s", 3232 strerror(errno)); 3233 return (-1); 3234 } 3235 sc->nvstore.type = NVME_STOR_BLOCKIF; 3236 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3237 } 3238 3239 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3240 sc->nvstore.sectsz = sectsz; 3241 else if (sc->nvstore.type != NVME_STOR_RAM) 3242 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3243 for (sc->nvstore.sectsz_bits = 9; 3244 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3245 sc->nvstore.sectsz_bits++); 3246 3247 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3248 sc->max_queues = NVME_QUEUES; 3249 3250 return (0); 3251 } 3252 3253 
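/*
 * Informational sketch, kept out of the build: summarizes the sector-size
 * selection in pci_nvme_parse_config() above.  An explicit sectsz of 512,
 * 4096, or 8192 is honored; otherwise blockif-backed stores inherit
 * blockif_sectsz() and RAM-backed stores keep the 4096 default.  The helper
 * name below is hypothetical and only restates the log2 loop:
 * 512 -> 9, 4096 -> 12, 8192 -> 13.
 */
#if 0
static uint32_t
nvme_example_sectsz_bits(uint32_t sectsz)
{
	uint32_t bits;

	/* Smallest shift whose power of two reaches the sector size */
	for (bits = 9; (1u << bits) < sectsz; bits++)
		;

	return (bits);
}
#endif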
static void 3254 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3255 size_t new_size) 3256 { 3257 struct pci_nvme_softc *sc; 3258 struct pci_nvme_blockstore *nvstore; 3259 struct nvme_namespace_data *nd; 3260 3261 sc = arg; 3262 nvstore = &sc->nvstore; 3263 nd = &sc->nsdata; 3264 3265 nvstore->size = new_size; 3266 pci_nvme_init_nsdata_size(nvstore, nd); 3267 3268 /* Add changed NSID to list */ 3269 sc->ns_log.ns[0] = 1; 3270 sc->ns_log.ns[1] = 0; 3271 3272 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3273 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3274 } 3275 3276 static int 3277 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3278 { 3279 struct pci_nvme_softc *sc; 3280 uint32_t pci_membar_sz; 3281 int error; 3282 3283 error = 0; 3284 3285 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3286 pi->pi_arg = sc; 3287 sc->nsc_pi = pi; 3288 3289 error = pci_nvme_parse_config(sc, nvl); 3290 if (error < 0) 3291 goto done; 3292 else 3293 error = 0; 3294 3295 STAILQ_INIT(&sc->ioreqs_free); 3296 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3297 for (uint32_t i = 0; i < sc->ioslots; i++) { 3298 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3299 } 3300 3301 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3302 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3303 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3304 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3305 pci_set_cfgdata8(pi, PCIR_PROGIF, 3306 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3307 3308 /* 3309 * Allocate size of NVMe registers + doorbell space for all queues. 3310 * 3311 * The specification requires a minimum memory I/O window size of 16K. 3312 * The Windows driver will refuse to start a device with a smaller 3313 * window. 3314 */ 3315 pci_membar_sz = sizeof(struct nvme_registers) + 3316 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3317 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3318 3319 DPRINTF("nvme membar size: %u", pci_membar_sz); 3320 3321 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3322 if (error) { 3323 WPRINTF("%s pci alloc mem bar failed", __func__); 3324 goto done; 3325 } 3326 3327 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3328 if (error) { 3329 WPRINTF("%s pci add msixcap failed", __func__); 3330 goto done; 3331 } 3332 3333 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3334 if (error) { 3335 WPRINTF("%s pci add Express capability failed", __func__); 3336 goto done; 3337 } 3338 3339 pthread_mutex_init(&sc->mtx, NULL); 3340 sem_init(&sc->iosemlock, 0, sc->ioslots); 3341 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3342 3343 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3344 /* 3345 * Controller data depends on Namespace data so initialize Namespace 3346 * data first. 
3347 */ 3348 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3349 pci_nvme_init_ctrldata(sc); 3350 pci_nvme_init_logpages(sc); 3351 pci_nvme_init_features(sc); 3352 3353 pci_nvme_aer_init(sc); 3354 pci_nvme_aen_init(sc); 3355 3356 pci_nvme_reset(sc); 3357 3358 pci_lintr_request(pi); 3359 3360 done: 3361 return (error); 3362 } 3363 3364 static int 3365 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3366 { 3367 char *cp, *ram; 3368 3369 if (opts == NULL) 3370 return (0); 3371 3372 if (strncmp(opts, "ram=", 4) == 0) { 3373 cp = strchr(opts, ','); 3374 if (cp == NULL) { 3375 set_config_value_node(nvl, "ram", opts + 4); 3376 return (0); 3377 } 3378 ram = strndup(opts + 4, cp - opts - 4); 3379 set_config_value_node(nvl, "ram", ram); 3380 free(ram); 3381 return (pci_parse_legacy_config(nvl, cp + 1)); 3382 } else 3383 return (blockif_legacy_config(nvl, opts)); 3384 } 3385 3386 static const struct pci_devemu pci_de_nvme = { 3387 .pe_emu = "nvme", 3388 .pe_init = pci_nvme_init, 3389 .pe_legacy_config = pci_nvme_legacy_config, 3390 .pe_barwrite = pci_nvme_write, 3391 .pe_barread = pci_nvme_read 3392 }; 3393 PCI_EMUL_SET(pci_de_nvme); 3394
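/*
 * Example (informational): given the legacy option string
 *     "ram=64,ser=EXAMPLE1"
 * pci_nvme_legacy_config() stores "64" under the "ram" config node and hands
 * "ser=EXAMPLE1" to pci_parse_legacy_config(), while an option string that
 * does not begin with "ram=" (e.g. a plain /path/to/image) is passed to
 * blockif_legacy_config() unchanged.  The values shown are illustrative only.
 */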