1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <semaphore.h> 70 #include <stdbool.h> 71 #include <stddef.h> 72 #include <stdint.h> 73 #include <stdio.h> 74 #include <stdlib.h> 75 #include <string.h> 76 77 #include <machine/atomic.h> 78 #include <machine/vmm.h> 79 #include <vmmapi.h> 80 81 #include <dev/nvme/nvme.h> 82 83 #include "bhyverun.h" 84 #include "block_if.h" 85 #include "config.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 89 90 static int nvme_debug = 0; 91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 92 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 93 94 /* defaults; can be overridden */ 95 #define NVME_MSIX_BAR 4 96 97 #define NVME_IOSLOTS 8 98 99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 100 #define NVME_MMIO_SPACE_MIN (1 << 14) 101 102 #define NVME_QUEUES 16 103 #define NVME_MAX_QENTRIES 2048 104 /* Memory Page size Minimum reported in CAP register */ 105 #define NVME_MPSMIN 0 106 /* MPSMIN converted to bytes */ 107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 108 109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 110 #define NVME_MDTS 9 111 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 114 115 /* This is a synthetic status code to indicate there is no status */ 116 #define NVME_NO_STATUS 0xffff 117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 118 119 /* Reported temperature in Kelvin (i.e. room temperature) */ 120 #define NVME_TEMPERATURE 296 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 /** Asynchronous Event Information - Notice */ 273 typedef enum { 274 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 275 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 276 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 277 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 278 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 279 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 280 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 281 PCI_NVME_AEI_NOTICE_MAX, 282 } pci_nvme_async_event_info_notice; 283 284 #define PCI_NVME_AEI_NOTICE_SHIFT 8 285 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 286 287 /* Asynchronous Event Notifications */ 288 struct pci_nvme_aen { 289 pci_nvme_async_type atype; 290 uint32_t event_data; 291 bool posted; 292 }; 293 294 /* 295 * By default, enable all Asynchrnous Event Notifications: 296 * SMART / Health Critical Warnings 297 * Namespace Attribute Notices 298 */ 299 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 300 301 typedef enum { 302 NVME_CNTRLTYPE_IO = 1, 303 NVME_CNTRLTYPE_DISCOVERY = 2, 304 NVME_CNTRLTYPE_ADMIN = 3, 305 } pci_nvme_cntrl_type; 306 307 struct pci_nvme_softc { 308 struct pci_devinst *nsc_pi; 309 310 pthread_mutex_t mtx; 311 312 struct nvme_registers regs; 313 314 struct nvme_namespace_data nsdata; 315 struct nvme_controller_data ctrldata; 316 struct nvme_error_information_entry err_log; 317 struct nvme_health_information_page health_log; 318 struct nvme_firmware_page fw_log; 319 struct nvme_ns_list ns_log; 320 321 struct pci_nvme_blockstore nvstore; 322 323 uint16_t max_qentries; /* max entries per queue */ 324 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 325 uint32_t num_cqueues; 326 uint32_t num_squeues; 327 bool num_q_is_set; /* Has host set Number of Queues */ 328 329 struct pci_nvme_ioreq *ioreqs; 330 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 331 uint32_t pending_ios; 332 uint32_t ioslots; 333 sem_t 
iosemlock; 334 335 /* 336 * Memory mapped Submission and Completion queues 337 * Each array includes both Admin and IO queues 338 */ 339 struct nvme_completion_queue *compl_queues; 340 struct nvme_submission_queue *submit_queues; 341 342 struct nvme_feature_obj feat[NVME_FID_MAX]; 343 344 enum nvme_dsm_type dataset_management; 345 346 /* Accounting for SMART data */ 347 __uint128_t read_data_units; 348 __uint128_t write_data_units; 349 __uint128_t read_commands; 350 __uint128_t write_commands; 351 uint32_t read_dunits_remainder; 352 uint32_t write_dunits_remainder; 353 354 STAILQ_HEAD(, pci_nvme_aer) aer_list; 355 pthread_mutex_t aer_mtx; 356 uint32_t aer_count; 357 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 358 pthread_t aen_tid; 359 pthread_mutex_t aen_mtx; 360 pthread_cond_t aen_cond; 361 }; 362 363 364 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 365 struct nvme_completion_queue *cq, 366 uint32_t cdw0, 367 uint16_t cid, 368 uint16_t sqid, 369 uint16_t status); 370 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 371 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 372 static void pci_nvme_io_done(struct blockif_req *, int); 373 374 /* Controller Configuration utils */ 375 #define NVME_CC_GET_EN(cc) \ 376 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 377 #define NVME_CC_GET_CSS(cc) \ 378 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 379 #define NVME_CC_GET_SHN(cc) \ 380 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 381 #define NVME_CC_GET_IOSQES(cc) \ 382 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 383 #define NVME_CC_GET_IOCQES(cc) \ 384 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 385 386 #define NVME_CC_WRITE_MASK \ 387 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 388 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 389 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 390 391 #define NVME_CC_NEN_WRITE_MASK \ 392 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 393 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 394 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 395 396 /* Controller Status utils */ 397 #define NVME_CSTS_GET_RDY(sts) \ 398 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 399 400 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 401 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) 402 403 /* Completion Queue status word utils */ 404 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 405 #define NVME_STATUS_MASK \ 406 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 407 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 408 409 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 410 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 411 412 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 413 struct nvme_feature_obj *, 414 struct nvme_command *, 415 struct nvme_completion *); 416 static void nvme_feature_temperature(struct pci_nvme_softc *, 417 struct nvme_feature_obj *, 418 struct nvme_command *, 419 struct nvme_completion *); 420 static void nvme_feature_num_queues(struct pci_nvme_softc *, 421 struct nvme_feature_obj *, 422 struct nvme_command *, 423 struct nvme_completion *); 424 static void nvme_feature_iv_config(struct pci_nvme_softc *, 425 struct nvme_feature_obj *, 426 struct nvme_command *, 427 struct nvme_completion *); 428 static void nvme_feature_async_event(struct pci_nvme_softc *, 429 struct nvme_feature_obj *, 430 struct nvme_command *, 431 struct nvme_completion *); 432 433 
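/*
 * Feature handling sketch (for reference; see nvme_opc_set_features() and
 * nvme_opc_get_features() later in this file): the Feature Identifier in
 * CDW10[7:0] indexes the feat[] table in the softc, and the optional
 * callbacks declared above may override the default completion status,
 * roughly:
 *
 *	uint8_t fid = command->cdw10 & 0xFF;
 *	struct nvme_feature_obj *feat = &sc->feat[fid];
 *
 *	if (feat->set != NULL)
 *		feat->set(sc, feat, command, compl);
 *
 * On success, the requested CDW11 value is saved in feat->cdw11 so that a
 * later Get Features command can return it.
 */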
static void *aen_thr(void *arg); 434 435 static __inline void 436 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 437 { 438 size_t len; 439 440 len = strnlen(src, dst_size); 441 memset(dst, pad, dst_size); 442 memcpy(dst, src, len); 443 } 444 445 static __inline void 446 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 447 { 448 449 *status &= ~NVME_STATUS_MASK; 450 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 451 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 452 } 453 454 static __inline void 455 pci_nvme_status_genc(uint16_t *status, uint16_t code) 456 { 457 458 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 459 } 460 461 /* 462 * Initialize the requested number or IO Submission and Completion Queues. 463 * Admin queues are allocated implicitly. 464 */ 465 static void 466 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 467 { 468 uint32_t i; 469 470 /* 471 * Allocate and initialize the Submission Queues 472 */ 473 if (nsq > NVME_QUEUES) { 474 WPRINTF("%s: clamping number of SQ from %u to %u", 475 __func__, nsq, NVME_QUEUES); 476 nsq = NVME_QUEUES; 477 } 478 479 sc->num_squeues = nsq; 480 481 sc->submit_queues = calloc(sc->num_squeues + 1, 482 sizeof(struct nvme_submission_queue)); 483 if (sc->submit_queues == NULL) { 484 WPRINTF("%s: SQ allocation failed", __func__); 485 sc->num_squeues = 0; 486 } else { 487 struct nvme_submission_queue *sq = sc->submit_queues; 488 489 for (i = 0; i < sc->num_squeues + 1; i++) 490 pthread_mutex_init(&sq[i].mtx, NULL); 491 } 492 493 /* 494 * Allocate and initialize the Completion Queues 495 */ 496 if (ncq > NVME_QUEUES) { 497 WPRINTF("%s: clamping number of CQ from %u to %u", 498 __func__, ncq, NVME_QUEUES); 499 ncq = NVME_QUEUES; 500 } 501 502 sc->num_cqueues = ncq; 503 504 sc->compl_queues = calloc(sc->num_cqueues + 1, 505 sizeof(struct nvme_completion_queue)); 506 if (sc->compl_queues == NULL) { 507 WPRINTF("%s: CQ allocation failed", __func__); 508 sc->num_cqueues = 0; 509 } else { 510 struct nvme_completion_queue *cq = sc->compl_queues; 511 512 for (i = 0; i < sc->num_cqueues + 1; i++) 513 pthread_mutex_init(&cq[i].mtx, NULL); 514 } 515 } 516 517 static void 518 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 519 { 520 struct nvme_controller_data *cd = &sc->ctrldata; 521 522 cd->vid = 0xFB5D; 523 cd->ssvid = 0x0000; 524 525 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 526 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 527 528 /* Num of submission commands that we can handle at a time (2^rab) */ 529 cd->rab = 4; 530 531 /* FreeBSD OUI */ 532 cd->ieee[0] = 0x58; 533 cd->ieee[1] = 0x9c; 534 cd->ieee[2] = 0xfc; 535 536 cd->mic = 0; 537 538 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 539 540 cd->ver = NVME_REV(1,4); 541 542 cd->cntrltype = NVME_CNTRLTYPE_IO; 543 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 544 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 545 cd->acl = 2; 546 cd->aerl = 4; 547 548 /* Advertise 1, Read-only firmware slot */ 549 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | 550 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 551 cd->lpa = 0; /* TODO: support some simple things like SMART */ 552 cd->elpe = 0; /* max error log page entries */ 553 /* 554 * Report a single power state (zero-based value) 555 * power_state[] values are left as zero to indicate "Not reported" 556 */ 557 cd->npss = 0; 558 559 /* Warning Composite Temperature Threshold */ 560 cd->wctemp = 0x0157; 561 
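	/*
	 * Critical Composite Temperature Threshold. Both thresholds are
	 * reported in Kelvin: 0x0157 is 343 K, i.e. roughly 70 degrees
	 * Celsius.
	 */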
cd->cctemp = 0x0157; 562 563 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ 564 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << 565 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); 566 567 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 568 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 569 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 570 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 571 cd->nn = 1; /* number of namespaces */ 572 573 cd->oncs = 0; 574 switch (sc->dataset_management) { 575 case NVME_DATASET_MANAGEMENT_AUTO: 576 if (sc->nvstore.deallocate) 577 cd->oncs |= NVME_ONCS_DSM; 578 break; 579 case NVME_DATASET_MANAGEMENT_ENABLE: 580 cd->oncs |= NVME_ONCS_DSM; 581 break; 582 default: 583 break; 584 } 585 586 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 587 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 588 589 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 590 } 591 592 /* 593 * Calculate the CRC-16 of the given buffer 594 * See copyright attribution at top of file 595 */ 596 static uint16_t 597 crc16(uint16_t crc, const void *buffer, unsigned int len) 598 { 599 const unsigned char *cp = buffer; 600 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 601 static uint16_t const crc16_table[256] = { 602 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 603 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 604 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 605 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 606 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 607 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 608 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 609 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 610 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 611 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 612 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 613 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 614 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 615 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 616 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 617 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 618 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 619 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 620 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 621 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 622 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 623 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 624 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 625 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 626 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 627 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 628 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 629 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 630 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 631 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 632 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 633 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 634 }; 635 636 while (len--) 637 crc = (((crc >> 8) & 0xffU) ^ 638 crc16_table[(crc ^ *cp++) & 0xffU]) 
& 0x0000ffffU; 639 return crc; 640 } 641 642 static void 643 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 644 struct nvme_namespace_data *nd) 645 { 646 647 /* Get capacity and block size information from backing store */ 648 nd->nsze = nvstore->size / nvstore->sectsz; 649 nd->ncap = nd->nsze; 650 nd->nuse = nd->nsze; 651 } 652 653 static void 654 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 655 struct nvme_namespace_data *nd, uint32_t nsid, 656 struct pci_nvme_blockstore *nvstore) 657 { 658 659 pci_nvme_init_nsdata_size(nvstore, nd); 660 661 if (nvstore->type == NVME_STOR_BLOCKIF) 662 nvstore->deallocate = blockif_candelete(nvstore->ctx); 663 664 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 665 nd->flbas = 0; 666 667 /* Create an EUI-64 if user did not provide one */ 668 if (nvstore->eui64 == 0) { 669 char *data = NULL; 670 uint64_t eui64 = nvstore->eui64; 671 672 asprintf(&data, "%s%u%u%u", get_config_value("name"), 673 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 674 sc->nsc_pi->pi_func); 675 676 if (data != NULL) { 677 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 678 free(data); 679 } 680 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 681 } 682 be64enc(nd->eui64, nvstore->eui64); 683 684 /* LBA data-sz = 2^lbads */ 685 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 686 } 687 688 static void 689 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 690 { 691 692 memset(&sc->err_log, 0, sizeof(sc->err_log)); 693 memset(&sc->health_log, 0, sizeof(sc->health_log)); 694 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 695 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 696 697 /* Set read/write remainder to round up according to spec */ 698 sc->read_dunits_remainder = 999; 699 sc->write_dunits_remainder = 999; 700 701 /* Set nominal Health values checked by implementations */ 702 sc->health_log.temperature = NVME_TEMPERATURE; 703 sc->health_log.available_spare = 100; 704 sc->health_log.available_spare_threshold = 10; 705 706 /* Set Active Firmware Info to slot 1 */ 707 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); 708 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, 709 sizeof(sc->fw_log.revision[0])); 710 } 711 712 static void 713 pci_nvme_init_features(struct pci_nvme_softc *sc) 714 { 715 enum nvme_feature fid; 716 717 for (fid = 0; fid < NVME_FID_MAX; fid++) { 718 switch (fid) { 719 case NVME_FEAT_ARBITRATION: 720 case NVME_FEAT_POWER_MANAGEMENT: 721 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 722 case NVME_FEAT_WRITE_ATOMICITY: 723 /* Mandatory but no special handling required */ 724 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 725 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 726 // this returns a data buffer 727 break; 728 case NVME_FEAT_TEMPERATURE_THRESHOLD: 729 sc->feat[fid].set = nvme_feature_temperature; 730 break; 731 case NVME_FEAT_ERROR_RECOVERY: 732 sc->feat[fid].namespace_specific = true; 733 break; 734 case NVME_FEAT_NUMBER_OF_QUEUES: 735 sc->feat[fid].set = nvme_feature_num_queues; 736 break; 737 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 738 sc->feat[fid].set = nvme_feature_iv_config; 739 break; 740 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 741 sc->feat[fid].set = nvme_feature_async_event; 742 /* Enable all AENs by default */ 743 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 744 break; 745 default: 746 sc->feat[fid].set = nvme_feature_invalid_cb; 747 sc->feat[fid].get = nvme_feature_invalid_cb; 748 } 749 } 750 } 751 752 static void 753 pci_nvme_aer_reset(struct 
pci_nvme_softc *sc) 754 { 755 756 STAILQ_INIT(&sc->aer_list); 757 sc->aer_count = 0; 758 } 759 760 static void 761 pci_nvme_aer_init(struct pci_nvme_softc *sc) 762 { 763 764 pthread_mutex_init(&sc->aer_mtx, NULL); 765 pci_nvme_aer_reset(sc); 766 } 767 768 static void 769 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 770 { 771 struct pci_nvme_aer *aer = NULL; 772 773 pthread_mutex_lock(&sc->aer_mtx); 774 while (!STAILQ_EMPTY(&sc->aer_list)) { 775 aer = STAILQ_FIRST(&sc->aer_list); 776 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 777 free(aer); 778 } 779 pthread_mutex_unlock(&sc->aer_mtx); 780 781 pci_nvme_aer_reset(sc); 782 } 783 784 static bool 785 pci_nvme_aer_available(struct pci_nvme_softc *sc) 786 { 787 788 return (sc->aer_count != 0); 789 } 790 791 static bool 792 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 793 { 794 struct nvme_controller_data *cd = &sc->ctrldata; 795 796 /* AERL is a zero based value while aer_count is one's based */ 797 return (sc->aer_count == (cd->aerl + 1)); 798 } 799 800 /* 801 * Add an Async Event Request 802 * 803 * Stores an AER to be returned later if the Controller needs to notify the 804 * host of an event. 805 * Note that while the NVMe spec doesn't require Controllers to return AER's 806 * in order, this implementation does preserve the order. 807 */ 808 static int 809 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 810 { 811 struct pci_nvme_aer *aer = NULL; 812 813 aer = calloc(1, sizeof(struct pci_nvme_aer)); 814 if (aer == NULL) 815 return (-1); 816 817 /* Save the Command ID for use in the completion message */ 818 aer->cid = cid; 819 820 pthread_mutex_lock(&sc->aer_mtx); 821 sc->aer_count++; 822 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 823 pthread_mutex_unlock(&sc->aer_mtx); 824 825 return (0); 826 } 827 828 /* 829 * Get an Async Event Request structure 830 * 831 * Returns a pointer to an AER previously submitted by the host or NULL if 832 * no AER's exist. Caller is responsible for freeing the returned struct. 
833 */ 834 static struct pci_nvme_aer * 835 pci_nvme_aer_get(struct pci_nvme_softc *sc) 836 { 837 struct pci_nvme_aer *aer = NULL; 838 839 pthread_mutex_lock(&sc->aer_mtx); 840 aer = STAILQ_FIRST(&sc->aer_list); 841 if (aer != NULL) { 842 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 843 sc->aer_count--; 844 } 845 pthread_mutex_unlock(&sc->aer_mtx); 846 847 return (aer); 848 } 849 850 static void 851 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 852 { 853 uint32_t atype; 854 855 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 856 857 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 858 sc->aen[atype].atype = atype; 859 } 860 } 861 862 static void 863 pci_nvme_aen_init(struct pci_nvme_softc *sc) 864 { 865 char nstr[80]; 866 867 pci_nvme_aen_reset(sc); 868 869 pthread_mutex_init(&sc->aen_mtx, NULL); 870 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 871 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 872 sc->nsc_pi->pi_func); 873 pthread_set_name_np(sc->aen_tid, nstr); 874 } 875 876 static void 877 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 878 { 879 880 pci_nvme_aen_reset(sc); 881 } 882 883 /* Notify the AEN thread of pending work */ 884 static void 885 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 886 { 887 888 pthread_cond_signal(&sc->aen_cond); 889 } 890 891 /* 892 * Post an Asynchronous Event Notification 893 */ 894 static int32_t 895 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 896 uint32_t event_data) 897 { 898 struct pci_nvme_aen *aen; 899 900 if (atype >= PCI_NVME_AE_TYPE_MAX) { 901 return(EINVAL); 902 } 903 904 pthread_mutex_lock(&sc->aen_mtx); 905 aen = &sc->aen[atype]; 906 907 /* Has the controller already posted an event of this type? */ 908 if (aen->posted) { 909 pthread_mutex_unlock(&sc->aen_mtx); 910 return(EALREADY); 911 } 912 913 aen->event_data = event_data; 914 aen->posted = true; 915 pthread_mutex_unlock(&sc->aen_mtx); 916 917 pci_nvme_aen_notify(sc); 918 919 return(0); 920 } 921 922 static void 923 pci_nvme_aen_process(struct pci_nvme_softc *sc) 924 { 925 struct pci_nvme_aer *aer; 926 struct pci_nvme_aen *aen; 927 pci_nvme_async_type atype; 928 uint32_t mask; 929 uint16_t status; 930 uint8_t lid; 931 932 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 933 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 934 aen = &sc->aen[atype]; 935 /* Previous iterations may have depleted the available AER's */ 936 if (!pci_nvme_aer_available(sc)) { 937 DPRINTF("%s: no AER", __func__); 938 break; 939 } 940 941 if (!aen->posted) { 942 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 943 continue; 944 } 945 946 status = NVME_SC_SUCCESS; 947 948 /* Is the event masked? 
*/ 949 mask = 950 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 951 952 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 953 switch (atype) { 954 case PCI_NVME_AE_TYPE_ERROR: 955 lid = NVME_LOG_ERROR; 956 break; 957 case PCI_NVME_AE_TYPE_SMART: 958 mask &= 0xff; 959 if ((mask & aen->event_data) == 0) 960 continue; 961 lid = NVME_LOG_HEALTH_INFORMATION; 962 break; 963 case PCI_NVME_AE_TYPE_NOTICE: 964 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 965 EPRINTLN("%s unknown AEN notice type %u", 966 __func__, aen->event_data); 967 status = NVME_SC_INTERNAL_DEVICE_ERROR; 968 break; 969 } 970 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 971 continue; 972 switch (aen->event_data) { 973 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 974 lid = NVME_LOG_CHANGED_NAMESPACE; 975 break; 976 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 977 lid = NVME_LOG_FIRMWARE_SLOT; 978 break; 979 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 980 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 981 break; 982 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 983 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 984 break; 985 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 986 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 987 break; 988 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 989 lid = NVME_LOG_LBA_STATUS_INFORMATION; 990 break; 991 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 992 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 993 break; 994 default: 995 lid = 0; 996 } 997 break; 998 default: 999 /* bad type?!? */ 1000 EPRINTLN("%s unknown AEN type %u", __func__, atype); 1001 status = NVME_SC_INTERNAL_DEVICE_ERROR; 1002 break; 1003 } 1004 1005 aer = pci_nvme_aer_get(sc); 1006 assert(aer != NULL); 1007 1008 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 1009 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1010 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 1011 aer->cid, 1012 0, /* SQID */ 1013 status); 1014 1015 aen->event_data = 0; 1016 aen->posted = false; 1017 1018 pci_generate_msix(sc->nsc_pi, 0); 1019 } 1020 } 1021 1022 static void * 1023 aen_thr(void *arg) 1024 { 1025 struct pci_nvme_softc *sc; 1026 1027 sc = arg; 1028 1029 pthread_mutex_lock(&sc->aen_mtx); 1030 for (;;) { 1031 pci_nvme_aen_process(sc); 1032 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1033 } 1034 pthread_mutex_unlock(&sc->aen_mtx); 1035 1036 pthread_exit(NULL); 1037 return (NULL); 1038 } 1039 1040 static void 1041 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1042 { 1043 uint32_t i; 1044 1045 DPRINTF("%s", __func__); 1046 1047 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1048 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1049 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1050 1051 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1052 1053 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1054 1055 sc->regs.cc = 0; 1056 1057 assert(sc->submit_queues != NULL); 1058 1059 for (i = 0; i < sc->num_squeues + 1; i++) { 1060 sc->submit_queues[i].qbase = NULL; 1061 sc->submit_queues[i].size = 0; 1062 sc->submit_queues[i].cqid = 0; 1063 sc->submit_queues[i].tail = 0; 1064 sc->submit_queues[i].head = 0; 1065 } 1066 1067 assert(sc->compl_queues != NULL); 1068 1069 for (i = 0; i < sc->num_cqueues + 1; i++) { 1070 sc->compl_queues[i].qbase = NULL; 1071 sc->compl_queues[i].size = 0; 1072 sc->compl_queues[i].tail = 0; 1073 sc->compl_queues[i].head = 0; 1074 } 1075 1076 sc->num_q_is_set = false; 1077 1078 pci_nvme_aer_destroy(sc); 1079 
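	/*
	 * A controller reset discards both the host's outstanding
	 * Asynchronous Event Request commands (above) and any notifications
	 * the emulation has queued but not yet reported (below); the host is
	 * expected to re-arm AERs after re-enabling the controller.
	 */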
pci_nvme_aen_destroy(sc); 1080 1081 /* 1082 * Clear CSTS.RDY last to prevent the host from enabling Controller 1083 * before cleanup completes 1084 */ 1085 sc->regs.csts = 0; 1086 } 1087 1088 static void 1089 pci_nvme_reset(struct pci_nvme_softc *sc) 1090 { 1091 pthread_mutex_lock(&sc->mtx); 1092 pci_nvme_reset_locked(sc); 1093 pthread_mutex_unlock(&sc->mtx); 1094 } 1095 1096 static int 1097 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1098 { 1099 uint16_t acqs, asqs; 1100 1101 DPRINTF("%s", __func__); 1102 1103 /* 1104 * NVMe 2.0 states that "enabling a controller while this field is 1105 * cleared to 0h produces undefined results" for both ACQS and 1106 * ASQS. If zero, set CFS and do not become ready. 1107 */ 1108 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); 1109 if (asqs < 2) { 1110 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, 1111 asqs - 1, sc->regs.aqa); 1112 sc->regs.csts |= NVME_CSTS_CFS; 1113 return (-1); 1114 } 1115 sc->submit_queues[0].size = asqs; 1116 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1117 sizeof(struct nvme_command) * asqs); 1118 if (sc->submit_queues[0].qbase == NULL) { 1119 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, 1120 sc->regs.asq); 1121 sc->regs.csts |= NVME_CSTS_CFS; 1122 return (-1); 1123 } 1124 1125 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1126 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1127 1128 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1129 NVME_AQA_REG_ACQS_MASK); 1130 if (acqs < 2) { 1131 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, 1132 acqs - 1, sc->regs.aqa); 1133 sc->regs.csts |= NVME_CSTS_CFS; 1134 return (-1); 1135 } 1136 sc->compl_queues[0].size = acqs; 1137 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1138 sizeof(struct nvme_completion) * acqs); 1139 if (sc->compl_queues[0].qbase == NULL) { 1140 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, 1141 sc->regs.acq); 1142 sc->regs.csts |= NVME_CSTS_CFS; 1143 return (-1); 1144 } 1145 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1146 1147 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1148 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1149 1150 return (0); 1151 } 1152 1153 static int 1154 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1155 size_t len, enum nvme_copy_dir dir) 1156 { 1157 uint8_t *p; 1158 size_t bytes; 1159 1160 if (len > (8 * 1024)) { 1161 return (-1); 1162 } 1163 1164 /* Copy from the start of prp1 to the end of the physical page */ 1165 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1166 bytes = MIN(bytes, len); 1167 1168 p = vm_map_gpa(ctx, prp1, bytes); 1169 if (p == NULL) { 1170 return (-1); 1171 } 1172 1173 if (dir == NVME_COPY_TO_PRP) 1174 memcpy(p, b, bytes); 1175 else 1176 memcpy(b, p, bytes); 1177 1178 b += bytes; 1179 1180 len -= bytes; 1181 if (len == 0) { 1182 return (0); 1183 } 1184 1185 len = MIN(len, PAGE_SIZE); 1186 1187 p = vm_map_gpa(ctx, prp2, len); 1188 if (p == NULL) { 1189 return (-1); 1190 } 1191 1192 if (dir == NVME_COPY_TO_PRP) 1193 memcpy(p, b, len); 1194 else 1195 memcpy(b, p, len); 1196 1197 return (0); 1198 } 1199 1200 /* 1201 * Write a Completion Queue Entry update 1202 * 1203 * Write the completion and update the doorbell value 1204 */ 1205 static void 1206 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1207 struct nvme_completion_queue *cq, 1208 uint32_t cdw0, 1209 uint16_t cid, 1210 uint16_t sqid, 1211 uint16_t status) 1212 { 1213 struct nvme_submission_queue *sq = 
&sc->submit_queues[sqid]; 1214 struct nvme_completion *cqe; 1215 1216 assert(cq->qbase != NULL); 1217 1218 pthread_mutex_lock(&cq->mtx); 1219 1220 cqe = &cq->qbase[cq->tail]; 1221 1222 /* Flip the phase bit */ 1223 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1224 1225 cqe->cdw0 = cdw0; 1226 cqe->sqhd = sq->head; 1227 cqe->sqid = sqid; 1228 cqe->cid = cid; 1229 cqe->status = status; 1230 1231 cq->tail++; 1232 if (cq->tail >= cq->size) { 1233 cq->tail = 0; 1234 } 1235 1236 pthread_mutex_unlock(&cq->mtx); 1237 } 1238 1239 static int 1240 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1241 struct nvme_completion* compl) 1242 { 1243 uint16_t qid = command->cdw10 & 0xffff; 1244 1245 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1246 if (qid == 0 || qid > sc->num_squeues || 1247 (sc->submit_queues[qid].qbase == NULL)) { 1248 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1249 __func__, qid, sc->num_squeues); 1250 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1251 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1252 return (1); 1253 } 1254 1255 sc->submit_queues[qid].qbase = NULL; 1256 sc->submit_queues[qid].cqid = 0; 1257 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1258 return (1); 1259 } 1260 1261 static int 1262 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1263 struct nvme_completion* compl) 1264 { 1265 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1266 uint16_t qid = command->cdw10 & 0xffff; 1267 struct nvme_submission_queue *nsq; 1268 1269 if ((qid == 0) || (qid > sc->num_squeues) || 1270 (sc->submit_queues[qid].qbase != NULL)) { 1271 WPRINTF("%s queue index %u > num_squeues %u", 1272 __func__, qid, sc->num_squeues); 1273 pci_nvme_status_tc(&compl->status, 1274 NVME_SCT_COMMAND_SPECIFIC, 1275 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1276 return (1); 1277 } 1278 1279 nsq = &sc->submit_queues[qid]; 1280 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1281 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1282 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1283 /* 1284 * Queues must specify at least two entries 1285 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1286 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1287 */ 1288 pci_nvme_status_tc(&compl->status, 1289 NVME_SCT_COMMAND_SPECIFIC, 1290 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1291 return (1); 1292 } 1293 nsq->head = nsq->tail = 0; 1294 1295 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1296 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1297 pci_nvme_status_tc(&compl->status, 1298 NVME_SCT_COMMAND_SPECIFIC, 1299 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1300 return (1); 1301 } 1302 1303 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1304 pci_nvme_status_tc(&compl->status, 1305 NVME_SCT_COMMAND_SPECIFIC, 1306 NVME_SC_COMPLETION_QUEUE_INVALID); 1307 return (1); 1308 } 1309 1310 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1311 1312 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1313 sizeof(struct nvme_command) * (size_t)nsq->size); 1314 1315 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1316 qid, nsq->size, nsq->qbase, nsq->cqid); 1317 1318 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1319 1320 DPRINTF("%s completed creating IOSQ qid %u", 1321 __func__, qid); 1322 } else { 1323 /* 1324 * Guest sent non-cont submission queue request. 1325 * This setting is unsupported by this emulation. 
1326 */ 1327 WPRINTF("%s unsupported non-contig (list-based) " 1328 "create i/o submission queue", __func__); 1329 1330 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1331 } 1332 return (1); 1333 } 1334 1335 static int 1336 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1337 struct nvme_completion* compl) 1338 { 1339 uint16_t qid = command->cdw10 & 0xffff; 1340 uint16_t sqid; 1341 1342 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1343 if (qid == 0 || qid > sc->num_cqueues || 1344 (sc->compl_queues[qid].qbase == NULL)) { 1345 WPRINTF("%s queue index %u / num_cqueues %u", 1346 __func__, qid, sc->num_cqueues); 1347 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1348 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1349 return (1); 1350 } 1351 1352 /* Deleting an Active CQ is an error */ 1353 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1354 if (sc->submit_queues[sqid].cqid == qid) { 1355 pci_nvme_status_tc(&compl->status, 1356 NVME_SCT_COMMAND_SPECIFIC, 1357 NVME_SC_INVALID_QUEUE_DELETION); 1358 return (1); 1359 } 1360 1361 sc->compl_queues[qid].qbase = NULL; 1362 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1363 return (1); 1364 } 1365 1366 static int 1367 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1368 struct nvme_completion* compl) 1369 { 1370 struct nvme_completion_queue *ncq; 1371 uint16_t qid = command->cdw10 & 0xffff; 1372 1373 /* Only support Physically Contiguous queues */ 1374 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1375 WPRINTF("%s unsupported non-contig (list-based) " 1376 "create i/o completion queue", 1377 __func__); 1378 1379 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1380 return (1); 1381 } 1382 1383 if ((qid == 0) || (qid > sc->num_cqueues) || 1384 (sc->compl_queues[qid].qbase != NULL)) { 1385 WPRINTF("%s queue index %u > num_cqueues %u", 1386 __func__, qid, sc->num_cqueues); 1387 pci_nvme_status_tc(&compl->status, 1388 NVME_SCT_COMMAND_SPECIFIC, 1389 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1390 return (1); 1391 } 1392 1393 ncq = &sc->compl_queues[qid]; 1394 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1395 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1396 if (ncq->intr_vec > (sc->max_queues + 1)) { 1397 pci_nvme_status_tc(&compl->status, 1398 NVME_SCT_COMMAND_SPECIFIC, 1399 NVME_SC_INVALID_INTERRUPT_VECTOR); 1400 return (1); 1401 } 1402 1403 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1404 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1405 /* 1406 * Queues must specify at least two entries 1407 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1408 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1409 */ 1410 pci_nvme_status_tc(&compl->status, 1411 NVME_SCT_COMMAND_SPECIFIC, 1412 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1413 return (1); 1414 } 1415 ncq->head = ncq->tail = 0; 1416 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1417 command->prp1, 1418 sizeof(struct nvme_command) * (size_t)ncq->size); 1419 1420 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1421 1422 1423 return (1); 1424 } 1425 1426 static int 1427 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1428 struct nvme_completion* compl) 1429 { 1430 uint64_t logoff; 1431 uint32_t logsize; 1432 uint8_t logpage; 1433 1434 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1435 1436 /* 1437 * Command specifies the number of dwords to return in fields NUMDU 1438 * and NUMDL. This is a zero-based value. 
1439 */ 1440 logpage = command->cdw10 & 0xFF; 1441 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1442 logsize *= sizeof(uint32_t); 1443 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1444 1445 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1446 1447 switch (logpage) { 1448 case NVME_LOG_ERROR: 1449 if (logoff >= sizeof(sc->err_log)) { 1450 pci_nvme_status_genc(&compl->status, 1451 NVME_SC_INVALID_FIELD); 1452 break; 1453 } 1454 1455 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1456 command->prp2, (uint8_t *)&sc->err_log + logoff, 1457 MIN(logsize - logoff, sizeof(sc->err_log)), 1458 NVME_COPY_TO_PRP); 1459 break; 1460 case NVME_LOG_HEALTH_INFORMATION: 1461 if (logoff >= sizeof(sc->health_log)) { 1462 pci_nvme_status_genc(&compl->status, 1463 NVME_SC_INVALID_FIELD); 1464 break; 1465 } 1466 1467 pthread_mutex_lock(&sc->mtx); 1468 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1469 sizeof(sc->health_log.data_units_read)); 1470 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1471 sizeof(sc->health_log.data_units_written)); 1472 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1473 sizeof(sc->health_log.host_read_commands)); 1474 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1475 sizeof(sc->health_log.host_write_commands)); 1476 pthread_mutex_unlock(&sc->mtx); 1477 1478 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1479 command->prp2, (uint8_t *)&sc->health_log + logoff, 1480 MIN(logsize - logoff, sizeof(sc->health_log)), 1481 NVME_COPY_TO_PRP); 1482 break; 1483 case NVME_LOG_FIRMWARE_SLOT: 1484 if (logoff >= sizeof(sc->fw_log)) { 1485 pci_nvme_status_genc(&compl->status, 1486 NVME_SC_INVALID_FIELD); 1487 break; 1488 } 1489 1490 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1491 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1492 MIN(logsize - logoff, sizeof(sc->fw_log)), 1493 NVME_COPY_TO_PRP); 1494 break; 1495 case NVME_LOG_CHANGED_NAMESPACE: 1496 if (logoff >= sizeof(sc->ns_log)) { 1497 pci_nvme_status_genc(&compl->status, 1498 NVME_SC_INVALID_FIELD); 1499 break; 1500 } 1501 1502 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1503 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1504 MIN(logsize - logoff, sizeof(sc->ns_log)), 1505 NVME_COPY_TO_PRP); 1506 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1507 break; 1508 default: 1509 DPRINTF("%s get log page %x command not supported", 1510 __func__, logpage); 1511 1512 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1513 NVME_SC_INVALID_LOG_PAGE); 1514 } 1515 1516 return (1); 1517 } 1518 1519 static int 1520 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1521 struct nvme_completion* compl) 1522 { 1523 void *dest; 1524 uint16_t status; 1525 1526 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1527 command->cdw10 & 0xFF, command->nsid); 1528 1529 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1530 1531 switch (command->cdw10 & 0xFF) { 1532 case 0x00: /* return Identify Namespace data structure */ 1533 /* Global NS only valid with NS Management */ 1534 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1535 pci_nvme_status_genc(&status, 1536 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1537 break; 1538 } 1539 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1540 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1541 NVME_COPY_TO_PRP); 1542 break; 1543 case 0x01: /* return Identify Controller data structure */ 1544 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, 
command->prp1, 1545 command->prp2, (uint8_t *)&sc->ctrldata, 1546 sizeof(sc->ctrldata), 1547 NVME_COPY_TO_PRP); 1548 break; 1549 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1550 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1551 sizeof(uint32_t) * 1024); 1552 /* All unused entries shall be zero */ 1553 memset(dest, 0, sizeof(uint32_t) * 1024); 1554 ((uint32_t *)dest)[0] = 1; 1555 break; 1556 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1557 if (command->nsid != 1) { 1558 pci_nvme_status_genc(&status, 1559 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1560 break; 1561 } 1562 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1563 sizeof(uint32_t) * 1024); 1564 /* All bytes after the descriptor shall be zero */ 1565 memset(dest, 0, sizeof(uint32_t) * 1024); 1566 1567 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1568 ((uint8_t *)dest)[0] = 1; 1569 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1570 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); 1571 break; 1572 case 0x13: 1573 /* 1574 * Controller list is optional but used by UNH tests. Return 1575 * a valid but empty list. 1576 */ 1577 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1578 sizeof(uint16_t) * 2048); 1579 memset(dest, 0, sizeof(uint16_t) * 2048); 1580 break; 1581 default: 1582 DPRINTF("%s unsupported identify command requested 0x%x", 1583 __func__, command->cdw10 & 0xFF); 1584 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1585 break; 1586 } 1587 1588 compl->status = status; 1589 return (1); 1590 } 1591 1592 static const char * 1593 nvme_fid_to_name(uint8_t fid) 1594 { 1595 const char *name; 1596 1597 switch (fid) { 1598 case NVME_FEAT_ARBITRATION: 1599 name = "Arbitration"; 1600 break; 1601 case NVME_FEAT_POWER_MANAGEMENT: 1602 name = "Power Management"; 1603 break; 1604 case NVME_FEAT_LBA_RANGE_TYPE: 1605 name = "LBA Range Type"; 1606 break; 1607 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1608 name = "Temperature Threshold"; 1609 break; 1610 case NVME_FEAT_ERROR_RECOVERY: 1611 name = "Error Recovery"; 1612 break; 1613 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1614 name = "Volatile Write Cache"; 1615 break; 1616 case NVME_FEAT_NUMBER_OF_QUEUES: 1617 name = "Number of Queues"; 1618 break; 1619 case NVME_FEAT_INTERRUPT_COALESCING: 1620 name = "Interrupt Coalescing"; 1621 break; 1622 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1623 name = "Interrupt Vector Configuration"; 1624 break; 1625 case NVME_FEAT_WRITE_ATOMICITY: 1626 name = "Write Atomicity Normal"; 1627 break; 1628 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1629 name = "Asynchronous Event Configuration"; 1630 break; 1631 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1632 name = "Autonomous Power State Transition"; 1633 break; 1634 case NVME_FEAT_HOST_MEMORY_BUFFER: 1635 name = "Host Memory Buffer"; 1636 break; 1637 case NVME_FEAT_TIMESTAMP: 1638 name = "Timestamp"; 1639 break; 1640 case NVME_FEAT_KEEP_ALIVE_TIMER: 1641 name = "Keep Alive Timer"; 1642 break; 1643 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1644 name = "Host Controlled Thermal Management"; 1645 break; 1646 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1647 name = "Non-Operation Power State Config"; 1648 break; 1649 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1650 name = "Read Recovery Level Config"; 1651 break; 1652 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1653 name = "Predictable Latency Mode Config"; 1654 break; 1655 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1656 name = "Predictable Latency Mode Window"; 1657 break; 1658 case 
NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1659 name = "LBA Status Information Report Interval"; 1660 break; 1661 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1662 name = "Host Behavior Support"; 1663 break; 1664 case NVME_FEAT_SANITIZE_CONFIG: 1665 name = "Sanitize Config"; 1666 break; 1667 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1668 name = "Endurance Group Event Configuration"; 1669 break; 1670 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1671 name = "Software Progress Marker"; 1672 break; 1673 case NVME_FEAT_HOST_IDENTIFIER: 1674 name = "Host Identifier"; 1675 break; 1676 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1677 name = "Reservation Notification Mask"; 1678 break; 1679 case NVME_FEAT_RESERVATION_PERSISTENCE: 1680 name = "Reservation Persistence"; 1681 break; 1682 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1683 name = "Namespace Write Protection Config"; 1684 break; 1685 default: 1686 name = "Unknown"; 1687 break; 1688 } 1689 1690 return (name); 1691 } 1692 1693 static void 1694 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, 1695 struct nvme_feature_obj *feat __unused, 1696 struct nvme_command *command __unused, 1697 struct nvme_completion *compl) 1698 { 1699 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1700 } 1701 1702 static void 1703 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1704 struct nvme_feature_obj *feat __unused, 1705 struct nvme_command *command, 1706 struct nvme_completion *compl) 1707 { 1708 uint32_t i; 1709 uint32_t cdw11 = command->cdw11; 1710 uint16_t iv; 1711 bool cd; 1712 1713 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1714 1715 iv = cdw11 & 0xffff; 1716 cd = cdw11 & (1 << 16); 1717 1718 if (iv > (sc->max_queues + 1)) { 1719 return; 1720 } 1721 1722 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1723 if ((iv == 0) && !cd) 1724 return; 1725 1726 /* Requested Interrupt Vector must be used by a CQ */ 1727 for (i = 0; i < sc->num_cqueues + 1; i++) { 1728 if (sc->compl_queues[i].intr_vec == iv) { 1729 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1730 } 1731 } 1732 } 1733 1734 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1735 static void 1736 nvme_feature_async_event(struct pci_nvme_softc *sc __unused, 1737 struct nvme_feature_obj *feat __unused, 1738 struct nvme_command *command, 1739 struct nvme_completion *compl) 1740 { 1741 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1742 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1743 } 1744 1745 #define NVME_TEMP_THRESH_OVER 0 1746 #define NVME_TEMP_THRESH_UNDER 1 1747 static void 1748 nvme_feature_temperature(struct pci_nvme_softc *sc, 1749 struct nvme_feature_obj *feat __unused, 1750 struct nvme_command *command, 1751 struct nvme_completion *compl) 1752 { 1753 uint16_t tmpth; /* Temperature Threshold */ 1754 uint8_t tmpsel; /* Threshold Temperature Select */ 1755 uint8_t thsel; /* Threshold Type Select */ 1756 bool set_crit = false; 1757 1758 tmpth = command->cdw11 & 0xffff; 1759 tmpsel = (command->cdw11 >> 16) & 0xf; 1760 thsel = (command->cdw11 >> 20) & 0x3; 1761 1762 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1763 1764 /* Check for unsupported values */ 1765 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1766 (thsel > NVME_TEMP_THRESH_UNDER)) { 1767 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1768 return; 1769 } 1770 1771 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1772 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1773 set_crit = true; 1774 1775 pthread_mutex_lock(&sc->mtx); 1776 if (set_crit) 1777 sc->health_log.critical_warning |= 1778 NVME_CRIT_WARN_ST_TEMPERATURE; 1779 else 1780 sc->health_log.critical_warning &= 1781 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1782 pthread_mutex_unlock(&sc->mtx); 1783 1784 if (set_crit) 1785 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1786 sc->health_log.critical_warning); 1787 1788 1789 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1790 } 1791 1792 static void 1793 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1794 struct nvme_feature_obj *feat __unused, 1795 struct nvme_command *command, 1796 struct nvme_completion *compl) 1797 { 1798 uint16_t nqr; /* Number of Queues Requested */ 1799 1800 if (sc->num_q_is_set) { 1801 WPRINTF("%s: Number of Queues already set", __func__); 1802 pci_nvme_status_genc(&compl->status, 1803 NVME_SC_COMMAND_SEQUENCE_ERROR); 1804 return; 1805 } 1806 1807 nqr = command->cdw11 & 0xFFFF; 1808 if (nqr == 0xffff) { 1809 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1810 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1811 return; 1812 } 1813 1814 sc->num_squeues = ONE_BASED(nqr); 1815 if (sc->num_squeues > sc->max_queues) { 1816 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1817 sc->max_queues); 1818 sc->num_squeues = sc->max_queues; 1819 } 1820 1821 nqr = (command->cdw11 >> 16) & 0xFFFF; 1822 if (nqr == 0xffff) { 1823 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1824 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1825 return; 1826 } 1827 1828 sc->num_cqueues = ONE_BASED(nqr); 1829 if (sc->num_cqueues > sc->max_queues) { 1830 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1831 sc->max_queues); 1832 sc->num_cqueues = sc->max_queues; 1833 } 1834 1835 /* Patch the command value which will be saved on callback's return */ 1836 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1837 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1838 1839 sc->num_q_is_set = true; 1840 } 1841 1842 static int 1843 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1844 struct nvme_completion *compl) 1845 { 1846 struct nvme_feature_obj *feat; 1847 uint32_t nsid = command->nsid; 1848 uint8_t fid = command->cdw10 & 0xFF; 1849 1850 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1851 1852 if (fid >= NVME_FID_MAX) { 1853 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1854 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1855 return (1); 1856 } 1857 feat = &sc->feat[fid]; 1858 1859 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1860 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1861 return (1); 1862 } 1863 1864 if (!feat->namespace_specific && 1865 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1866 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1867 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1868 return (1); 1869 } 1870 1871 compl->cdw0 = 0; 1872 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1873 1874 if (feat->set) 1875 feat->set(sc, feat, command, compl); 1876 1877 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1878 if (compl->status == NVME_SC_SUCCESS) { 1879 feat->cdw11 = command->cdw11; 1880 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1881 (command->cdw11 != 0)) 1882 pci_nvme_aen_notify(sc); 1883 } 1884 1885 return (0); 1886 } 1887 1888 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1889 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1890 1891 static int 1892 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1893 struct nvme_completion* compl) 1894 { 1895 struct nvme_feature_obj *feat; 1896 uint8_t fid = command->cdw10 & 0xFF; 1897 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1898 1899 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1900 1901 if (fid >= NVME_FID_MAX) { 1902 
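		/*
		 * Feature Identifier is beyond the emulated feature table;
		 * fail the command with Invalid Field.
		 */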
DPRINTF("%s invalid feature 0x%x", __func__, fid); 1903 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1904 return (1); 1905 } 1906 1907 compl->cdw0 = 0; 1908 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1909 1910 feat = &sc->feat[fid]; 1911 if (feat->get) { 1912 feat->get(sc, feat, command, compl); 1913 } 1914 1915 if (compl->status == NVME_SC_SUCCESS) { 1916 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1917 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1918 else 1919 compl->cdw0 = feat->cdw11; 1920 } 1921 1922 return (0); 1923 } 1924 1925 static int 1926 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1927 struct nvme_completion* compl) 1928 { 1929 uint8_t ses, lbaf, pi; 1930 1931 /* Only supports Secure Erase Setting - User Data Erase */ 1932 ses = (command->cdw10 >> 9) & 0x7; 1933 if (ses > 0x1) { 1934 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1935 return (1); 1936 } 1937 1938 /* Only supports a single LBA Format */ 1939 lbaf = command->cdw10 & 0xf; 1940 if (lbaf != 0) { 1941 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1942 NVME_SC_INVALID_FORMAT); 1943 return (1); 1944 } 1945 1946 /* Doesn't support Protection Information */ 1947 pi = (command->cdw10 >> 5) & 0x7; 1948 if (pi != 0) { 1949 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1950 return (1); 1951 } 1952 1953 if (sc->nvstore.type == NVME_STOR_RAM) { 1954 if (sc->nvstore.ctx) 1955 free(sc->nvstore.ctx); 1956 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1957 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1958 } else { 1959 struct pci_nvme_ioreq *req; 1960 int err; 1961 1962 req = pci_nvme_get_ioreq(sc); 1963 if (req == NULL) { 1964 pci_nvme_status_genc(&compl->status, 1965 NVME_SC_INTERNAL_DEVICE_ERROR); 1966 WPRINTF("%s: unable to allocate IO req", __func__); 1967 return (1); 1968 } 1969 req->nvme_sq = &sc->submit_queues[0]; 1970 req->sqid = 0; 1971 req->opc = command->opc; 1972 req->cid = command->cid; 1973 req->nsid = command->nsid; 1974 1975 req->io_req.br_offset = 0; 1976 req->io_req.br_resid = sc->nvstore.size; 1977 req->io_req.br_callback = pci_nvme_io_done; 1978 1979 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1980 if (err) { 1981 pci_nvme_status_genc(&compl->status, 1982 NVME_SC_INTERNAL_DEVICE_ERROR); 1983 pci_nvme_release_ioreq(sc, req); 1984 } else 1985 compl->status = NVME_NO_STATUS; 1986 } 1987 1988 return (1); 1989 } 1990 1991 static int 1992 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, 1993 struct nvme_completion *compl) 1994 { 1995 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1996 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1997 1998 /* TODO: search for the command ID and abort it */ 1999 2000 compl->cdw0 = 1; 2001 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 2002 return (1); 2003 } 2004 2005 static int 2006 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 2007 struct nvme_command* command, struct nvme_completion* compl) 2008 { 2009 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 2010 sc->aer_count, sc->ctrldata.aerl, command->cid); 2011 2012 /* Don't exceed the Async Event Request Limit (AERL). 
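Note that AERL is a zero-based value, i.e. up to aerl + 1 requests may be outstanding at once.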
*/ 2013 if (pci_nvme_aer_limit_reached(sc)) { 2014 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2015 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2016 return (1); 2017 } 2018 2019 if (pci_nvme_aer_add(sc, command->cid)) { 2020 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2021 NVME_SC_INTERNAL_DEVICE_ERROR); 2022 return (1); 2023 } 2024 2025 /* 2026 * Raise events when they happen based on the Set Features cmd. 2027 * These events happen async, so only set completion successful if 2028 * there is an event reflective of the request to get event. 2029 */ 2030 compl->status = NVME_NO_STATUS; 2031 pci_nvme_aen_notify(sc); 2032 2033 return (0); 2034 } 2035 2036 static void 2037 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2038 { 2039 struct nvme_completion compl; 2040 struct nvme_command *cmd; 2041 struct nvme_submission_queue *sq; 2042 struct nvme_completion_queue *cq; 2043 uint16_t sqhead; 2044 2045 DPRINTF("%s index %u", __func__, (uint32_t)value); 2046 2047 sq = &sc->submit_queues[0]; 2048 cq = &sc->compl_queues[0]; 2049 2050 pthread_mutex_lock(&sq->mtx); 2051 2052 sqhead = sq->head; 2053 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2054 2055 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2056 cmd = &(sq->qbase)[sqhead]; 2057 compl.cdw0 = 0; 2058 compl.status = 0; 2059 2060 switch (cmd->opc) { 2061 case NVME_OPC_DELETE_IO_SQ: 2062 DPRINTF("%s command DELETE_IO_SQ", __func__); 2063 nvme_opc_delete_io_sq(sc, cmd, &compl); 2064 break; 2065 case NVME_OPC_CREATE_IO_SQ: 2066 DPRINTF("%s command CREATE_IO_SQ", __func__); 2067 nvme_opc_create_io_sq(sc, cmd, &compl); 2068 break; 2069 case NVME_OPC_DELETE_IO_CQ: 2070 DPRINTF("%s command DELETE_IO_CQ", __func__); 2071 nvme_opc_delete_io_cq(sc, cmd, &compl); 2072 break; 2073 case NVME_OPC_CREATE_IO_CQ: 2074 DPRINTF("%s command CREATE_IO_CQ", __func__); 2075 nvme_opc_create_io_cq(sc, cmd, &compl); 2076 break; 2077 case NVME_OPC_GET_LOG_PAGE: 2078 DPRINTF("%s command GET_LOG_PAGE", __func__); 2079 nvme_opc_get_log_page(sc, cmd, &compl); 2080 break; 2081 case NVME_OPC_IDENTIFY: 2082 DPRINTF("%s command IDENTIFY", __func__); 2083 nvme_opc_identify(sc, cmd, &compl); 2084 break; 2085 case NVME_OPC_ABORT: 2086 DPRINTF("%s command ABORT", __func__); 2087 nvme_opc_abort(sc, cmd, &compl); 2088 break; 2089 case NVME_OPC_SET_FEATURES: 2090 DPRINTF("%s command SET_FEATURES", __func__); 2091 nvme_opc_set_features(sc, cmd, &compl); 2092 break; 2093 case NVME_OPC_GET_FEATURES: 2094 DPRINTF("%s command GET_FEATURES", __func__); 2095 nvme_opc_get_features(sc, cmd, &compl); 2096 break; 2097 case NVME_OPC_FIRMWARE_ACTIVATE: 2098 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2099 pci_nvme_status_tc(&compl.status, 2100 NVME_SCT_COMMAND_SPECIFIC, 2101 NVME_SC_INVALID_FIRMWARE_SLOT); 2102 break; 2103 case NVME_OPC_ASYNC_EVENT_REQUEST: 2104 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2105 nvme_opc_async_event_req(sc, cmd, &compl); 2106 break; 2107 case NVME_OPC_FORMAT_NVM: 2108 DPRINTF("%s command FORMAT_NVM", __func__); 2109 if ((sc->ctrldata.oacs & 2110 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2111 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2112 break; 2113 } 2114 nvme_opc_format_nvm(sc, cmd, &compl); 2115 break; 2116 case NVME_OPC_SECURITY_SEND: 2117 case NVME_OPC_SECURITY_RECEIVE: 2118 case NVME_OPC_SANITIZE: 2119 case NVME_OPC_GET_LBA_STATUS: 2120 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2121 cmd->opc); 2122 /* Valid but unsupported opcodes */ 2123 
pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2124 break; 2125 default: 2126 DPRINTF("%s command OPC=%#X (not implemented)", 2127 __func__, 2128 cmd->opc); 2129 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2130 } 2131 sqhead = (sqhead + 1) % sq->size; 2132 2133 if (NVME_COMPLETION_VALID(compl)) { 2134 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2135 compl.cdw0, 2136 cmd->cid, 2137 0, /* SQID */ 2138 compl.status); 2139 } 2140 } 2141 2142 DPRINTF("setting sqhead %u", sqhead); 2143 sq->head = sqhead; 2144 2145 if (cq->head != cq->tail) 2146 pci_generate_msix(sc->nsc_pi, 0); 2147 2148 pthread_mutex_unlock(&sq->mtx); 2149 } 2150 2151 /* 2152 * Update the Write and Read statistics reported in SMART data 2153 * 2154 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up. 2155 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2156 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2157 */ 2158 static void 2159 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2160 size_t bytes, uint16_t status) 2161 { 2162 2163 pthread_mutex_lock(&sc->mtx); 2164 switch (opc) { 2165 case NVME_OPC_WRITE: 2166 sc->write_commands++; 2167 if (status != NVME_SC_SUCCESS) 2168 break; 2169 sc->write_dunits_remainder += (bytes / 512); 2170 while (sc->write_dunits_remainder >= 1000) { 2171 sc->write_data_units++; 2172 sc->write_dunits_remainder -= 1000; 2173 } 2174 break; 2175 case NVME_OPC_READ: 2176 sc->read_commands++; 2177 if (status != NVME_SC_SUCCESS) 2178 break; 2179 sc->read_dunits_remainder += (bytes / 512); 2180 while (sc->read_dunits_remainder >= 1000) { 2181 sc->read_data_units++; 2182 sc->read_dunits_remainder -= 1000; 2183 } 2184 break; 2185 default: 2186 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2187 break; 2188 } 2189 pthread_mutex_unlock(&sc->mtx); 2190 } 2191 2192 /* 2193 * Check if the combination of Starting LBA (slba) and number of blocks 2194 * exceeds the range of the underlying storage. 2195 * 2196 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2197 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2198 * overflow. 2199 */ 2200 static bool 2201 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2202 uint32_t nblocks) 2203 { 2204 size_t offset, bytes; 2205 2206 /* Overflow check of multiplying Starting LBA by the sector size */ 2207 if (slba >> (64 - nvstore->sectsz_bits)) 2208 return (true); 2209 2210 offset = slba << nvstore->sectsz_bits; 2211 bytes = nblocks << nvstore->sectsz_bits; 2212 2213 /* Overflow check of Number of Logical Blocks */ 2214 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2215 return (true); 2216 2217 return (false); 2218 } 2219 2220 static int 2221 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2222 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2223 { 2224 int iovidx; 2225 bool range_is_contiguous; 2226 2227 if (req == NULL) 2228 return (-1); 2229 2230 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2231 return (-1); 2232 } 2233 2234 /* 2235 * Minimize the number of IOVs by concatenating contiguous address 2236 * ranges. If the IOV count is zero, there is no previous range to 2237 * concatenate. 
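For example (illustrative guest addresses only): with 4 KiB pages, a PRP entry at 0x11000 following a previous 4 KiB range at 0x10000 extends that range into a single 8 KiB iovec, whereas a PRP entry at 0x13000 is not adjacent and starts a new iovec.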
2238 */ 2239 if (req->io_req.br_iovcnt == 0) 2240 range_is_contiguous = false; 2241 else 2242 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2243 2244 if (range_is_contiguous) { 2245 iovidx = req->io_req.br_iovcnt - 1; 2246 2247 req->io_req.br_iov[iovidx].iov_base = 2248 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2249 req->prev_gpaddr, size); 2250 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2251 return (-1); 2252 2253 req->prev_size += size; 2254 req->io_req.br_resid += size; 2255 2256 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2257 } else { 2258 iovidx = req->io_req.br_iovcnt; 2259 if (iovidx == 0) { 2260 req->io_req.br_offset = offset; 2261 req->io_req.br_resid = 0; 2262 req->io_req.br_param = req; 2263 } 2264 2265 req->io_req.br_iov[iovidx].iov_base = 2266 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2267 gpaddr, size); 2268 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2269 return (-1); 2270 2271 req->io_req.br_iov[iovidx].iov_len = size; 2272 2273 req->prev_gpaddr = gpaddr; 2274 req->prev_size = size; 2275 req->io_req.br_resid += size; 2276 2277 req->io_req.br_iovcnt++; 2278 } 2279 2280 return (0); 2281 } 2282 2283 static void 2284 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2285 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2286 { 2287 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2288 2289 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2290 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2291 NVME_STATUS_GET_SC(status)); 2292 2293 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2294 2295 if (cq->head != cq->tail) { 2296 if (cq->intr_en & NVME_CQ_INTEN) { 2297 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2298 } else { 2299 DPRINTF("%s: CQ%u interrupt disabled", 2300 __func__, sq->cqid); 2301 } 2302 } 2303 } 2304 2305 static void 2306 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2307 { 2308 req->sc = NULL; 2309 req->nvme_sq = NULL; 2310 req->sqid = 0; 2311 2312 pthread_mutex_lock(&sc->mtx); 2313 2314 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2315 sc->pending_ios--; 2316 2317 /* when no more IO pending, can set to ready if device reset/enabled */ 2318 if (sc->pending_ios == 0 && 2319 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2320 sc->regs.csts |= NVME_CSTS_RDY; 2321 2322 pthread_mutex_unlock(&sc->mtx); 2323 2324 sem_post(&sc->iosemlock); 2325 } 2326 2327 static struct pci_nvme_ioreq * 2328 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2329 { 2330 struct pci_nvme_ioreq *req = NULL; 2331 2332 sem_wait(&sc->iosemlock); 2333 pthread_mutex_lock(&sc->mtx); 2334 2335 req = STAILQ_FIRST(&sc->ioreqs_free); 2336 assert(req != NULL); 2337 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2338 2339 req->sc = sc; 2340 2341 sc->pending_ios++; 2342 2343 pthread_mutex_unlock(&sc->mtx); 2344 2345 req->io_req.br_iovcnt = 0; 2346 req->io_req.br_offset = 0; 2347 req->io_req.br_resid = 0; 2348 req->io_req.br_param = req; 2349 req->prev_gpaddr = 0; 2350 req->prev_size = 0; 2351 2352 return req; 2353 } 2354 2355 static void 2356 pci_nvme_io_done(struct blockif_req *br, int err) 2357 { 2358 struct pci_nvme_ioreq *req = br->br_param; 2359 struct nvme_submission_queue *sq = req->nvme_sq; 2360 uint16_t code, status; 2361 2362 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2363 2364 /* TODO return correct error */ 2365 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2366 pci_nvme_status_genc(&status, code); 2367 2368 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2369 pci_nvme_stats_write_read_update(req->sc, req->opc, 2370 req->bytes, status); 2371 pci_nvme_release_ioreq(req->sc, req); 2372 } 2373 2374 /* 2375 * Implements the Flush command. The specification states: 2376 * If a volatile write cache is not present, Flush commands complete 2377 * successfully and have no effect 2378 * in the description of the Volatile Write Cache (VWC) field of the Identify 2379 * Controller data. Therefore, set status to Success if the command is 2380 * not supported (i.e. RAM or as indicated by the blockif). 2381 */ 2382 static bool 2383 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2384 struct nvme_command *cmd __unused, 2385 struct pci_nvme_blockstore *nvstore, 2386 struct pci_nvme_ioreq *req, 2387 uint16_t *status) 2388 { 2389 bool pending = false; 2390 2391 if (nvstore->type == NVME_STOR_RAM) { 2392 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2393 } else { 2394 int err; 2395 2396 req->io_req.br_callback = pci_nvme_io_done; 2397 2398 err = blockif_flush(nvstore->ctx, &req->io_req); 2399 switch (err) { 2400 case 0: 2401 pending = true; 2402 break; 2403 case EOPNOTSUPP: 2404 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2405 break; 2406 default: 2407 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2408 } 2409 } 2410 2411 return (pending); 2412 } 2413 2414 static uint16_t 2415 nvme_write_read_ram(struct pci_nvme_softc *sc, 2416 struct pci_nvme_blockstore *nvstore, 2417 uint64_t prp1, uint64_t prp2, 2418 size_t offset, uint64_t bytes, 2419 bool is_write) 2420 { 2421 uint8_t *buf = nvstore->ctx; 2422 enum nvme_copy_dir dir; 2423 uint16_t status; 2424 2425 if (is_write) 2426 dir = NVME_COPY_TO_PRP; 2427 else 2428 dir = NVME_COPY_FROM_PRP; 2429 2430 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2431 buf + offset, bytes, dir)) 2432 pci_nvme_status_genc(&status, 2433 NVME_SC_DATA_TRANSFER_ERROR); 2434 else 2435 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2436 2437 return (status); 2438 } 2439 2440 static uint16_t 2441 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2442 struct pci_nvme_blockstore *nvstore, 2443 struct pci_nvme_ioreq *req, 2444 uint64_t prp1, uint64_t prp2, 2445 size_t offset, uint64_t bytes, 2446 bool is_write) 2447 { 2448 uint64_t size; 2449 int err; 2450 uint16_t status = NVME_NO_STATUS; 2451 2452 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2453 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2454 err = -1; 2455 goto out; 2456 } 2457 2458 offset += size; 2459 bytes -= size; 2460 2461 if (bytes == 0) { 2462 ; 2463 } else if (bytes <= PAGE_SIZE) { 2464 size = bytes; 2465 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2466 err = -1; 2467 goto out; 2468 } 2469 } else { 2470 void *vmctx = sc->nsc_pi->pi_vmctx; 2471 uint64_t *prp_list = &prp2; 2472 uint64_t *last = prp_list; 2473 2474 /* PRP2 is pointer to a physical region page list */ 2475 while (bytes) { 2476 /* Last entry in list points to the next list */ 2477 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2478 uint64_t prp = *prp_list; 2479 2480 prp_list = paddr_guest2host(vmctx, prp, 2481 PAGE_SIZE - (prp % PAGE_SIZE)); 2482 if (prp_list == NULL) { 2483 err = -1; 2484 goto out; 2485 } 2486 last = prp_list + (NVME_PRP2_ITEMS - 1); 2487 } 2488 2489 size = MIN(bytes, PAGE_SIZE); 2490 2491 if (pci_nvme_append_iov_req(sc, req, *prp_list, size, 2492 offset)) { 2493 err = 
-1; 2494 goto out; 2495 } 2496 2497 offset += size; 2498 bytes -= size; 2499 2500 prp_list++; 2501 } 2502 } 2503 req->io_req.br_callback = pci_nvme_io_done; 2504 if (is_write) 2505 err = blockif_write(nvstore->ctx, &req->io_req); 2506 else 2507 err = blockif_read(nvstore->ctx, &req->io_req); 2508 out: 2509 if (err) 2510 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2511 2512 return (status); 2513 } 2514 2515 static bool 2516 nvme_opc_write_read(struct pci_nvme_softc *sc, 2517 struct nvme_command *cmd, 2518 struct pci_nvme_blockstore *nvstore, 2519 struct pci_nvme_ioreq *req, 2520 uint16_t *status) 2521 { 2522 uint64_t lba, nblocks, bytes; 2523 size_t offset; 2524 bool is_write = cmd->opc == NVME_OPC_WRITE; 2525 bool pending = false; 2526 2527 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2528 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2529 bytes = nblocks << nvstore->sectsz_bits; 2530 if (bytes > NVME_MAX_DATA_SIZE) { 2531 WPRINTF("%s command would exceed MDTS", __func__); 2532 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2533 goto out; 2534 } 2535 2536 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2537 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2538 __func__, lba, nblocks); 2539 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2540 goto out; 2541 } 2542 2543 offset = lba << nvstore->sectsz_bits; 2544 2545 req->bytes = bytes; 2546 req->io_req.br_offset = lba; 2547 2548 /* PRP bits 1:0 must be zero */ 2549 cmd->prp1 &= ~0x3UL; 2550 cmd->prp2 &= ~0x3UL; 2551 2552 if (nvstore->type == NVME_STOR_RAM) { 2553 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2554 cmd->prp2, offset, bytes, is_write); 2555 } else { 2556 *status = nvme_write_read_blockif(sc, nvstore, req, 2557 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2558 2559 if (*status == NVME_NO_STATUS) 2560 pending = true; 2561 } 2562 out: 2563 if (!pending) 2564 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2565 2566 return (pending); 2567 } 2568 2569 static void 2570 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2571 { 2572 struct pci_nvme_ioreq *req = br->br_param; 2573 struct pci_nvme_softc *sc = req->sc; 2574 bool done = true; 2575 uint16_t status; 2576 2577 if (err) { 2578 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2579 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2580 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2581 } else { 2582 struct iovec *iov = req->io_req.br_iov; 2583 2584 req->prev_gpaddr++; 2585 iov += req->prev_gpaddr; 2586 2587 /* The iov_* values already include the sector size */ 2588 req->io_req.br_offset = (off_t)iov->iov_base; 2589 req->io_req.br_resid = iov->iov_len; 2590 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2591 pci_nvme_status_genc(&status, 2592 NVME_SC_INTERNAL_DEVICE_ERROR); 2593 } else 2594 done = false; 2595 } 2596 2597 if (done) { 2598 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2599 status); 2600 pci_nvme_release_ioreq(sc, req); 2601 } 2602 } 2603 2604 static bool 2605 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2606 struct nvme_command *cmd, 2607 struct pci_nvme_blockstore *nvstore, 2608 struct pci_nvme_ioreq *req, 2609 uint16_t *status) 2610 { 2611 struct nvme_dsm_range *range = NULL; 2612 uint32_t nr, r, non_zero, dr; 2613 int err; 2614 bool pending = false; 2615 2616 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2617 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2618 goto out; 2619 } 2620 2621 nr = cmd->cdw10 & 0xff; 2622 2623 /* 
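The Number of Ranges (NR) field in cdw10 is zero-based, so nr + 1 range entries follow;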
copy locally because a range entry could straddle PRPs */ 2624 range = calloc(1, NVME_MAX_DSM_TRIM); 2625 if (range == NULL) { 2626 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2627 goto out; 2628 } 2629 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2630 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2631 2632 /* Check for invalid ranges and the number of non-zero lengths */ 2633 non_zero = 0; 2634 for (r = 0; r <= nr; r++) { 2635 if (pci_nvme_out_of_range(nvstore, 2636 range[r].starting_lba, range[r].length)) { 2637 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2638 goto out; 2639 } 2640 if (range[r].length != 0) 2641 non_zero++; 2642 } 2643 2644 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2645 size_t offset, bytes; 2646 int sectsz_bits = sc->nvstore.sectsz_bits; 2647 2648 /* 2649 * DSM calls are advisory only, and compliant controllers 2650 * may choose to take no actions (i.e. return Success). 2651 */ 2652 if (!nvstore->deallocate) { 2653 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2654 goto out; 2655 } 2656 2657 /* If all ranges have a zero length, return Success */ 2658 if (non_zero == 0) { 2659 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2660 goto out; 2661 } 2662 2663 if (req == NULL) { 2664 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2665 goto out; 2666 } 2667 2668 offset = range[0].starting_lba << sectsz_bits; 2669 bytes = range[0].length << sectsz_bits; 2670 2671 /* 2672 * If the request is for more than a single range, store 2673 * the ranges in the br_iov. Optimize for the common case 2674 * of a single range. 2675 * 2676 * Note that NVMe Number of Ranges is a zero based value 2677 */ 2678 req->io_req.br_iovcnt = 0; 2679 req->io_req.br_offset = offset; 2680 req->io_req.br_resid = bytes; 2681 2682 if (nr == 0) { 2683 req->io_req.br_callback = pci_nvme_io_done; 2684 } else { 2685 struct iovec *iov = req->io_req.br_iov; 2686 2687 for (r = 0, dr = 0; r <= nr; r++) { 2688 offset = range[r].starting_lba << sectsz_bits; 2689 bytes = range[r].length << sectsz_bits; 2690 if (bytes == 0) 2691 continue; 2692 2693 if ((nvstore->size - offset) < bytes) { 2694 pci_nvme_status_genc(status, 2695 NVME_SC_LBA_OUT_OF_RANGE); 2696 goto out; 2697 } 2698 iov[dr].iov_base = (void *)offset; 2699 iov[dr].iov_len = bytes; 2700 dr++; 2701 } 2702 req->io_req.br_callback = pci_nvme_dealloc_sm; 2703 2704 /* 2705 * Use prev_gpaddr to track the current entry and 2706 * prev_size to track the number of entries 2707 */ 2708 req->prev_gpaddr = 0; 2709 req->prev_size = dr; 2710 } 2711 2712 err = blockif_delete(nvstore->ctx, &req->io_req); 2713 if (err) 2714 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2715 else 2716 pending = true; 2717 } 2718 out: 2719 free(range); 2720 return (pending); 2721 } 2722 2723 static void 2724 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2725 { 2726 struct nvme_submission_queue *sq; 2727 uint16_t status; 2728 uint16_t sqhead; 2729 2730 /* handle all submissions up to sq->tail index */ 2731 sq = &sc->submit_queues[idx]; 2732 2733 pthread_mutex_lock(&sq->mtx); 2734 2735 sqhead = sq->head; 2736 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2737 idx, sqhead, sq->tail, sq->qbase); 2738 2739 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2740 struct nvme_command *cmd; 2741 struct pci_nvme_ioreq *req; 2742 uint32_t nsid; 2743 bool pending; 2744 2745 pending = false; 2746 req = NULL; 2747 status = 0; 2748 2749 cmd = &sq->qbase[sqhead]; 2750 sqhead = (sqhead + 1) % 
sq->size; 2751 2752 nsid = le32toh(cmd->nsid); 2753 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2754 pci_nvme_status_genc(&status, 2755 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2756 status |= 2757 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2758 goto complete; 2759 } 2760 2761 req = pci_nvme_get_ioreq(sc); 2762 if (req == NULL) { 2763 pci_nvme_status_genc(&status, 2764 NVME_SC_INTERNAL_DEVICE_ERROR); 2765 WPRINTF("%s: unable to allocate IO req", __func__); 2766 goto complete; 2767 } 2768 req->nvme_sq = sq; 2769 req->sqid = idx; 2770 req->opc = cmd->opc; 2771 req->cid = cmd->cid; 2772 req->nsid = cmd->nsid; 2773 2774 switch (cmd->opc) { 2775 case NVME_OPC_FLUSH: 2776 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2777 req, &status); 2778 break; 2779 case NVME_OPC_WRITE: 2780 case NVME_OPC_READ: 2781 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2782 req, &status); 2783 break; 2784 case NVME_OPC_WRITE_ZEROES: 2785 /* TODO: write zeroes 2786 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2787 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2788 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2789 break; 2790 case NVME_OPC_DATASET_MANAGEMENT: 2791 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2792 req, &status); 2793 break; 2794 default: 2795 WPRINTF("%s unhandled io command 0x%x", 2796 __func__, cmd->opc); 2797 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2798 } 2799 complete: 2800 if (!pending) { 2801 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2802 if (req != NULL) 2803 pci_nvme_release_ioreq(sc, req); 2804 } 2805 } 2806 2807 sq->head = sqhead; 2808 2809 pthread_mutex_unlock(&sq->mtx); 2810 } 2811 2812 static void 2813 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc, 2814 uint64_t idx, int is_sq, uint64_t value) 2815 { 2816 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2817 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2818 2819 if (is_sq) { 2820 if (idx > sc->num_squeues) { 2821 WPRINTF("%s queue index %lu overflow from " 2822 "guest (max %u)", 2823 __func__, idx, sc->num_squeues); 2824 return; 2825 } 2826 2827 atomic_store_short(&sc->submit_queues[idx].tail, 2828 (uint16_t)value); 2829 2830 if (idx == 0) { 2831 pci_nvme_handle_admin_cmd(sc, value); 2832 } else { 2833 /* submission queue; handle new entries in SQ */ 2834 if (idx > sc->num_squeues) { 2835 WPRINTF("%s SQ index %lu overflow from " 2836 "guest (max %u)", 2837 __func__, idx, sc->num_squeues); 2838 return; 2839 } 2840 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2841 } 2842 } else { 2843 if (idx > sc->num_cqueues) { 2844 WPRINTF("%s queue index %lu overflow from " 2845 "guest (max %u)", 2846 __func__, idx, sc->num_cqueues); 2847 return; 2848 } 2849 2850 atomic_store_short(&sc->compl_queues[idx].head, 2851 (uint16_t)value); 2852 } 2853 } 2854 2855 static void 2856 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2857 { 2858 const char *s = iswrite ? 
"WRITE" : "READ"; 2859 2860 switch (offset) { 2861 case NVME_CR_CAP_LOW: 2862 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2863 break; 2864 case NVME_CR_CAP_HI: 2865 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2866 break; 2867 case NVME_CR_VS: 2868 DPRINTF("%s %s NVME_CR_VS", func, s); 2869 break; 2870 case NVME_CR_INTMS: 2871 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2872 break; 2873 case NVME_CR_INTMC: 2874 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2875 break; 2876 case NVME_CR_CC: 2877 DPRINTF("%s %s NVME_CR_CC", func, s); 2878 break; 2879 case NVME_CR_CSTS: 2880 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2881 break; 2882 case NVME_CR_NSSR: 2883 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2884 break; 2885 case NVME_CR_AQA: 2886 DPRINTF("%s %s NVME_CR_AQA", func, s); 2887 break; 2888 case NVME_CR_ASQ_LOW: 2889 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2890 break; 2891 case NVME_CR_ASQ_HI: 2892 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2893 break; 2894 case NVME_CR_ACQ_LOW: 2895 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2896 break; 2897 case NVME_CR_ACQ_HI: 2898 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2899 break; 2900 default: 2901 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2902 } 2903 2904 } 2905 2906 static void 2907 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2908 uint64_t offset, int size, uint64_t value) 2909 { 2910 uint32_t ccreg; 2911 2912 if (offset >= NVME_DOORBELL_OFFSET) { 2913 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2914 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2915 int is_sq = (belloffset % 8) < 4; 2916 2917 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2918 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2919 offset); 2920 return; 2921 } 2922 2923 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2924 WPRINTF("guest attempted an overflow write offset " 2925 "0x%lx, val 0x%lx in %s", 2926 offset, value, __func__); 2927 return; 2928 } 2929 2930 if (is_sq) { 2931 if (sc->submit_queues[idx].qbase == NULL) 2932 return; 2933 } else if (sc->compl_queues[idx].qbase == NULL) 2934 return; 2935 2936 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2937 return; 2938 } 2939 2940 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2941 offset, size, value); 2942 2943 if (size != 4) { 2944 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2945 "val 0x%lx) to bar0 in %s", 2946 size, offset, value, __func__); 2947 /* TODO: shutdown device */ 2948 return; 2949 } 2950 2951 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2952 2953 pthread_mutex_lock(&sc->mtx); 2954 2955 switch (offset) { 2956 case NVME_CR_CAP_LOW: 2957 case NVME_CR_CAP_HI: 2958 /* readonly */ 2959 break; 2960 case NVME_CR_VS: 2961 /* readonly */ 2962 break; 2963 case NVME_CR_INTMS: 2964 /* MSI-X, so ignore */ 2965 break; 2966 case NVME_CR_INTMC: 2967 /* MSI-X, so ignore */ 2968 break; 2969 case NVME_CR_CC: 2970 ccreg = (uint32_t)value; 2971 2972 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2973 "iocqes %u", 2974 __func__, 2975 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2976 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2977 NVME_CC_GET_IOCQES(ccreg)); 2978 2979 if (NVME_CC_GET_SHN(ccreg)) { 2980 /* perform shutdown - flush out data to backend */ 2981 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2982 NVME_CSTS_REG_SHST_SHIFT); 2983 sc->regs.csts |= NVME_SHST_COMPLETE << 2984 NVME_CSTS_REG_SHST_SHIFT; 2985 } 2986 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2987 if (NVME_CC_GET_EN(ccreg) == 0) 2988 /* transition 1-> causes 
controller reset */ 2989 pci_nvme_reset_locked(sc); 2990 else 2991 pci_nvme_init_controller(ctx, sc); 2992 } 2993 2994 /* Insert the iocqes, iosqes and en bits from the write */ 2995 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2996 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2997 if (NVME_CC_GET_EN(ccreg) == 0) { 2998 /* Insert the ams, mps and css bit fields */ 2999 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3000 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3001 sc->regs.csts &= ~NVME_CSTS_RDY; 3002 } else if ((sc->pending_ios == 0) && 3003 !(sc->regs.csts & NVME_CSTS_CFS)) { 3004 sc->regs.csts |= NVME_CSTS_RDY; 3005 } 3006 break; 3007 case NVME_CR_CSTS: 3008 break; 3009 case NVME_CR_NSSR: 3010 /* ignore writes; don't support subsystem reset */ 3011 break; 3012 case NVME_CR_AQA: 3013 sc->regs.aqa = (uint32_t)value; 3014 break; 3015 case NVME_CR_ASQ_LOW: 3016 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3017 (0xFFFFF000 & value); 3018 break; 3019 case NVME_CR_ASQ_HI: 3020 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3021 (value << 32); 3022 break; 3023 case NVME_CR_ACQ_LOW: 3024 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3025 (0xFFFFF000 & value); 3026 break; 3027 case NVME_CR_ACQ_HI: 3028 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3029 (value << 32); 3030 break; 3031 default: 3032 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3033 __func__, offset, value, size); 3034 } 3035 pthread_mutex_unlock(&sc->mtx); 3036 } 3037 3038 static void 3039 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi, 3040 int baridx, uint64_t offset, int size, uint64_t value) 3041 { 3042 struct pci_nvme_softc* sc = pi->pi_arg; 3043 3044 if (baridx == pci_msix_table_bar(pi) || 3045 baridx == pci_msix_pba_bar(pi)) { 3046 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3047 " value 0x%lx", baridx, offset, size, value); 3048 3049 pci_emul_msix_twrite(pi, offset, size, value); 3050 return; 3051 } 3052 3053 switch (baridx) { 3054 case 0: 3055 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3056 break; 3057 3058 default: 3059 DPRINTF("%s unknown baridx %d, val 0x%lx", 3060 __func__, baridx, value); 3061 } 3062 } 3063 3064 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3065 uint64_t offset, int size) 3066 { 3067 uint64_t value; 3068 3069 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3070 3071 if (offset < NVME_DOORBELL_OFFSET) { 3072 void *p = &(sc->regs); 3073 pthread_mutex_lock(&sc->mtx); 3074 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3075 pthread_mutex_unlock(&sc->mtx); 3076 } else { 3077 value = 0; 3078 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3079 } 3080 3081 switch (size) { 3082 case 1: 3083 value &= 0xFF; 3084 break; 3085 case 2: 3086 value &= 0xFFFF; 3087 break; 3088 case 4: 3089 value &= 0xFFFFFFFF; 3090 break; 3091 } 3092 3093 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3094 offset, size, (uint32_t)value); 3095 3096 return (value); 3097 } 3098 3099 3100 3101 static uint64_t 3102 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused, 3103 struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3104 { 3105 struct pci_nvme_softc* sc = pi->pi_arg; 3106 3107 if (baridx == pci_msix_table_bar(pi) || 3108 baridx == pci_msix_pba_bar(pi)) { 3109 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3110 baridx, offset, size); 3111 3112 return pci_emul_msix_tread(pi, offset, size); 3113 } 3114 3115 switch (baridx) { 3116 case 0: 3117 return pci_nvme_read_bar_0(sc, 
offset, size); 3118 3119 default: 3120 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3121 } 3122 3123 return (0); 3124 } 3125 3126 static int 3127 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3128 { 3129 char bident[sizeof("XX:X:X")]; 3130 const char *value; 3131 uint32_t sectsz; 3132 3133 sc->max_queues = NVME_QUEUES; 3134 sc->max_qentries = NVME_MAX_QENTRIES; 3135 sc->ioslots = NVME_IOSLOTS; 3136 sc->num_squeues = sc->max_queues; 3137 sc->num_cqueues = sc->max_queues; 3138 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3139 sectsz = 0; 3140 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3141 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3142 3143 value = get_config_value_node(nvl, "maxq"); 3144 if (value != NULL) 3145 sc->max_queues = atoi(value); 3146 value = get_config_value_node(nvl, "qsz"); 3147 if (value != NULL) { 3148 sc->max_qentries = atoi(value); 3149 if (sc->max_qentries <= 0) { 3150 EPRINTLN("nvme: Invalid qsz option %d", 3151 sc->max_qentries); 3152 return (-1); 3153 } 3154 } 3155 value = get_config_value_node(nvl, "ioslots"); 3156 if (value != NULL) { 3157 sc->ioslots = atoi(value); 3158 if (sc->ioslots <= 0) { 3159 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3160 return (-1); 3161 } 3162 } 3163 value = get_config_value_node(nvl, "sectsz"); 3164 if (value != NULL) 3165 sectsz = atoi(value); 3166 value = get_config_value_node(nvl, "ser"); 3167 if (value != NULL) { 3168 /* 3169 * This field indicates the Product Serial Number in 3170 * 7-bit ASCII, unused bytes should be space characters. 3171 * Ref: NVMe v1.3c. 3172 */ 3173 cpywithpad((char *)sc->ctrldata.sn, 3174 sizeof(sc->ctrldata.sn), value, ' '); 3175 } 3176 value = get_config_value_node(nvl, "eui64"); 3177 if (value != NULL) 3178 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3179 value = get_config_value_node(nvl, "dsm"); 3180 if (value != NULL) { 3181 if (strcmp(value, "auto") == 0) 3182 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3183 else if (strcmp(value, "enable") == 0) 3184 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3185 else if (strcmp(value, "disable") == 0) 3186 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3187 } 3188 3189 value = get_config_value_node(nvl, "ram"); 3190 if (value != NULL) { 3191 uint64_t sz = strtoull(value, NULL, 10); 3192 3193 sc->nvstore.type = NVME_STOR_RAM; 3194 sc->nvstore.size = sz * 1024 * 1024; 3195 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3196 sc->nvstore.sectsz = 4096; 3197 sc->nvstore.sectsz_bits = 12; 3198 if (sc->nvstore.ctx == NULL) { 3199 EPRINTLN("nvme: Unable to allocate RAM"); 3200 return (-1); 3201 } 3202 } else { 3203 snprintf(bident, sizeof(bident), "%d:%d", 3204 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3205 sc->nvstore.ctx = blockif_open(nvl, bident); 3206 if (sc->nvstore.ctx == NULL) { 3207 EPRINTLN("nvme: Could not open backing file: %s", 3208 strerror(errno)); 3209 return (-1); 3210 } 3211 sc->nvstore.type = NVME_STOR_BLOCKIF; 3212 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3213 } 3214 3215 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3216 sc->nvstore.sectsz = sectsz; 3217 else if (sc->nvstore.type != NVME_STOR_RAM) 3218 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3219 for (sc->nvstore.sectsz_bits = 9; 3220 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3221 sc->nvstore.sectsz_bits++); 3222 3223 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3224 sc->max_queues = NVME_QUEUES; 3225 3226 return (0); 3227 } 3228 3229 
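/*
 * Illustrative sketch (not compiled in): how the sector-size shift computed
 * at the end of pci_nvme_parse_config() relates to the configured sector
 * size. The helper name and the example value are assumptions for
 * illustration only; the loop mirrors the one above.
 */
#if 0
static uint32_t
example_sectsz_bits(uint32_t sectsz)
{
	uint32_t bits;

	for (bits = 9; (1u << bits) < sectsz; bits++)
		;
	return (bits);	/* e.g. example_sectsz_bits(4096) == 12, so a byte offset is lba << 12 */
}
#endif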
static void 3230 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3231 size_t new_size) 3232 { 3233 struct pci_nvme_softc *sc; 3234 struct pci_nvme_blockstore *nvstore; 3235 struct nvme_namespace_data *nd; 3236 3237 sc = arg; 3238 nvstore = &sc->nvstore; 3239 nd = &sc->nsdata; 3240 3241 nvstore->size = new_size; 3242 pci_nvme_init_nsdata_size(nvstore, nd); 3243 3244 /* Add changed NSID to list */ 3245 sc->ns_log.ns[0] = 1; 3246 sc->ns_log.ns[1] = 0; 3247 3248 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3249 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3250 } 3251 3252 static int 3253 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) 3254 { 3255 struct pci_nvme_softc *sc; 3256 uint32_t pci_membar_sz; 3257 int error; 3258 3259 error = 0; 3260 3261 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3262 pi->pi_arg = sc; 3263 sc->nsc_pi = pi; 3264 3265 error = pci_nvme_parse_config(sc, nvl); 3266 if (error < 0) 3267 goto done; 3268 else 3269 error = 0; 3270 3271 STAILQ_INIT(&sc->ioreqs_free); 3272 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3273 for (uint32_t i = 0; i < sc->ioslots; i++) { 3274 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3275 } 3276 3277 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3278 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3279 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3280 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3281 pci_set_cfgdata8(pi, PCIR_PROGIF, 3282 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3283 3284 /* 3285 * Allocate size of NVMe registers + doorbell space for all queues. 3286 * 3287 * The specification requires a minimum memory I/O window size of 16K. 3288 * The Windows driver will refuse to start a device with a smaller 3289 * window. 3290 */ 3291 pci_membar_sz = sizeof(struct nvme_registers) + 3292 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3293 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3294 3295 DPRINTF("nvme membar size: %u", pci_membar_sz); 3296 3297 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3298 if (error) { 3299 WPRINTF("%s pci alloc mem bar failed", __func__); 3300 goto done; 3301 } 3302 3303 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3304 if (error) { 3305 WPRINTF("%s pci add msixcap failed", __func__); 3306 goto done; 3307 } 3308 3309 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3310 if (error) { 3311 WPRINTF("%s pci add Express capability failed", __func__); 3312 goto done; 3313 } 3314 3315 pthread_mutex_init(&sc->mtx, NULL); 3316 sem_init(&sc->iosemlock, 0, sc->ioslots); 3317 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3318 3319 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3320 /* 3321 * Controller data depends on Namespace data so initialize Namespace 3322 * data first. 
3323 */ 3324 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3325 pci_nvme_init_ctrldata(sc); 3326 pci_nvme_init_logpages(sc); 3327 pci_nvme_init_features(sc); 3328 3329 pci_nvme_aer_init(sc); 3330 pci_nvme_aen_init(sc); 3331 3332 pci_nvme_reset(sc); 3333 3334 pci_lintr_request(pi); 3335 3336 done: 3337 return (error); 3338 } 3339 3340 static int 3341 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3342 { 3343 char *cp, *ram; 3344 3345 if (opts == NULL) 3346 return (0); 3347 3348 if (strncmp(opts, "ram=", 4) == 0) { 3349 cp = strchr(opts, ','); 3350 if (cp == NULL) { 3351 set_config_value_node(nvl, "ram", opts + 4); 3352 return (0); 3353 } 3354 ram = strndup(opts + 4, cp - opts - 4); 3355 set_config_value_node(nvl, "ram", ram); 3356 free(ram); 3357 return (pci_parse_legacy_config(nvl, cp + 1)); 3358 } else 3359 return (blockif_legacy_config(nvl, opts)); 3360 } 3361 3362 static const struct pci_devemu pci_de_nvme = { 3363 .pe_emu = "nvme", 3364 .pe_init = pci_nvme_init, 3365 .pe_legacy_config = pci_nvme_legacy_config, 3366 .pe_barwrite = pci_nvme_write, 3367 .pe_barread = pci_nvme_read 3368 }; 3369 PCI_EMUL_SET(pci_de_nvme); 3370
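/*
 * Illustrative sketch (not compiled in): how pci_nvme_write_bar_0() decodes
 * a BAR0 doorbell write into a queue index and queue type. Each queue pair
 * uses 8 bytes of doorbell space: the submission queue tail doorbell in the
 * low dword and the completion queue head doorbell in the high dword. This
 * layout is also why pci_nvme_init() sizes BAR0 as sizeof(struct
 * nvme_registers) plus 2 * sizeof(uint32_t) * (max_queues + 1). The function
 * and parameter names below are assumptions for illustration only.
 */
#if 0
static void
example_decode_doorbell(uint64_t offset, uint64_t *idx, int *is_sq)
{
	uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;

	*idx = belloffset / 8;		/* queue pair index */
	*is_sq = (belloffset % 8) < 4;	/* SQ tail doorbell if low dword */

	/*
	 * e.g. NVME_DOORBELL_OFFSET + 0 -> admin SQ tail (idx 0, is_sq)
	 *      NVME_DOORBELL_OFFSET + 4 -> admin CQ head (idx 0, !is_sq)
	 *      NVME_DOORBELL_OFFSET + 8 -> I/O SQ 1 tail (idx 1, is_sq)
	 */
}
#endif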