1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* Reported temperature in Kelvin (i.e. room temperature) */ 123 #define NVME_TEMPERATURE 296 124 125 /* helpers */ 126 127 /* Convert a zero-based value into a one-based value */ 128 #define ONE_BASED(zero) ((zero) + 1) 129 /* Convert a one-based value into a zero-based value */ 130 #define ZERO_BASED(one) ((one) - 1) 131 132 /* Encode number of SQ's and CQ's for Set/Get Features */ 133 #define NVME_FEATURE_NUM_QUEUES(sc) \ 134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 136 137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 138 139 enum nvme_controller_register_offsets { 140 NVME_CR_CAP_LOW = 0x00, 141 NVME_CR_CAP_HI = 0x04, 142 NVME_CR_VS = 0x08, 143 NVME_CR_INTMS = 0x0c, 144 NVME_CR_INTMC = 0x10, 145 NVME_CR_CC = 0x14, 146 NVME_CR_CSTS = 0x1c, 147 NVME_CR_NSSR = 0x20, 148 NVME_CR_AQA = 0x24, 149 NVME_CR_ASQ_LOW = 0x28, 150 NVME_CR_ASQ_HI = 0x2c, 151 NVME_CR_ACQ_LOW = 0x30, 152 NVME_CR_ACQ_HI = 0x34, 153 }; 154 155 enum nvme_cmd_cdw11 { 156 NVME_CMD_CDW11_PC = 0x0001, 157 NVME_CMD_CDW11_IEN = 0x0002, 158 NVME_CMD_CDW11_IV = 0xFFFF0000, 159 }; 160 161 enum nvme_copy_dir { 162 NVME_COPY_TO_PRP, 163 NVME_COPY_FROM_PRP, 164 }; 165 166 #define NVME_CQ_INTEN 0x01 167 #define NVME_CQ_INTCOAL 0x02 168 169 struct nvme_completion_queue { 170 struct nvme_completion *qbase; 171 pthread_mutex_t mtx; 172 uint32_t size; 173 uint16_t tail; /* nvme progress */ 174 uint16_t head; /* guest progress */ 175 uint16_t intr_vec; 176 uint32_t intr_en; 177 }; 178 179 struct nvme_submission_queue { 180 struct nvme_command *qbase; 181 pthread_mutex_t mtx; 182 uint32_t size; 183 uint16_t head; /* nvme progress */ 184 uint16_t tail; /* guest progress */ 185 uint16_t cqid; /* completion queue id */ 186 int qpriority; 187 }; 188 189 enum nvme_storage_type { 190 NVME_STOR_BLOCKIF = 0, 191 NVME_STOR_RAM = 1, 192 }; 193 194 struct pci_nvme_blockstore { 195 enum nvme_storage_type type; 196 void *ctx; 197 uint64_t size; 198 uint32_t sectsz; 199 uint32_t sectsz_bits; 200 uint64_t eui64; 201 uint32_t deallocate:1; 202 }; 203 204 /* 205 * Calculate the number of additional page descriptors for guest IO requests 206 * based on the advertised Max Data Transfer (MDTS) and given the number of 207 * default iovec's in a struct blockif_req. 208 */ 209 #define MDTS_PAD_SIZE \ 210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 212 0 ) 213 214 struct pci_nvme_ioreq { 215 struct pci_nvme_softc *sc; 216 STAILQ_ENTRY(pci_nvme_ioreq) link; 217 struct nvme_submission_queue *nvme_sq; 218 uint16_t sqid; 219 220 /* command information */ 221 uint16_t opc; 222 uint16_t cid; 223 uint32_t nsid; 224 225 uint64_t prev_gpaddr; 226 size_t prev_size; 227 size_t bytes; 228 229 struct blockif_req io_req; 230 231 struct iovec iovpadding[MDTS_PAD_SIZE]; 232 }; 233 234 enum nvme_dsm_type { 235 /* Dataset Management bit in ONCS reflects backing storage capability */ 236 NVME_DATASET_MANAGEMENT_AUTO, 237 /* Unconditionally set Dataset Management bit in ONCS */ 238 NVME_DATASET_MANAGEMENT_ENABLE, 239 /* Unconditionally clear Dataset Management bit in ONCS */ 240 NVME_DATASET_MANAGEMENT_DISABLE, 241 }; 242 243 struct pci_nvme_softc; 244 struct nvme_feature_obj; 245 246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 247 struct nvme_feature_obj *, 248 struct nvme_command *, 249 struct nvme_completion *); 250 251 struct nvme_feature_obj { 252 uint32_t cdw11; 253 nvme_feature_cb set; 254 nvme_feature_cb get; 255 bool namespace_specific; 256 }; 257 258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 259 260 typedef enum { 261 PCI_NVME_AE_TYPE_ERROR = 0, 262 PCI_NVME_AE_TYPE_SMART, 263 PCI_NVME_AE_TYPE_NOTICE, 264 PCI_NVME_AE_TYPE_IO_CMD = 6, 265 PCI_NVME_AE_TYPE_VENDOR = 7, 266 PCI_NVME_AE_TYPE_MAX /* Must be last */ 267 } pci_nvme_async_type; 268 269 /* Asynchronous Event Requests */ 270 struct pci_nvme_aer { 271 STAILQ_ENTRY(pci_nvme_aer) link; 272 uint16_t cid; /* Command ID of the submitted AER */ 273 }; 274 275 /** Asynchronous Event Information - Notice */ 276 typedef enum { 277 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, 278 PCI_NVME_AEI_NOTICE_FW_ACTIVATION, 279 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, 280 PCI_NVME_AEI_NOTICE_ANA_CHANGE, 281 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, 282 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, 283 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, 284 PCI_NVME_AEI_NOTICE_MAX, 285 } pci_nvme_async_event_info_notice; 286 287 #define PCI_NVME_AEI_NOTICE_SHIFT 8 288 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) 289 290 /* Asynchronous Event Notifications */ 291 struct pci_nvme_aen { 292 pci_nvme_async_type atype; 293 uint32_t event_data; 294 bool posted; 295 }; 296 297 /* 298 * By default, enable all Asynchrnous Event Notifications: 299 * SMART / Health Critical Warnings 300 * Namespace Attribute Notices 301 */ 302 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 303 304 typedef enum { 305 NVME_CNTRLTYPE_IO = 1, 306 NVME_CNTRLTYPE_DISCOVERY = 2, 307 NVME_CNTRLTYPE_ADMIN = 3, 308 } pci_nvme_cntrl_type; 309 310 struct pci_nvme_softc { 311 struct pci_devinst *nsc_pi; 312 313 pthread_mutex_t mtx; 314 315 struct nvme_registers regs; 316 317 struct nvme_namespace_data nsdata; 318 struct nvme_controller_data ctrldata; 319 struct nvme_error_information_entry err_log; 320 struct nvme_health_information_page health_log; 321 struct nvme_firmware_page fw_log; 322 struct nvme_ns_list ns_log; 323 324 struct pci_nvme_blockstore nvstore; 325 326 uint16_t max_qentries; /* max entries per queue */ 327 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 328 uint32_t num_cqueues; 329 uint32_t num_squeues; 330 bool num_q_is_set; /* Has host set Number of Queues */ 331 332 struct pci_nvme_ioreq *ioreqs; 333 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 334 uint32_t pending_ios; 335 uint32_t ioslots; 336 sem_t iosemlock; 337 338 /* 339 * Memory mapped Submission and Completion queues 340 * Each array includes both Admin and IO queues 341 */ 342 struct nvme_completion_queue *compl_queues; 343 struct nvme_submission_queue *submit_queues; 344 345 struct nvme_feature_obj feat[NVME_FID_MAX]; 346 347 enum nvme_dsm_type dataset_management; 348 349 /* Accounting for SMART data */ 350 __uint128_t read_data_units; 351 __uint128_t write_data_units; 352 __uint128_t read_commands; 353 __uint128_t write_commands; 354 uint32_t read_dunits_remainder; 355 uint32_t write_dunits_remainder; 356 357 STAILQ_HEAD(, pci_nvme_aer) aer_list; 358 pthread_mutex_t aer_mtx; 359 uint32_t aer_count; 360 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 361 pthread_t aen_tid; 362 pthread_mutex_t aen_mtx; 363 pthread_cond_t aen_cond; 364 }; 365 366 367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 368 struct nvme_completion_queue *cq, 369 uint32_t cdw0, 370 uint16_t cid, 371 uint16_t sqid, 372 uint16_t status); 373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 375 static void pci_nvme_io_done(struct blockif_req *, int); 376 377 /* Controller Configuration utils */ 378 #define NVME_CC_GET_EN(cc) \ 379 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 380 #define NVME_CC_GET_CSS(cc) \ 381 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 382 #define NVME_CC_GET_SHN(cc) \ 383 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 384 #define NVME_CC_GET_IOSQES(cc) \ 385 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 386 #define NVME_CC_GET_IOCQES(cc) \ 387 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 388 389 #define NVME_CC_WRITE_MASK \ 390 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 391 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 392 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 393 394 #define NVME_CC_NEN_WRITE_MASK \ 395 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 396 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 397 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 398 399 /* Controller Status utils */ 400 #define NVME_CSTS_GET_RDY(sts) \ 401 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 402 403 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 404 405 /* Completion Queue status word utils */ 406 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 407 #define NVME_STATUS_MASK \ 408 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 409 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 410 411 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 412 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 413 414 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 415 struct nvme_feature_obj *, 416 struct nvme_command *, 417 struct nvme_completion *); 418 static void nvme_feature_temperature(struct pci_nvme_softc *, 419 struct nvme_feature_obj *, 420 struct nvme_command *, 421 struct nvme_completion *); 422 static void nvme_feature_num_queues(struct pci_nvme_softc *, 423 struct nvme_feature_obj *, 424 struct nvme_command *, 425 struct nvme_completion *); 426 static void nvme_feature_iv_config(struct pci_nvme_softc *, 427 struct nvme_feature_obj *, 428 struct nvme_command *, 429 struct nvme_completion *); 430 static void nvme_feature_async_event(struct pci_nvme_softc *, 431 struct nvme_feature_obj *, 432 struct nvme_command *, 433 struct nvme_completion *); 434 435 static void *aen_thr(void *arg); 436 437 static __inline void 438 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 439 { 440 size_t len; 441 442 len = strnlen(src, dst_size); 443 memset(dst, pad, dst_size); 444 memcpy(dst, src, len); 445 } 446 447 static __inline void 448 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 449 { 450 451 *status &= ~NVME_STATUS_MASK; 452 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 453 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 454 } 455 456 static __inline void 457 pci_nvme_status_genc(uint16_t *status, uint16_t code) 458 { 459 460 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 461 } 462 463 /* 464 * Initialize the requested number or IO Submission and Completion Queues. 465 * Admin queues are allocated implicitly. 466 */ 467 static void 468 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 469 { 470 uint32_t i; 471 472 /* 473 * Allocate and initialize the Submission Queues 474 */ 475 if (nsq > NVME_QUEUES) { 476 WPRINTF("%s: clamping number of SQ from %u to %u", 477 __func__, nsq, NVME_QUEUES); 478 nsq = NVME_QUEUES; 479 } 480 481 sc->num_squeues = nsq; 482 483 sc->submit_queues = calloc(sc->num_squeues + 1, 484 sizeof(struct nvme_submission_queue)); 485 if (sc->submit_queues == NULL) { 486 WPRINTF("%s: SQ allocation failed", __func__); 487 sc->num_squeues = 0; 488 } else { 489 struct nvme_submission_queue *sq = sc->submit_queues; 490 491 for (i = 0; i < sc->num_squeues + 1; i++) 492 pthread_mutex_init(&sq[i].mtx, NULL); 493 } 494 495 /* 496 * Allocate and initialize the Completion Queues 497 */ 498 if (ncq > NVME_QUEUES) { 499 WPRINTF("%s: clamping number of CQ from %u to %u", 500 __func__, ncq, NVME_QUEUES); 501 ncq = NVME_QUEUES; 502 } 503 504 sc->num_cqueues = ncq; 505 506 sc->compl_queues = calloc(sc->num_cqueues + 1, 507 sizeof(struct nvme_completion_queue)); 508 if (sc->compl_queues == NULL) { 509 WPRINTF("%s: CQ allocation failed", __func__); 510 sc->num_cqueues = 0; 511 } else { 512 struct nvme_completion_queue *cq = sc->compl_queues; 513 514 for (i = 0; i < sc->num_cqueues + 1; i++) 515 pthread_mutex_init(&cq[i].mtx, NULL); 516 } 517 } 518 519 static void 520 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 521 { 522 struct nvme_controller_data *cd = &sc->ctrldata; 523 524 cd->vid = 0xFB5D; 525 cd->ssvid = 0x0000; 526 527 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 528 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 529 530 /* Num of submission commands that we can handle at a time (2^rab) */ 531 cd->rab = 4; 532 533 /* FreeBSD OUI */ 534 cd->ieee[0] = 0x58; 535 cd->ieee[1] = 0x9c; 536 cd->ieee[2] = 0xfc; 537 538 cd->mic = 0; 539 540 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 541 542 cd->ver = NVME_REV(1,4); 543 544 cd->cntrltype = NVME_CNTRLTYPE_IO; 545 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 546 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); 547 cd->acl = 2; 548 cd->aerl = 4; 549 550 /* Advertise 1, Read-only firmware slot */ 551 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 552 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 553 cd->lpa = 0; /* TODO: support some simple things like SMART */ 554 cd->elpe = 0; /* max error log page entries */ 555 cd->npss = 1; /* number of power states support */ 556 557 /* Warning Composite Temperature Threshold */ 558 cd->wctemp = 0x0157; 559 cd->cctemp = 0x0157; 560 561 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 562 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 563 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 564 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 565 cd->nn = 1; /* number of namespaces */ 566 567 cd->oncs = 0; 568 switch (sc->dataset_management) { 569 case NVME_DATASET_MANAGEMENT_AUTO: 570 if (sc->nvstore.deallocate) 571 cd->oncs |= NVME_ONCS_DSM; 572 break; 573 case NVME_DATASET_MANAGEMENT_ENABLE: 574 cd->oncs |= NVME_ONCS_DSM; 575 break; 576 default: 577 break; 578 } 579 580 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 581 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 582 583 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 584 585 cd->power_state[0].mp = 10; 586 } 587 588 /* 589 * Calculate the CRC-16 of the given buffer 590 * See copyright attribution at top of file 591 */ 592 static uint16_t 593 crc16(uint16_t crc, const void *buffer, unsigned int len) 594 { 595 const unsigned char *cp = buffer; 596 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 597 static uint16_t const crc16_table[256] = { 598 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 599 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 600 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 601 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 602 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 603 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 604 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 605 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 606 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 607 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 608 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 609 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 610 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 611 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 612 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 613 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 614 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 615 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 616 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 617 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 618 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 619 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 620 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 621 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 622 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 623 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 624 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 625 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 626 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 627 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 628 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 629 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 630 }; 631 632 while (len--) 633 crc = (((crc >> 8) & 0xffU) ^ 634 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 635 return crc; 636 } 637 638 static void 639 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 640 struct nvme_namespace_data *nd) 641 { 642 643 /* Get capacity and block size information from backing store */ 644 nd->nsze = nvstore->size / nvstore->sectsz; 645 nd->ncap = nd->nsze; 646 nd->nuse = nd->nsze; 647 } 648 649 static void 650 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 651 struct nvme_namespace_data *nd, uint32_t nsid, 652 struct pci_nvme_blockstore *nvstore) 653 { 654 655 pci_nvme_init_nsdata_size(nvstore, nd); 656 657 if (nvstore->type == NVME_STOR_BLOCKIF) 658 nvstore->deallocate = blockif_candelete(nvstore->ctx); 659 660 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 661 nd->flbas = 0; 662 663 /* Create an EUI-64 if user did not provide one */ 664 if (nvstore->eui64 == 0) { 665 char *data = NULL; 666 uint64_t eui64 = nvstore->eui64; 667 668 asprintf(&data, "%s%u%u%u", get_config_value("name"), 669 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 670 sc->nsc_pi->pi_func); 671 672 if (data != NULL) { 673 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 674 free(data); 675 } 676 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 677 } 678 be64enc(nd->eui64, nvstore->eui64); 679 680 /* LBA data-sz = 2^lbads */ 681 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 682 } 683 684 static void 685 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 686 { 687 688 memset(&sc->err_log, 0, sizeof(sc->err_log)); 689 memset(&sc->health_log, 0, sizeof(sc->health_log)); 690 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 691 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 692 693 /* Set read/write remainder to round up according to spec */ 694 sc->read_dunits_remainder = 999; 695 sc->write_dunits_remainder = 999; 696 697 /* Set nominal Health values checked by implementations */ 698 sc->health_log.temperature = NVME_TEMPERATURE; 699 sc->health_log.available_spare = 100; 700 sc->health_log.available_spare_threshold = 10; 701 } 702 703 static void 704 pci_nvme_init_features(struct pci_nvme_softc *sc) 705 { 706 enum nvme_feature fid; 707 708 for (fid = 0; fid < NVME_FID_MAX; fid++) { 709 switch (fid) { 710 case NVME_FEAT_ARBITRATION: 711 case NVME_FEAT_POWER_MANAGEMENT: 712 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 713 case NVME_FEAT_WRITE_ATOMICITY: 714 /* Mandatory but no special handling required */ 715 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 716 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 717 // this returns a data buffer 718 break; 719 case NVME_FEAT_TEMPERATURE_THRESHOLD: 720 sc->feat[fid].set = nvme_feature_temperature; 721 break; 722 case NVME_FEAT_ERROR_RECOVERY: 723 sc->feat[fid].namespace_specific = true; 724 break; 725 case NVME_FEAT_NUMBER_OF_QUEUES: 726 sc->feat[fid].set = nvme_feature_num_queues; 727 break; 728 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 729 sc->feat[fid].set = nvme_feature_iv_config; 730 break; 731 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 732 sc->feat[fid].set = nvme_feature_async_event; 733 /* Enable all AENs by default */ 734 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 735 break; 736 default: 737 sc->feat[fid].set = nvme_feature_invalid_cb; 738 sc->feat[fid].get = nvme_feature_invalid_cb; 739 } 740 } 741 } 742 743 static void 744 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 745 { 746 747 STAILQ_INIT(&sc->aer_list); 748 sc->aer_count = 0; 749 } 750 751 static void 752 pci_nvme_aer_init(struct pci_nvme_softc *sc) 753 { 754 755 pthread_mutex_init(&sc->aer_mtx, NULL); 756 pci_nvme_aer_reset(sc); 757 } 758 759 static void 760 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 761 { 762 struct pci_nvme_aer *aer = NULL; 763 764 pthread_mutex_lock(&sc->aer_mtx); 765 while (!STAILQ_EMPTY(&sc->aer_list)) { 766 aer = STAILQ_FIRST(&sc->aer_list); 767 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 768 free(aer); 769 } 770 pthread_mutex_unlock(&sc->aer_mtx); 771 772 pci_nvme_aer_reset(sc); 773 } 774 775 static bool 776 pci_nvme_aer_available(struct pci_nvme_softc *sc) 777 { 778 779 return (sc->aer_count != 0); 780 } 781 782 static bool 783 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 784 { 785 struct nvme_controller_data *cd = &sc->ctrldata; 786 787 /* AERL is a zero based value while aer_count is one's based */ 788 return (sc->aer_count == (cd->aerl + 1)); 789 } 790 791 /* 792 * Add an Async Event Request 793 * 794 * Stores an AER to be returned later if the Controller needs to notify the 795 * host of an event. 796 * Note that while the NVMe spec doesn't require Controllers to return AER's 797 * in order, this implementation does preserve the order. 798 */ 799 static int 800 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 801 { 802 struct pci_nvme_aer *aer = NULL; 803 804 aer = calloc(1, sizeof(struct pci_nvme_aer)); 805 if (aer == NULL) 806 return (-1); 807 808 /* Save the Command ID for use in the completion message */ 809 aer->cid = cid; 810 811 pthread_mutex_lock(&sc->aer_mtx); 812 sc->aer_count++; 813 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 814 pthread_mutex_unlock(&sc->aer_mtx); 815 816 return (0); 817 } 818 819 /* 820 * Get an Async Event Request structure 821 * 822 * Returns a pointer to an AER previously submitted by the host or NULL if 823 * no AER's exist. Caller is responsible for freeing the returned struct. 824 */ 825 static struct pci_nvme_aer * 826 pci_nvme_aer_get(struct pci_nvme_softc *sc) 827 { 828 struct pci_nvme_aer *aer = NULL; 829 830 pthread_mutex_lock(&sc->aer_mtx); 831 aer = STAILQ_FIRST(&sc->aer_list); 832 if (aer != NULL) { 833 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 834 sc->aer_count--; 835 } 836 pthread_mutex_unlock(&sc->aer_mtx); 837 838 return (aer); 839 } 840 841 static void 842 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 843 { 844 uint32_t atype; 845 846 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 847 848 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 849 sc->aen[atype].atype = atype; 850 } 851 } 852 853 static void 854 pci_nvme_aen_init(struct pci_nvme_softc *sc) 855 { 856 char nstr[80]; 857 858 pci_nvme_aen_reset(sc); 859 860 pthread_mutex_init(&sc->aen_mtx, NULL); 861 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 862 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 863 sc->nsc_pi->pi_func); 864 pthread_set_name_np(sc->aen_tid, nstr); 865 } 866 867 static void 868 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 869 { 870 871 pci_nvme_aen_reset(sc); 872 } 873 874 /* Notify the AEN thread of pending work */ 875 static void 876 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 877 { 878 879 pthread_cond_signal(&sc->aen_cond); 880 } 881 882 /* 883 * Post an Asynchronous Event Notification 884 */ 885 static int32_t 886 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 887 uint32_t event_data) 888 { 889 struct pci_nvme_aen *aen; 890 891 if (atype >= PCI_NVME_AE_TYPE_MAX) { 892 return(EINVAL); 893 } 894 895 pthread_mutex_lock(&sc->aen_mtx); 896 aen = &sc->aen[atype]; 897 898 /* Has the controller already posted an event of this type? */ 899 if (aen->posted) { 900 pthread_mutex_unlock(&sc->aen_mtx); 901 return(EALREADY); 902 } 903 904 aen->event_data = event_data; 905 aen->posted = true; 906 pthread_mutex_unlock(&sc->aen_mtx); 907 908 pci_nvme_aen_notify(sc); 909 910 return(0); 911 } 912 913 static void 914 pci_nvme_aen_process(struct pci_nvme_softc *sc) 915 { 916 struct pci_nvme_aer *aer; 917 struct pci_nvme_aen *aen; 918 pci_nvme_async_type atype; 919 uint32_t mask; 920 uint16_t status; 921 uint8_t lid; 922 923 #ifndef __FreeBSD__ 924 lid = 0; 925 #endif 926 927 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 928 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 929 aen = &sc->aen[atype]; 930 /* Previous iterations may have depleted the available AER's */ 931 if (!pci_nvme_aer_available(sc)) { 932 DPRINTF("%s: no AER", __func__); 933 break; 934 } 935 936 if (!aen->posted) { 937 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 938 continue; 939 } 940 941 status = NVME_SC_SUCCESS; 942 943 /* Is the event masked? */ 944 mask = 945 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 946 947 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 948 switch (atype) { 949 case PCI_NVME_AE_TYPE_ERROR: 950 lid = NVME_LOG_ERROR; 951 break; 952 case PCI_NVME_AE_TYPE_SMART: 953 mask &= 0xff; 954 if ((mask & aen->event_data) == 0) 955 continue; 956 lid = NVME_LOG_HEALTH_INFORMATION; 957 break; 958 case PCI_NVME_AE_TYPE_NOTICE: 959 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { 960 EPRINTLN("%s unknown AEN notice type %u", 961 __func__, aen->event_data); 962 status = NVME_SC_INTERNAL_DEVICE_ERROR; 963 break; 964 } 965 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) 966 continue; 967 switch (aen->event_data) { 968 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: 969 lid = NVME_LOG_CHANGED_NAMESPACE; 970 break; 971 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: 972 lid = NVME_LOG_FIRMWARE_SLOT; 973 break; 974 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: 975 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 976 break; 977 case PCI_NVME_AEI_NOTICE_ANA_CHANGE: 978 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 979 break; 980 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: 981 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 982 break; 983 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: 984 lid = NVME_LOG_LBA_STATUS_INFORMATION; 985 break; 986 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: 987 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 988 break; 989 default: 990 lid = 0; 991 } 992 break; 993 default: 994 /* bad type?!? */ 995 EPRINTLN("%s unknown AEN type %u", __func__, atype); 996 status = NVME_SC_INTERNAL_DEVICE_ERROR; 997 break; 998 } 999 1000 aer = pci_nvme_aer_get(sc); 1001 assert(aer != NULL); 1002 1003 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 1004 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1005 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 1006 aer->cid, 1007 0, /* SQID */ 1008 status); 1009 1010 aen->event_data = 0; 1011 aen->posted = false; 1012 1013 pci_generate_msix(sc->nsc_pi, 0); 1014 } 1015 } 1016 1017 static void * 1018 aen_thr(void *arg) 1019 { 1020 struct pci_nvme_softc *sc; 1021 1022 sc = arg; 1023 1024 pthread_mutex_lock(&sc->aen_mtx); 1025 for (;;) { 1026 pci_nvme_aen_process(sc); 1027 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1028 } 1029 #ifdef __FreeBSD__ 1030 pthread_mutex_unlock(&sc->aen_mtx); 1031 1032 pthread_exit(NULL); 1033 #endif 1034 return (NULL); 1035 } 1036 1037 static void 1038 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1039 { 1040 uint32_t i; 1041 1042 DPRINTF("%s", __func__); 1043 1044 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1045 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1046 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1047 1048 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1049 1050 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1051 1052 sc->regs.cc = 0; 1053 1054 assert(sc->submit_queues != NULL); 1055 1056 for (i = 0; i < sc->num_squeues + 1; i++) { 1057 sc->submit_queues[i].qbase = NULL; 1058 sc->submit_queues[i].size = 0; 1059 sc->submit_queues[i].cqid = 0; 1060 sc->submit_queues[i].tail = 0; 1061 sc->submit_queues[i].head = 0; 1062 } 1063 1064 assert(sc->compl_queues != NULL); 1065 1066 for (i = 0; i < sc->num_cqueues + 1; i++) { 1067 sc->compl_queues[i].qbase = NULL; 1068 sc->compl_queues[i].size = 0; 1069 sc->compl_queues[i].tail = 0; 1070 sc->compl_queues[i].head = 0; 1071 } 1072 1073 sc->num_q_is_set = false; 1074 1075 pci_nvme_aer_destroy(sc); 1076 pci_nvme_aen_destroy(sc); 1077 1078 /* 1079 * Clear CSTS.RDY last to prevent the host from enabling Controller 1080 * before cleanup completes 1081 */ 1082 sc->regs.csts = 0; 1083 } 1084 1085 static void 1086 pci_nvme_reset(struct pci_nvme_softc *sc) 1087 { 1088 pthread_mutex_lock(&sc->mtx); 1089 pci_nvme_reset_locked(sc); 1090 pthread_mutex_unlock(&sc->mtx); 1091 } 1092 1093 static void 1094 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1095 { 1096 uint16_t acqs, asqs; 1097 1098 DPRINTF("%s", __func__); 1099 1100 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1101 sc->submit_queues[0].size = asqs; 1102 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1103 sizeof(struct nvme_command) * asqs); 1104 1105 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1106 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1107 1108 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1109 NVME_AQA_REG_ACQS_MASK) + 1; 1110 sc->compl_queues[0].size = acqs; 1111 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1112 sizeof(struct nvme_completion) * acqs); 1113 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1114 1115 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1116 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1117 } 1118 1119 static int 1120 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1121 size_t len, enum nvme_copy_dir dir) 1122 { 1123 uint8_t *p; 1124 size_t bytes; 1125 1126 if (len > (8 * 1024)) { 1127 return (-1); 1128 } 1129 1130 /* Copy from the start of prp1 to the end of the physical page */ 1131 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1132 bytes = MIN(bytes, len); 1133 1134 p = vm_map_gpa(ctx, prp1, bytes); 1135 if (p == NULL) { 1136 return (-1); 1137 } 1138 1139 if (dir == NVME_COPY_TO_PRP) 1140 memcpy(p, b, bytes); 1141 else 1142 memcpy(b, p, bytes); 1143 1144 b += bytes; 1145 1146 len -= bytes; 1147 if (len == 0) { 1148 return (0); 1149 } 1150 1151 len = MIN(len, PAGE_SIZE); 1152 1153 p = vm_map_gpa(ctx, prp2, len); 1154 if (p == NULL) { 1155 return (-1); 1156 } 1157 1158 if (dir == NVME_COPY_TO_PRP) 1159 memcpy(p, b, len); 1160 else 1161 memcpy(b, p, len); 1162 1163 return (0); 1164 } 1165 1166 /* 1167 * Write a Completion Queue Entry update 1168 * 1169 * Write the completion and update the doorbell value 1170 */ 1171 static void 1172 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1173 struct nvme_completion_queue *cq, 1174 uint32_t cdw0, 1175 uint16_t cid, 1176 uint16_t sqid, 1177 uint16_t status) 1178 { 1179 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1180 struct nvme_completion *cqe; 1181 1182 assert(cq->qbase != NULL); 1183 1184 pthread_mutex_lock(&cq->mtx); 1185 1186 cqe = &cq->qbase[cq->tail]; 1187 1188 /* Flip the phase bit */ 1189 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1190 1191 cqe->cdw0 = cdw0; 1192 cqe->sqhd = sq->head; 1193 cqe->sqid = sqid; 1194 cqe->cid = cid; 1195 cqe->status = status; 1196 1197 cq->tail++; 1198 if (cq->tail >= cq->size) { 1199 cq->tail = 0; 1200 } 1201 1202 pthread_mutex_unlock(&cq->mtx); 1203 } 1204 1205 static int 1206 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1207 struct nvme_completion* compl) 1208 { 1209 uint16_t qid = command->cdw10 & 0xffff; 1210 1211 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1212 if (qid == 0 || qid > sc->num_squeues || 1213 (sc->submit_queues[qid].qbase == NULL)) { 1214 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1215 __func__, qid, sc->num_squeues); 1216 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1217 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1218 return (1); 1219 } 1220 1221 sc->submit_queues[qid].qbase = NULL; 1222 sc->submit_queues[qid].cqid = 0; 1223 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1224 return (1); 1225 } 1226 1227 static int 1228 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1229 struct nvme_completion* compl) 1230 { 1231 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1232 uint16_t qid = command->cdw10 & 0xffff; 1233 struct nvme_submission_queue *nsq; 1234 1235 if ((qid == 0) || (qid > sc->num_squeues) || 1236 (sc->submit_queues[qid].qbase != NULL)) { 1237 WPRINTF("%s queue index %u > num_squeues %u", 1238 __func__, qid, sc->num_squeues); 1239 pci_nvme_status_tc(&compl->status, 1240 NVME_SCT_COMMAND_SPECIFIC, 1241 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1242 return (1); 1243 } 1244 1245 nsq = &sc->submit_queues[qid]; 1246 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1247 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1248 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1249 /* 1250 * Queues must specify at least two entries 1251 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1252 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1253 */ 1254 pci_nvme_status_tc(&compl->status, 1255 NVME_SCT_COMMAND_SPECIFIC, 1256 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1257 return (1); 1258 } 1259 nsq->head = nsq->tail = 0; 1260 1261 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1262 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1263 pci_nvme_status_tc(&compl->status, 1264 NVME_SCT_COMMAND_SPECIFIC, 1265 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1266 return (1); 1267 } 1268 1269 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1270 pci_nvme_status_tc(&compl->status, 1271 NVME_SCT_COMMAND_SPECIFIC, 1272 NVME_SC_COMPLETION_QUEUE_INVALID); 1273 return (1); 1274 } 1275 1276 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1277 1278 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1279 sizeof(struct nvme_command) * (size_t)nsq->size); 1280 1281 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1282 qid, nsq->size, nsq->qbase, nsq->cqid); 1283 1284 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1285 1286 DPRINTF("%s completed creating IOSQ qid %u", 1287 __func__, qid); 1288 } else { 1289 /* 1290 * Guest sent non-cont submission queue request. 1291 * This setting is unsupported by this emulation. 1292 */ 1293 WPRINTF("%s unsupported non-contig (list-based) " 1294 "create i/o submission queue", __func__); 1295 1296 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1297 } 1298 return (1); 1299 } 1300 1301 static int 1302 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1303 struct nvme_completion* compl) 1304 { 1305 uint16_t qid = command->cdw10 & 0xffff; 1306 uint16_t sqid; 1307 1308 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1309 if (qid == 0 || qid > sc->num_cqueues || 1310 (sc->compl_queues[qid].qbase == NULL)) { 1311 WPRINTF("%s queue index %u / num_cqueues %u", 1312 __func__, qid, sc->num_cqueues); 1313 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1314 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1315 return (1); 1316 } 1317 1318 /* Deleting an Active CQ is an error */ 1319 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1320 if (sc->submit_queues[sqid].cqid == qid) { 1321 pci_nvme_status_tc(&compl->status, 1322 NVME_SCT_COMMAND_SPECIFIC, 1323 NVME_SC_INVALID_QUEUE_DELETION); 1324 return (1); 1325 } 1326 1327 sc->compl_queues[qid].qbase = NULL; 1328 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1329 return (1); 1330 } 1331 1332 static int 1333 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1334 struct nvme_completion* compl) 1335 { 1336 struct nvme_completion_queue *ncq; 1337 uint16_t qid = command->cdw10 & 0xffff; 1338 1339 /* Only support Physically Contiguous queues */ 1340 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1341 WPRINTF("%s unsupported non-contig (list-based) " 1342 "create i/o completion queue", 1343 __func__); 1344 1345 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1346 return (1); 1347 } 1348 1349 if ((qid == 0) || (qid > sc->num_cqueues) || 1350 (sc->compl_queues[qid].qbase != NULL)) { 1351 WPRINTF("%s queue index %u > num_cqueues %u", 1352 __func__, qid, sc->num_cqueues); 1353 pci_nvme_status_tc(&compl->status, 1354 NVME_SCT_COMMAND_SPECIFIC, 1355 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1356 return (1); 1357 } 1358 1359 ncq = &sc->compl_queues[qid]; 1360 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1361 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1362 if (ncq->intr_vec > (sc->max_queues + 1)) { 1363 pci_nvme_status_tc(&compl->status, 1364 NVME_SCT_COMMAND_SPECIFIC, 1365 NVME_SC_INVALID_INTERRUPT_VECTOR); 1366 return (1); 1367 } 1368 1369 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1370 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1371 /* 1372 * Queues must specify at least two entries 1373 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1374 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1375 */ 1376 pci_nvme_status_tc(&compl->status, 1377 NVME_SCT_COMMAND_SPECIFIC, 1378 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1379 return (1); 1380 } 1381 ncq->head = ncq->tail = 0; 1382 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1383 command->prp1, 1384 sizeof(struct nvme_command) * (size_t)ncq->size); 1385 1386 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1387 1388 1389 return (1); 1390 } 1391 1392 static int 1393 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1394 struct nvme_completion* compl) 1395 { 1396 uint64_t logoff; 1397 uint32_t logsize; 1398 uint8_t logpage = command->cdw10 & 0xFF; 1399 1400 #ifndef __FreeBSD__ 1401 logsize = 0; 1402 #endif 1403 1404 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1405 1406 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1407 1408 /* 1409 * Command specifies the number of dwords to return in fields NUMDU 1410 * and NUMDL. This is a zero-based value. 1411 */ 1412 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1413 logsize *= sizeof(uint32_t); 1414 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; 1415 1416 switch (logpage) { 1417 case NVME_LOG_ERROR: 1418 if (logoff >= sizeof(sc->err_log)) { 1419 pci_nvme_status_genc(&compl->status, 1420 NVME_SC_INVALID_FIELD); 1421 break; 1422 } 1423 1424 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1425 command->prp2, (uint8_t *)&sc->err_log + logoff, 1426 MIN(logsize - logoff, sizeof(sc->err_log)), 1427 NVME_COPY_TO_PRP); 1428 break; 1429 case NVME_LOG_HEALTH_INFORMATION: 1430 if (logoff >= sizeof(sc->health_log)) { 1431 pci_nvme_status_genc(&compl->status, 1432 NVME_SC_INVALID_FIELD); 1433 break; 1434 } 1435 1436 pthread_mutex_lock(&sc->mtx); 1437 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1438 sizeof(sc->health_log.data_units_read)); 1439 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1440 sizeof(sc->health_log.data_units_written)); 1441 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1442 sizeof(sc->health_log.host_read_commands)); 1443 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1444 sizeof(sc->health_log.host_write_commands)); 1445 pthread_mutex_unlock(&sc->mtx); 1446 1447 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1448 command->prp2, (uint8_t *)&sc->health_log + logoff, 1449 MIN(logsize - logoff, sizeof(sc->health_log)), 1450 NVME_COPY_TO_PRP); 1451 break; 1452 case NVME_LOG_FIRMWARE_SLOT: 1453 if (logoff >= sizeof(sc->fw_log)) { 1454 pci_nvme_status_genc(&compl->status, 1455 NVME_SC_INVALID_FIELD); 1456 break; 1457 } 1458 1459 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1460 command->prp2, (uint8_t *)&sc->fw_log + logoff, 1461 MIN(logsize - logoff, sizeof(sc->fw_log)), 1462 NVME_COPY_TO_PRP); 1463 break; 1464 case NVME_LOG_CHANGED_NAMESPACE: 1465 if (logoff >= sizeof(sc->ns_log)) { 1466 pci_nvme_status_genc(&compl->status, 1467 NVME_SC_INVALID_FIELD); 1468 break; 1469 } 1470 1471 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1472 command->prp2, (uint8_t *)&sc->ns_log + logoff, 1473 MIN(logsize - logoff, sizeof(sc->ns_log)), 1474 NVME_COPY_TO_PRP); 1475 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1476 break; 1477 default: 1478 DPRINTF("%s get log page %x command not supported", 1479 __func__, logpage); 1480 1481 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1482 NVME_SC_INVALID_LOG_PAGE); 1483 } 1484 1485 return (1); 1486 } 1487 1488 static int 1489 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1490 struct nvme_completion* compl) 1491 { 1492 void *dest; 1493 uint16_t status; 1494 1495 #ifndef __FreeBSD__ 1496 status = 0; 1497 #endif 1498 1499 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1500 command->cdw10 & 0xFF, command->nsid); 1501 1502 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1503 1504 switch (command->cdw10 & 0xFF) { 1505 case 0x00: /* return Identify Namespace data structure */ 1506 /* Global NS only valid with NS Management */ 1507 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1508 pci_nvme_status_genc(&status, 1509 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1510 break; 1511 } 1512 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1513 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1514 NVME_COPY_TO_PRP); 1515 break; 1516 case 0x01: /* return Identify Controller data structure */ 1517 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1518 command->prp2, (uint8_t *)&sc->ctrldata, 1519 sizeof(sc->ctrldata), 1520 NVME_COPY_TO_PRP); 1521 break; 1522 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1523 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1524 sizeof(uint32_t) * 1024); 1525 /* All unused entries shall be zero */ 1526 bzero(dest, sizeof(uint32_t) * 1024); 1527 ((uint32_t *)dest)[0] = 1; 1528 break; 1529 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1530 if (command->nsid != 1) { 1531 pci_nvme_status_genc(&status, 1532 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1533 break; 1534 } 1535 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1536 sizeof(uint32_t) * 1024); 1537 /* All bytes after the descriptor shall be zero */ 1538 bzero(dest, sizeof(uint32_t) * 1024); 1539 1540 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1541 ((uint8_t *)dest)[0] = 1; 1542 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1543 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1544 break; 1545 default: 1546 DPRINTF("%s unsupported identify command requested 0x%x", 1547 __func__, command->cdw10 & 0xFF); 1548 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1549 break; 1550 } 1551 1552 compl->status = status; 1553 return (1); 1554 } 1555 1556 static const char * 1557 nvme_fid_to_name(uint8_t fid) 1558 { 1559 const char *name; 1560 1561 switch (fid) { 1562 case NVME_FEAT_ARBITRATION: 1563 name = "Arbitration"; 1564 break; 1565 case NVME_FEAT_POWER_MANAGEMENT: 1566 name = "Power Management"; 1567 break; 1568 case NVME_FEAT_LBA_RANGE_TYPE: 1569 name = "LBA Range Type"; 1570 break; 1571 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1572 name = "Temperature Threshold"; 1573 break; 1574 case NVME_FEAT_ERROR_RECOVERY: 1575 name = "Error Recovery"; 1576 break; 1577 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1578 name = "Volatile Write Cache"; 1579 break; 1580 case NVME_FEAT_NUMBER_OF_QUEUES: 1581 name = "Number of Queues"; 1582 break; 1583 case NVME_FEAT_INTERRUPT_COALESCING: 1584 name = "Interrupt Coalescing"; 1585 break; 1586 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1587 name = "Interrupt Vector Configuration"; 1588 break; 1589 case NVME_FEAT_WRITE_ATOMICITY: 1590 name = "Write Atomicity Normal"; 1591 break; 1592 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1593 name = "Asynchronous Event Configuration"; 1594 break; 1595 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1596 name = "Autonomous Power State Transition"; 1597 break; 1598 case NVME_FEAT_HOST_MEMORY_BUFFER: 1599 name = "Host Memory Buffer"; 1600 break; 1601 case NVME_FEAT_TIMESTAMP: 1602 name = "Timestamp"; 1603 break; 1604 case NVME_FEAT_KEEP_ALIVE_TIMER: 1605 name = "Keep Alive Timer"; 1606 break; 1607 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1608 name = "Host Controlled Thermal Management"; 1609 break; 1610 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1611 name = "Non-Operation Power State Config"; 1612 break; 1613 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1614 name = "Read Recovery Level Config"; 1615 break; 1616 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1617 name = "Predictable Latency Mode Config"; 1618 break; 1619 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1620 name = "Predictable Latency Mode Window"; 1621 break; 1622 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1623 name = "LBA Status Information Report Interval"; 1624 break; 1625 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1626 name = "Host Behavior Support"; 1627 break; 1628 case NVME_FEAT_SANITIZE_CONFIG: 1629 name = "Sanitize Config"; 1630 break; 1631 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1632 name = "Endurance Group Event Configuration"; 1633 break; 1634 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1635 name = "Software Progress Marker"; 1636 break; 1637 case NVME_FEAT_HOST_IDENTIFIER: 1638 name = "Host Identifier"; 1639 break; 1640 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1641 name = "Reservation Notification Mask"; 1642 break; 1643 case NVME_FEAT_RESERVATION_PERSISTENCE: 1644 name = "Reservation Persistence"; 1645 break; 1646 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1647 name = "Namespace Write Protection Config"; 1648 break; 1649 default: 1650 name = "Unknown"; 1651 break; 1652 } 1653 1654 return (name); 1655 } 1656 1657 static void 1658 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1659 struct nvme_feature_obj *feat, 1660 struct nvme_command *command, 1661 struct nvme_completion *compl) 1662 { 1663 1664 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1665 } 1666 1667 static void 1668 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1669 struct nvme_feature_obj *feat, 1670 struct nvme_command *command, 1671 struct nvme_completion *compl) 1672 { 1673 uint32_t i; 1674 uint32_t cdw11 = command->cdw11; 1675 uint16_t iv; 1676 bool cd; 1677 1678 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1679 1680 iv = cdw11 & 0xffff; 1681 cd = cdw11 & (1 << 16); 1682 1683 if (iv > (sc->max_queues + 1)) { 1684 return; 1685 } 1686 1687 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1688 if ((iv == 0) && !cd) 1689 return; 1690 1691 /* Requested Interrupt Vector must be used by a CQ */ 1692 for (i = 0; i < sc->num_cqueues + 1; i++) { 1693 if (sc->compl_queues[i].intr_vec == iv) { 1694 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1695 } 1696 } 1697 } 1698 1699 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1700 static void 1701 nvme_feature_async_event(struct pci_nvme_softc *sc, 1702 struct nvme_feature_obj *feat, 1703 struct nvme_command *command, 1704 struct nvme_completion *compl) 1705 { 1706 1707 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1708 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1709 } 1710 1711 #define NVME_TEMP_THRESH_OVER 0 1712 #define NVME_TEMP_THRESH_UNDER 1 1713 static void 1714 nvme_feature_temperature(struct pci_nvme_softc *sc, 1715 struct nvme_feature_obj *feat, 1716 struct nvme_command *command, 1717 struct nvme_completion *compl) 1718 { 1719 uint16_t tmpth; /* Temperature Threshold */ 1720 uint8_t tmpsel; /* Threshold Temperature Select */ 1721 uint8_t thsel; /* Threshold Type Select */ 1722 bool set_crit = false; 1723 1724 tmpth = command->cdw11 & 0xffff; 1725 tmpsel = (command->cdw11 >> 16) & 0xf; 1726 thsel = (command->cdw11 >> 20) & 0x3; 1727 1728 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1729 1730 /* Check for unsupported values */ 1731 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1732 (thsel > NVME_TEMP_THRESH_UNDER)) { 1733 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1734 return; 1735 } 1736 1737 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1738 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1739 set_crit = true; 1740 1741 pthread_mutex_lock(&sc->mtx); 1742 if (set_crit) 1743 sc->health_log.critical_warning |= 1744 NVME_CRIT_WARN_ST_TEMPERATURE; 1745 else 1746 sc->health_log.critical_warning &= 1747 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1748 pthread_mutex_unlock(&sc->mtx); 1749 1750 if (set_crit) 1751 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1752 sc->health_log.critical_warning); 1753 1754 1755 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); 1756 } 1757 1758 static void 1759 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1760 struct nvme_feature_obj *feat, 1761 struct nvme_command *command, 1762 struct nvme_completion *compl) 1763 { 1764 uint16_t nqr; /* Number of Queues Requested */ 1765 1766 if (sc->num_q_is_set) { 1767 WPRINTF("%s: Number of Queues already set", __func__); 1768 pci_nvme_status_genc(&compl->status, 1769 NVME_SC_COMMAND_SEQUENCE_ERROR); 1770 return; 1771 } 1772 1773 nqr = command->cdw11 & 0xFFFF; 1774 if (nqr == 0xffff) { 1775 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1776 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1777 return; 1778 } 1779 1780 sc->num_squeues = ONE_BASED(nqr); 1781 if (sc->num_squeues > sc->max_queues) { 1782 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1783 sc->max_queues); 1784 sc->num_squeues = sc->max_queues; 1785 } 1786 1787 nqr = (command->cdw11 >> 16) & 0xFFFF; 1788 if (nqr == 0xffff) { 1789 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1790 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1791 return; 1792 } 1793 1794 sc->num_cqueues = ONE_BASED(nqr); 1795 if (sc->num_cqueues > sc->max_queues) { 1796 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1797 sc->max_queues); 1798 sc->num_cqueues = sc->max_queues; 1799 } 1800 1801 /* Patch the command value which will be saved on callback's return */ 1802 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1803 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1804 1805 sc->num_q_is_set = true; 1806 } 1807 1808 static int 1809 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1810 struct nvme_completion *compl) 1811 { 1812 struct nvme_feature_obj *feat; 1813 uint32_t nsid = command->nsid; 1814 uint8_t fid = command->cdw10 & 0xFF; 1815 1816 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1817 1818 if (fid >= NVME_FID_MAX) { 1819 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1820 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1821 return (1); 1822 } 1823 feat = &sc->feat[fid]; 1824 1825 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1826 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1827 return (1); 1828 } 1829 1830 if (!feat->namespace_specific && 1831 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1832 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1833 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1834 return (1); 1835 } 1836 1837 compl->cdw0 = 0; 1838 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1839 1840 if (feat->set) 1841 feat->set(sc, feat, command, compl); 1842 1843 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1844 if (compl->status == NVME_SC_SUCCESS) { 1845 feat->cdw11 = command->cdw11; 1846 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1847 (command->cdw11 != 0)) 1848 pci_nvme_aen_notify(sc); 1849 } 1850 1851 return (0); 1852 } 1853 1854 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1855 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1856 1857 static int 1858 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1859 struct nvme_completion* compl) 1860 { 1861 struct nvme_feature_obj *feat; 1862 uint8_t fid = command->cdw10 & 0xFF; 1863 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1864 1865 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1866 1867 if (fid >= NVME_FID_MAX) { 1868 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1869 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1870 return (1); 1871 } 1872 1873 compl->cdw0 = 0; 1874 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1875 1876 feat = &sc->feat[fid]; 1877 if (feat->get) { 1878 feat->get(sc, feat, command, compl); 1879 } 1880 1881 if (compl->status == NVME_SC_SUCCESS) { 1882 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1883 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1884 else 1885 compl->cdw0 = feat->cdw11; 1886 } 1887 1888 return (0); 1889 } 1890 1891 static int 1892 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1893 struct nvme_completion* compl) 1894 { 1895 uint8_t ses, lbaf, pi; 1896 1897 /* Only supports Secure Erase Setting - User Data Erase */ 1898 ses = (command->cdw10 >> 9) & 0x7; 1899 if (ses > 0x1) { 1900 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1901 return (1); 1902 } 1903 1904 /* Only supports a single LBA Format */ 1905 lbaf = command->cdw10 & 0xf; 1906 if (lbaf != 0) { 1907 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1908 NVME_SC_INVALID_FORMAT); 1909 return (1); 1910 } 1911 1912 /* Doesn't support Protection Infomation */ 1913 pi = (command->cdw10 >> 5) & 0x7; 1914 if (pi != 0) { 1915 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1916 return (1); 1917 } 1918 1919 if (sc->nvstore.type == NVME_STOR_RAM) { 1920 if (sc->nvstore.ctx) 1921 free(sc->nvstore.ctx); 1922 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1923 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1924 } else { 1925 struct pci_nvme_ioreq *req; 1926 int err; 1927 1928 req = pci_nvme_get_ioreq(sc); 1929 if (req == NULL) { 1930 pci_nvme_status_genc(&compl->status, 1931 NVME_SC_INTERNAL_DEVICE_ERROR); 1932 WPRINTF("%s: unable to allocate IO req", __func__); 1933 return (1); 1934 } 1935 req->nvme_sq = &sc->submit_queues[0]; 1936 req->sqid = 0; 1937 req->opc = command->opc; 1938 req->cid = command->cid; 1939 req->nsid = command->nsid; 1940 1941 req->io_req.br_offset = 0; 1942 req->io_req.br_resid = sc->nvstore.size; 1943 req->io_req.br_callback = pci_nvme_io_done; 1944 1945 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1946 if (err) { 1947 pci_nvme_status_genc(&compl->status, 1948 NVME_SC_INTERNAL_DEVICE_ERROR); 1949 pci_nvme_release_ioreq(sc, req); 1950 } else 1951 compl->status = NVME_NO_STATUS; 1952 } 1953 1954 return (1); 1955 } 1956 1957 static int 1958 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1959 struct nvme_completion* compl) 1960 { 1961 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1962 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1963 1964 /* TODO: search for the command ID and abort it */ 1965 1966 compl->cdw0 = 1; 1967 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1968 return (1); 1969 } 1970 1971 static int 1972 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1973 struct nvme_command* command, struct nvme_completion* compl) 1974 { 1975 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1976 sc->aer_count, sc->ctrldata.aerl, command->cid); 1977 1978 /* Don't exceed the Async Event Request Limit (AERL). */ 1979 if (pci_nvme_aer_limit_reached(sc)) { 1980 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1981 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1982 return (1); 1983 } 1984 1985 if (pci_nvme_aer_add(sc, command->cid)) { 1986 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1987 NVME_SC_INTERNAL_DEVICE_ERROR); 1988 return (1); 1989 } 1990 1991 /* 1992 * Raise events when they happen based on the Set Features cmd. 1993 * These events happen async, so only set completion successful if 1994 * there is an event reflective of the request to get event. 1995 */ 1996 compl->status = NVME_NO_STATUS; 1997 pci_nvme_aen_notify(sc); 1998 1999 return (0); 2000 } 2001 2002 static void 2003 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2004 { 2005 struct nvme_completion compl; 2006 struct nvme_command *cmd; 2007 struct nvme_submission_queue *sq; 2008 struct nvme_completion_queue *cq; 2009 uint16_t sqhead; 2010 2011 DPRINTF("%s index %u", __func__, (uint32_t)value); 2012 2013 sq = &sc->submit_queues[0]; 2014 cq = &sc->compl_queues[0]; 2015 2016 pthread_mutex_lock(&sq->mtx); 2017 2018 sqhead = sq->head; 2019 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2020 2021 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2022 cmd = &(sq->qbase)[sqhead]; 2023 compl.cdw0 = 0; 2024 compl.status = 0; 2025 2026 switch (cmd->opc) { 2027 case NVME_OPC_DELETE_IO_SQ: 2028 DPRINTF("%s command DELETE_IO_SQ", __func__); 2029 nvme_opc_delete_io_sq(sc, cmd, &compl); 2030 break; 2031 case NVME_OPC_CREATE_IO_SQ: 2032 DPRINTF("%s command CREATE_IO_SQ", __func__); 2033 nvme_opc_create_io_sq(sc, cmd, &compl); 2034 break; 2035 case NVME_OPC_DELETE_IO_CQ: 2036 DPRINTF("%s command DELETE_IO_CQ", __func__); 2037 nvme_opc_delete_io_cq(sc, cmd, &compl); 2038 break; 2039 case NVME_OPC_CREATE_IO_CQ: 2040 DPRINTF("%s command CREATE_IO_CQ", __func__); 2041 nvme_opc_create_io_cq(sc, cmd, &compl); 2042 break; 2043 case NVME_OPC_GET_LOG_PAGE: 2044 DPRINTF("%s command GET_LOG_PAGE", __func__); 2045 nvme_opc_get_log_page(sc, cmd, &compl); 2046 break; 2047 case NVME_OPC_IDENTIFY: 2048 DPRINTF("%s command IDENTIFY", __func__); 2049 nvme_opc_identify(sc, cmd, &compl); 2050 break; 2051 case NVME_OPC_ABORT: 2052 DPRINTF("%s command ABORT", __func__); 2053 nvme_opc_abort(sc, cmd, &compl); 2054 break; 2055 case NVME_OPC_SET_FEATURES: 2056 DPRINTF("%s command SET_FEATURES", __func__); 2057 nvme_opc_set_features(sc, cmd, &compl); 2058 break; 2059 case NVME_OPC_GET_FEATURES: 2060 DPRINTF("%s command GET_FEATURES", __func__); 2061 nvme_opc_get_features(sc, cmd, &compl); 2062 break; 2063 case NVME_OPC_FIRMWARE_ACTIVATE: 2064 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2065 pci_nvme_status_tc(&compl.status, 2066 NVME_SCT_COMMAND_SPECIFIC, 2067 NVME_SC_INVALID_FIRMWARE_SLOT); 2068 break; 2069 case NVME_OPC_ASYNC_EVENT_REQUEST: 2070 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2071 nvme_opc_async_event_req(sc, cmd, &compl); 2072 break; 2073 case NVME_OPC_FORMAT_NVM: 2074 DPRINTF("%s command FORMAT_NVM", __func__); 2075 if ((sc->ctrldata.oacs & 2076 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2077 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2078 break; 2079 } 2080 nvme_opc_format_nvm(sc, cmd, &compl); 2081 break; 2082 case NVME_OPC_SECURITY_SEND: 2083 case NVME_OPC_SECURITY_RECEIVE: 2084 case NVME_OPC_SANITIZE: 2085 case NVME_OPC_GET_LBA_STATUS: 2086 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2087 cmd->opc); 2088 /* Valid but unsupported opcodes */ 2089 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2090 break; 2091 default: 2092 DPRINTF("%s command OPC=%#X (not implemented)", 2093 __func__, 2094 cmd->opc); 2095 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2096 } 2097 sqhead = (sqhead + 1) % sq->size; 2098 2099 if (NVME_COMPLETION_VALID(compl)) { 2100 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2101 compl.cdw0, 2102 cmd->cid, 2103 0, /* SQID */ 2104 compl.status); 2105 } 2106 } 2107 2108 DPRINTF("setting sqhead %u", sqhead); 2109 sq->head = sqhead; 2110 2111 if (cq->head != cq->tail) 2112 pci_generate_msix(sc->nsc_pi, 0); 2113 2114 pthread_mutex_unlock(&sq->mtx); 2115 } 2116 2117 /* 2118 * Update the Write and Read statistics reported in SMART data 2119 * 2120 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2121 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2122 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 2123 */ 2124 static void 2125 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2126 size_t bytes, uint16_t status) 2127 { 2128 2129 pthread_mutex_lock(&sc->mtx); 2130 switch (opc) { 2131 case NVME_OPC_WRITE: 2132 sc->write_commands++; 2133 if (status != NVME_SC_SUCCESS) 2134 break; 2135 sc->write_dunits_remainder += (bytes / 512); 2136 while (sc->write_dunits_remainder >= 1000) { 2137 sc->write_data_units++; 2138 sc->write_dunits_remainder -= 1000; 2139 } 2140 break; 2141 case NVME_OPC_READ: 2142 sc->read_commands++; 2143 if (status != NVME_SC_SUCCESS) 2144 break; 2145 sc->read_dunits_remainder += (bytes / 512); 2146 while (sc->read_dunits_remainder >= 1000) { 2147 sc->read_data_units++; 2148 sc->read_dunits_remainder -= 1000; 2149 } 2150 break; 2151 default: 2152 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2153 break; 2154 } 2155 pthread_mutex_unlock(&sc->mtx); 2156 } 2157 2158 /* 2159 * Check if the combination of Starting LBA (slba) and number of blocks 2160 * exceeds the range of the underlying storage. 2161 * 2162 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2163 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2164 * overflow. 2165 */ 2166 static bool 2167 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2168 uint32_t nblocks) 2169 { 2170 size_t offset, bytes; 2171 2172 /* Overflow check of multiplying Starting LBA by the sector size */ 2173 if (slba >> (64 - nvstore->sectsz_bits)) 2174 return (true); 2175 2176 offset = slba << nvstore->sectsz_bits; 2177 bytes = nblocks << nvstore->sectsz_bits; 2178 2179 /* Overflow check of Number of Logical Blocks */ 2180 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2181 return (true); 2182 2183 return (false); 2184 } 2185 2186 static int 2187 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2188 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 2189 { 2190 int iovidx; 2191 2192 if (req == NULL) 2193 return (-1); 2194 2195 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2196 return (-1); 2197 } 2198 2199 /* concatenate contig block-iovs to minimize number of iovs */ 2200 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 2201 iovidx = req->io_req.br_iovcnt - 1; 2202 2203 req->io_req.br_iov[iovidx].iov_base = 2204 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2205 req->prev_gpaddr, size); 2206 2207 req->prev_size += size; 2208 req->io_req.br_resid += size; 2209 2210 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2211 } else { 2212 iovidx = req->io_req.br_iovcnt; 2213 if (iovidx == 0) { 2214 req->io_req.br_offset = lba; 2215 req->io_req.br_resid = 0; 2216 req->io_req.br_param = req; 2217 } 2218 2219 req->io_req.br_iov[iovidx].iov_base = 2220 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2221 gpaddr, size); 2222 2223 req->io_req.br_iov[iovidx].iov_len = size; 2224 2225 req->prev_gpaddr = gpaddr; 2226 req->prev_size = size; 2227 req->io_req.br_resid += size; 2228 2229 req->io_req.br_iovcnt++; 2230 } 2231 2232 return (0); 2233 } 2234 2235 static void 2236 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2237 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2238 uint32_t cdw0, uint16_t status) 2239 { 2240 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2241 2242 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2243 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2244 NVME_STATUS_GET_SC(status)); 2245 2246 pci_nvme_cq_update(sc, cq, 2247 0, /* CDW0 */ 2248 cid, 2249 sqid, 2250 status); 2251 2252 if (cq->head != cq->tail) { 2253 if (cq->intr_en & NVME_CQ_INTEN) { 2254 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2255 } else { 2256 DPRINTF("%s: CQ%u interrupt disabled", 2257 __func__, sq->cqid); 2258 } 2259 } 2260 } 2261 2262 static void 2263 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2264 { 2265 req->sc = NULL; 2266 req->nvme_sq = NULL; 2267 req->sqid = 0; 2268 2269 pthread_mutex_lock(&sc->mtx); 2270 2271 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2272 sc->pending_ios--; 2273 2274 /* when no more IO pending, can set to ready if device reset/enabled */ 2275 if (sc->pending_ios == 0 && 2276 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2277 sc->regs.csts |= NVME_CSTS_RDY; 2278 2279 pthread_mutex_unlock(&sc->mtx); 2280 2281 sem_post(&sc->iosemlock); 2282 } 2283 2284 static struct pci_nvme_ioreq * 2285 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2286 { 2287 struct pci_nvme_ioreq *req = NULL; 2288 2289 sem_wait(&sc->iosemlock); 2290 pthread_mutex_lock(&sc->mtx); 2291 2292 req = STAILQ_FIRST(&sc->ioreqs_free); 2293 assert(req != NULL); 2294 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2295 2296 req->sc = sc; 2297 2298 sc->pending_ios++; 2299 2300 pthread_mutex_unlock(&sc->mtx); 2301 2302 req->io_req.br_iovcnt = 0; 2303 req->io_req.br_offset = 0; 2304 req->io_req.br_resid = 0; 2305 req->io_req.br_param = req; 2306 req->prev_gpaddr = 0; 2307 req->prev_size = 0; 2308 2309 return req; 2310 } 2311 2312 static void 2313 pci_nvme_io_done(struct blockif_req *br, int err) 2314 { 2315 struct pci_nvme_ioreq *req = br->br_param; 2316 struct nvme_submission_queue *sq = req->nvme_sq; 2317 uint16_t code, status; 2318 2319 #ifndef __FreeBSD__ 2320 status = 0; 2321 #endif 2322 2323 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2324 2325 /* TODO return correct error */ 2326 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2327 pci_nvme_status_genc(&status, code); 2328 2329 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 2330 pci_nvme_stats_write_read_update(req->sc, req->opc, 2331 req->bytes, status); 2332 pci_nvme_release_ioreq(req->sc, req); 2333 } 2334 2335 /* 2336 * Implements the Flush command. The specification states: 2337 * If a volatile write cache is not present, Flush commands complete 2338 * successfully and have no effect 2339 * in the description of the Volatile Write Cache (VWC) field of the Identify 2340 * Controller data. Therefore, set status to Success if the command is 2341 * not supported (i.e. RAM or as indicated by the blockif). 2342 */ 2343 static bool 2344 nvme_opc_flush(struct pci_nvme_softc *sc, 2345 struct nvme_command *cmd, 2346 struct pci_nvme_blockstore *nvstore, 2347 struct pci_nvme_ioreq *req, 2348 uint16_t *status) 2349 { 2350 bool pending = false; 2351 2352 if (nvstore->type == NVME_STOR_RAM) { 2353 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2354 } else { 2355 int err; 2356 2357 req->io_req.br_callback = pci_nvme_io_done; 2358 2359 err = blockif_flush(nvstore->ctx, &req->io_req); 2360 switch (err) { 2361 case 0: 2362 pending = true; 2363 break; 2364 case EOPNOTSUPP: 2365 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2366 break; 2367 default: 2368 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2369 } 2370 } 2371 2372 return (pending); 2373 } 2374 2375 static uint16_t 2376 nvme_write_read_ram(struct pci_nvme_softc *sc, 2377 struct pci_nvme_blockstore *nvstore, 2378 uint64_t prp1, uint64_t prp2, 2379 size_t offset, uint64_t bytes, 2380 bool is_write) 2381 { 2382 uint8_t *buf = nvstore->ctx; 2383 enum nvme_copy_dir dir; 2384 uint16_t status; 2385 2386 #ifndef __FreeBSD__ 2387 status = 0; 2388 #endif 2389 2390 if (is_write) 2391 dir = NVME_COPY_TO_PRP; 2392 else 2393 dir = NVME_COPY_FROM_PRP; 2394 2395 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2396 buf + offset, bytes, dir)) 2397 pci_nvme_status_genc(&status, 2398 NVME_SC_DATA_TRANSFER_ERROR); 2399 else 2400 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2401 2402 return (status); 2403 } 2404 2405 static uint16_t 2406 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2407 struct pci_nvme_blockstore *nvstore, 2408 struct pci_nvme_ioreq *req, 2409 uint64_t prp1, uint64_t prp2, 2410 size_t offset, uint64_t bytes, 2411 bool is_write) 2412 { 2413 uint64_t size; 2414 int err; 2415 uint16_t status = NVME_NO_STATUS; 2416 2417 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2418 if (pci_nvme_append_iov_req(sc, req, prp1, 2419 size, is_write, offset)) { 2420 pci_nvme_status_genc(&status, 2421 NVME_SC_DATA_TRANSFER_ERROR); 2422 goto out; 2423 } 2424 2425 offset += size; 2426 bytes -= size; 2427 2428 if (bytes == 0) { 2429 ; 2430 } else if (bytes <= PAGE_SIZE) { 2431 size = bytes; 2432 if (pci_nvme_append_iov_req(sc, req, prp2, 2433 size, is_write, offset)) { 2434 pci_nvme_status_genc(&status, 2435 NVME_SC_DATA_TRANSFER_ERROR); 2436 goto out; 2437 } 2438 } else { 2439 void *vmctx = sc->nsc_pi->pi_vmctx; 2440 uint64_t *prp_list = &prp2; 2441 uint64_t *last = prp_list; 2442 2443 /* PRP2 is pointer to a physical region page list */ 2444 while (bytes) { 2445 /* Last entry in list points to the next list */ 2446 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2447 uint64_t prp = *prp_list; 2448 2449 prp_list = paddr_guest2host(vmctx, prp, 2450 PAGE_SIZE - (prp % PAGE_SIZE)); 2451 last = prp_list + (NVME_PRP2_ITEMS - 1); 2452 } 2453 2454 size = MIN(bytes, PAGE_SIZE); 2455 2456 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2457 size, is_write, offset)) { 2458 pci_nvme_status_genc(&status, 2459 NVME_SC_DATA_TRANSFER_ERROR); 2460 goto out; 2461 } 2462 2463 offset += size; 2464 bytes -= size; 2465 2466 prp_list++; 2467 } 2468 } 2469 req->io_req.br_callback = pci_nvme_io_done; 2470 if (is_write) 2471 err = blockif_write(nvstore->ctx, &req->io_req); 2472 else 2473 err = blockif_read(nvstore->ctx, &req->io_req); 2474 2475 if (err) 2476 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2477 out: 2478 return (status); 2479 } 2480 2481 static bool 2482 nvme_opc_write_read(struct pci_nvme_softc *sc, 2483 struct nvme_command *cmd, 2484 struct pci_nvme_blockstore *nvstore, 2485 struct pci_nvme_ioreq *req, 2486 uint16_t *status) 2487 { 2488 uint64_t lba, nblocks, bytes; 2489 size_t offset; 2490 bool is_write = cmd->opc == NVME_OPC_WRITE; 2491 bool pending = false; 2492 2493 #ifndef __FreeBSD__ 2494 bytes = 0; 2495 #endif 2496 2497 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2498 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2499 2500 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2501 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2502 __func__, lba, nblocks); 2503 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2504 goto out; 2505 } 2506 2507 bytes = nblocks << nvstore->sectsz_bits; 2508 if (bytes > NVME_MAX_DATA_SIZE) { 2509 WPRINTF("%s command would exceed MDTS", __func__); 2510 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2511 goto out; 2512 } 2513 2514 offset = lba << nvstore->sectsz_bits; 2515 2516 req->bytes = bytes; 2517 req->io_req.br_offset = lba; 2518 2519 /* PRP bits 1:0 must be zero */ 2520 cmd->prp1 &= ~0x3UL; 2521 cmd->prp2 &= ~0x3UL; 2522 2523 if (nvstore->type == NVME_STOR_RAM) { 2524 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2525 cmd->prp2, offset, bytes, is_write); 2526 } else { 2527 *status = nvme_write_read_blockif(sc, nvstore, req, 2528 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2529 2530 if (*status == NVME_NO_STATUS) 2531 pending = true; 2532 } 2533 out: 2534 if (!pending) 2535 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2536 2537 return (pending); 2538 } 2539 2540 static void 2541 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2542 { 2543 struct pci_nvme_ioreq *req = br->br_param; 2544 struct pci_nvme_softc *sc = req->sc; 2545 bool done = true; 2546 uint16_t status; 2547 2548 #ifndef __FreeBSD__ 2549 status = 0; 2550 #endif 2551 2552 if (err) { 2553 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2554 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2555 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2556 } else { 2557 struct iovec *iov = req->io_req.br_iov; 2558 2559 req->prev_gpaddr++; 2560 iov += req->prev_gpaddr; 2561 2562 /* The iov_* values already include the sector size */ 2563 req->io_req.br_offset = (off_t)iov->iov_base; 2564 req->io_req.br_resid = iov->iov_len; 2565 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2566 pci_nvme_status_genc(&status, 2567 NVME_SC_INTERNAL_DEVICE_ERROR); 2568 } else 2569 done = false; 2570 } 2571 2572 if (done) { 2573 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2574 req->cid, 0, status); 2575 pci_nvme_release_ioreq(sc, req); 2576 } 2577 } 2578 2579 static bool 2580 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2581 struct nvme_command *cmd, 2582 struct pci_nvme_blockstore *nvstore, 2583 struct pci_nvme_ioreq *req, 2584 uint16_t *status) 2585 { 2586 struct nvme_dsm_range *range; 2587 uint32_t nr, r, non_zero, dr; 2588 int err; 2589 bool pending = false; 2590 2591 #ifndef __FreeBSD__ 2592 range = NULL; 2593 #endif 2594 2595 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2596 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2597 goto out; 2598 } 2599 2600 nr = cmd->cdw10 & 0xff; 2601 2602 /* copy locally because a range entry could straddle PRPs */ 2603 range = calloc(1, NVME_MAX_DSM_TRIM); 2604 if (range == NULL) { 2605 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2606 goto out; 2607 } 2608 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2609 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2610 2611 /* Check for invalid ranges and the number of non-zero lengths */ 2612 non_zero = 0; 2613 for (r = 0; r <= nr; r++) { 2614 if (pci_nvme_out_of_range(nvstore, 2615 range[r].starting_lba, range[r].length)) { 2616 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2617 goto out; 2618 } 2619 if (range[r].length != 0) 2620 non_zero++; 2621 } 2622 2623 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2624 size_t offset, bytes; 2625 int sectsz_bits = sc->nvstore.sectsz_bits; 2626 2627 /* 2628 * DSM calls are advisory only, and compliant controllers 2629 * may choose to take no actions (i.e. return Success). 2630 */ 2631 if (!nvstore->deallocate) { 2632 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2633 goto out; 2634 } 2635 2636 /* If all ranges have a zero length, return Success */ 2637 if (non_zero == 0) { 2638 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2639 goto out; 2640 } 2641 2642 if (req == NULL) { 2643 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2644 goto out; 2645 } 2646 2647 offset = range[0].starting_lba << sectsz_bits; 2648 bytes = range[0].length << sectsz_bits; 2649 2650 /* 2651 * If the request is for more than a single range, store 2652 * the ranges in the br_iov. Optimize for the common case 2653 * of a single range. 2654 * 2655 * Note that NVMe Number of Ranges is a zero based value 2656 */ 2657 req->io_req.br_iovcnt = 0; 2658 req->io_req.br_offset = offset; 2659 req->io_req.br_resid = bytes; 2660 2661 if (nr == 0) { 2662 req->io_req.br_callback = pci_nvme_io_done; 2663 } else { 2664 struct iovec *iov = req->io_req.br_iov; 2665 2666 for (r = 0, dr = 0; r <= nr; r++) { 2667 offset = range[r].starting_lba << sectsz_bits; 2668 bytes = range[r].length << sectsz_bits; 2669 if (bytes == 0) 2670 continue; 2671 2672 if ((nvstore->size - offset) < bytes) { 2673 pci_nvme_status_genc(status, 2674 NVME_SC_LBA_OUT_OF_RANGE); 2675 goto out; 2676 } 2677 iov[dr].iov_base = (void *)offset; 2678 iov[dr].iov_len = bytes; 2679 dr++; 2680 } 2681 req->io_req.br_callback = pci_nvme_dealloc_sm; 2682 2683 /* 2684 * Use prev_gpaddr to track the current entry and 2685 * prev_size to track the number of entries 2686 */ 2687 req->prev_gpaddr = 0; 2688 req->prev_size = dr; 2689 } 2690 2691 err = blockif_delete(nvstore->ctx, &req->io_req); 2692 if (err) 2693 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2694 else 2695 pending = true; 2696 } 2697 out: 2698 free(range); 2699 return (pending); 2700 } 2701 2702 static void 2703 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2704 { 2705 struct nvme_submission_queue *sq; 2706 uint16_t status; 2707 uint16_t sqhead; 2708 2709 #ifndef __FreeBSD__ 2710 status = 0; 2711 #endif 2712 2713 /* handle all submissions up to sq->tail index */ 2714 sq = &sc->submit_queues[idx]; 2715 2716 pthread_mutex_lock(&sq->mtx); 2717 2718 sqhead = sq->head; 2719 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2720 idx, sqhead, sq->tail, sq->qbase); 2721 2722 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2723 struct nvme_command *cmd; 2724 struct pci_nvme_ioreq *req; 2725 uint32_t nsid; 2726 bool pending; 2727 2728 pending = false; 2729 req = NULL; 2730 status = 0; 2731 2732 cmd = &sq->qbase[sqhead]; 2733 sqhead = (sqhead + 1) % sq->size; 2734 2735 nsid = le32toh(cmd->nsid); 2736 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2737 pci_nvme_status_genc(&status, 2738 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2739 status |= 2740 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2741 goto complete; 2742 } 2743 2744 req = pci_nvme_get_ioreq(sc); 2745 if (req == NULL) { 2746 pci_nvme_status_genc(&status, 2747 NVME_SC_INTERNAL_DEVICE_ERROR); 2748 WPRINTF("%s: unable to allocate IO req", __func__); 2749 goto complete; 2750 } 2751 req->nvme_sq = sq; 2752 req->sqid = idx; 2753 req->opc = cmd->opc; 2754 req->cid = cmd->cid; 2755 req->nsid = cmd->nsid; 2756 2757 switch (cmd->opc) { 2758 case NVME_OPC_FLUSH: 2759 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2760 req, &status); 2761 break; 2762 case NVME_OPC_WRITE: 2763 case NVME_OPC_READ: 2764 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2765 req, &status); 2766 break; 2767 case NVME_OPC_WRITE_ZEROES: 2768 /* TODO: write zeroes 2769 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2770 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2771 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2772 break; 2773 case NVME_OPC_DATASET_MANAGEMENT: 2774 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2775 req, &status); 2776 break; 2777 default: 2778 WPRINTF("%s unhandled io command 0x%x", 2779 __func__, cmd->opc); 2780 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2781 } 2782 complete: 2783 if (!pending) { 2784 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2785 status); 2786 if (req != NULL) 2787 pci_nvme_release_ioreq(sc, req); 2788 } 2789 } 2790 2791 sq->head = sqhead; 2792 2793 pthread_mutex_unlock(&sq->mtx); 2794 } 2795 2796 static void 2797 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2798 uint64_t idx, int is_sq, uint64_t value) 2799 { 2800 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2801 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2802 2803 if (is_sq) { 2804 if (idx > sc->num_squeues) { 2805 WPRINTF("%s queue index %lu overflow from " 2806 "guest (max %u)", 2807 __func__, idx, sc->num_squeues); 2808 return; 2809 } 2810 2811 atomic_store_short(&sc->submit_queues[idx].tail, 2812 (uint16_t)value); 2813 2814 if (idx == 0) { 2815 pci_nvme_handle_admin_cmd(sc, value); 2816 } else { 2817 /* submission queue; handle new entries in SQ */ 2818 if (idx > sc->num_squeues) { 2819 WPRINTF("%s SQ index %lu overflow from " 2820 "guest (max %u)", 2821 __func__, idx, sc->num_squeues); 2822 return; 2823 } 2824 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2825 } 2826 } else { 2827 if (idx > sc->num_cqueues) { 2828 WPRINTF("%s queue index %lu overflow from " 2829 "guest (max %u)", 2830 __func__, idx, sc->num_cqueues); 2831 return; 2832 } 2833 2834 atomic_store_short(&sc->compl_queues[idx].head, 2835 (uint16_t)value); 2836 } 2837 } 2838 2839 static void 2840 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2841 { 2842 const char *s = iswrite ? "WRITE" : "READ"; 2843 2844 switch (offset) { 2845 case NVME_CR_CAP_LOW: 2846 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2847 break; 2848 case NVME_CR_CAP_HI: 2849 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2850 break; 2851 case NVME_CR_VS: 2852 DPRINTF("%s %s NVME_CR_VS", func, s); 2853 break; 2854 case NVME_CR_INTMS: 2855 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2856 break; 2857 case NVME_CR_INTMC: 2858 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2859 break; 2860 case NVME_CR_CC: 2861 DPRINTF("%s %s NVME_CR_CC", func, s); 2862 break; 2863 case NVME_CR_CSTS: 2864 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2865 break; 2866 case NVME_CR_NSSR: 2867 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2868 break; 2869 case NVME_CR_AQA: 2870 DPRINTF("%s %s NVME_CR_AQA", func, s); 2871 break; 2872 case NVME_CR_ASQ_LOW: 2873 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2874 break; 2875 case NVME_CR_ASQ_HI: 2876 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2877 break; 2878 case NVME_CR_ACQ_LOW: 2879 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2880 break; 2881 case NVME_CR_ACQ_HI: 2882 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2883 break; 2884 default: 2885 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2886 } 2887 2888 } 2889 2890 static void 2891 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2892 uint64_t offset, int size, uint64_t value) 2893 { 2894 uint32_t ccreg; 2895 2896 if (offset >= NVME_DOORBELL_OFFSET) { 2897 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2898 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2899 int is_sq = (belloffset % 8) < 4; 2900 2901 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2902 WPRINTF("guest attempted an overflow write offset " 2903 "0x%lx, val 0x%lx in %s", 2904 offset, value, __func__); 2905 return; 2906 } 2907 2908 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2909 return; 2910 } 2911 2912 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2913 offset, size, value); 2914 2915 if (size != 4) { 2916 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2917 "val 0x%lx) to bar0 in %s", 2918 size, offset, value, __func__); 2919 /* TODO: shutdown device */ 2920 return; 2921 } 2922 2923 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2924 2925 pthread_mutex_lock(&sc->mtx); 2926 2927 switch (offset) { 2928 case NVME_CR_CAP_LOW: 2929 case NVME_CR_CAP_HI: 2930 /* readonly */ 2931 break; 2932 case NVME_CR_VS: 2933 /* readonly */ 2934 break; 2935 case NVME_CR_INTMS: 2936 /* MSI-X, so ignore */ 2937 break; 2938 case NVME_CR_INTMC: 2939 /* MSI-X, so ignore */ 2940 break; 2941 case NVME_CR_CC: 2942 ccreg = (uint32_t)value; 2943 2944 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2945 "iocqes %u", 2946 __func__, 2947 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2948 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2949 NVME_CC_GET_IOCQES(ccreg)); 2950 2951 if (NVME_CC_GET_SHN(ccreg)) { 2952 /* perform shutdown - flush out data to backend */ 2953 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2954 NVME_CSTS_REG_SHST_SHIFT); 2955 sc->regs.csts |= NVME_SHST_COMPLETE << 2956 NVME_CSTS_REG_SHST_SHIFT; 2957 } 2958 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2959 if (NVME_CC_GET_EN(ccreg) == 0) 2960 /* transition 1-> causes controller reset */ 2961 pci_nvme_reset_locked(sc); 2962 else 2963 pci_nvme_init_controller(ctx, sc); 2964 } 2965 2966 /* Insert the iocqes, iosqes and en bits from the write */ 2967 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2968 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2969 if (NVME_CC_GET_EN(ccreg) == 0) { 2970 /* Insert the ams, mps and css bit fields */ 2971 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2972 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2973 sc->regs.csts &= ~NVME_CSTS_RDY; 2974 } else if (sc->pending_ios == 0) { 2975 sc->regs.csts |= NVME_CSTS_RDY; 2976 } 2977 break; 2978 case NVME_CR_CSTS: 2979 break; 2980 case NVME_CR_NSSR: 2981 /* ignore writes; don't support subsystem reset */ 2982 break; 2983 case NVME_CR_AQA: 2984 sc->regs.aqa = (uint32_t)value; 2985 break; 2986 case NVME_CR_ASQ_LOW: 2987 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2988 (0xFFFFF000 & value); 2989 break; 2990 case NVME_CR_ASQ_HI: 2991 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2992 (value << 32); 2993 break; 2994 case NVME_CR_ACQ_LOW: 2995 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2996 (0xFFFFF000 & value); 2997 break; 2998 case NVME_CR_ACQ_HI: 2999 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3000 (value << 32); 3001 break; 3002 default: 3003 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3004 __func__, offset, value, size); 3005 } 3006 pthread_mutex_unlock(&sc->mtx); 3007 } 3008 3009 static void 3010 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 3011 int baridx, uint64_t offset, int size, uint64_t value) 3012 { 3013 struct pci_nvme_softc* sc = pi->pi_arg; 3014 3015 if (baridx == pci_msix_table_bar(pi) || 3016 baridx == pci_msix_pba_bar(pi)) { 3017 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3018 " value 0x%lx", baridx, offset, size, value); 3019 3020 pci_emul_msix_twrite(pi, offset, size, value); 3021 return; 3022 } 3023 3024 switch (baridx) { 3025 case 0: 3026 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 3027 break; 3028 3029 default: 3030 DPRINTF("%s unknown baridx %d, val 0x%lx", 3031 __func__, baridx, value); 3032 } 3033 } 3034 3035 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3036 uint64_t offset, int size) 3037 { 3038 uint64_t value; 3039 3040 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3041 3042 if (offset < NVME_DOORBELL_OFFSET) { 3043 void *p = &(sc->regs); 3044 pthread_mutex_lock(&sc->mtx); 3045 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3046 pthread_mutex_unlock(&sc->mtx); 3047 } else { 3048 value = 0; 3049 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3050 } 3051 3052 switch (size) { 3053 case 1: 3054 value &= 0xFF; 3055 break; 3056 case 2: 3057 value &= 0xFFFF; 3058 break; 3059 case 4: 3060 value &= 0xFFFFFFFF; 3061 break; 3062 } 3063 3064 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3065 offset, size, (uint32_t)value); 3066 3067 return (value); 3068 } 3069 3070 3071 3072 static uint64_t 3073 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 3074 uint64_t offset, int size) 3075 { 3076 struct pci_nvme_softc* sc = pi->pi_arg; 3077 3078 if (baridx == pci_msix_table_bar(pi) || 3079 baridx == pci_msix_pba_bar(pi)) { 3080 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3081 baridx, offset, size); 3082 3083 return pci_emul_msix_tread(pi, offset, size); 3084 } 3085 3086 switch (baridx) { 3087 case 0: 3088 return pci_nvme_read_bar_0(sc, offset, size); 3089 3090 default: 3091 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3092 } 3093 3094 return (0); 3095 } 3096 3097 static int 3098 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3099 { 3100 char bident[sizeof("XX:X:X")]; 3101 const char *value; 3102 uint32_t sectsz; 3103 3104 sc->max_queues = NVME_QUEUES; 3105 sc->max_qentries = NVME_MAX_QENTRIES; 3106 sc->ioslots = NVME_IOSLOTS; 3107 sc->num_squeues = sc->max_queues; 3108 sc->num_cqueues = sc->max_queues; 3109 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3110 sectsz = 0; 3111 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3112 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3113 3114 value = get_config_value_node(nvl, "maxq"); 3115 if (value != NULL) 3116 sc->max_queues = atoi(value); 3117 value = get_config_value_node(nvl, "qsz"); 3118 if (value != NULL) { 3119 sc->max_qentries = atoi(value); 3120 if (sc->max_qentries <= 0) { 3121 EPRINTLN("nvme: Invalid qsz option %d", 3122 sc->max_qentries); 3123 return (-1); 3124 } 3125 } 3126 value = get_config_value_node(nvl, "ioslots"); 3127 if (value != NULL) { 3128 sc->ioslots = atoi(value); 3129 if (sc->ioslots <= 0) { 3130 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3131 return (-1); 3132 } 3133 } 3134 value = get_config_value_node(nvl, "sectsz"); 3135 if (value != NULL) 3136 sectsz = atoi(value); 3137 value = get_config_value_node(nvl, "ser"); 3138 if (value != NULL) { 3139 /* 3140 * This field indicates the Product Serial Number in 3141 * 7-bit ASCII, unused bytes should be space characters. 3142 * Ref: NVMe v1.3c. 3143 */ 3144 cpywithpad((char *)sc->ctrldata.sn, 3145 sizeof(sc->ctrldata.sn), value, ' '); 3146 } 3147 value = get_config_value_node(nvl, "eui64"); 3148 if (value != NULL) 3149 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3150 value = get_config_value_node(nvl, "dsm"); 3151 if (value != NULL) { 3152 if (strcmp(value, "auto") == 0) 3153 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3154 else if (strcmp(value, "enable") == 0) 3155 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3156 else if (strcmp(value, "disable") == 0) 3157 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3158 } 3159 3160 value = get_config_value_node(nvl, "ram"); 3161 if (value != NULL) { 3162 uint64_t sz = strtoull(value, NULL, 10); 3163 3164 sc->nvstore.type = NVME_STOR_RAM; 3165 sc->nvstore.size = sz * 1024 * 1024; 3166 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3167 sc->nvstore.sectsz = 4096; 3168 sc->nvstore.sectsz_bits = 12; 3169 if (sc->nvstore.ctx == NULL) { 3170 EPRINTLN("nvme: Unable to allocate RAM"); 3171 return (-1); 3172 } 3173 } else { 3174 snprintf(bident, sizeof(bident), "%d:%d", 3175 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3176 sc->nvstore.ctx = blockif_open(nvl, bident); 3177 if (sc->nvstore.ctx == NULL) { 3178 EPRINTLN("nvme: Could not open backing file: %s", 3179 strerror(errno)); 3180 return (-1); 3181 } 3182 sc->nvstore.type = NVME_STOR_BLOCKIF; 3183 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3184 } 3185 3186 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3187 sc->nvstore.sectsz = sectsz; 3188 else if (sc->nvstore.type != NVME_STOR_RAM) 3189 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3190 for (sc->nvstore.sectsz_bits = 9; 3191 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3192 sc->nvstore.sectsz_bits++); 3193 3194 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3195 sc->max_queues = NVME_QUEUES; 3196 3197 return (0); 3198 } 3199 3200 static void 3201 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 3202 { 3203 struct pci_nvme_softc *sc; 3204 struct pci_nvme_blockstore *nvstore; 3205 struct nvme_namespace_data *nd; 3206 3207 sc = arg; 3208 nvstore = &sc->nvstore; 3209 nd = &sc->nsdata; 3210 3211 nvstore->size = new_size; 3212 pci_nvme_init_nsdata_size(nvstore, nd); 3213 3214 /* Add changed NSID to list */ 3215 sc->ns_log.ns[0] = 1; 3216 sc->ns_log.ns[1] = 0; 3217 3218 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3219 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3220 } 3221 3222 static int 3223 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3224 { 3225 struct pci_nvme_softc *sc; 3226 uint32_t pci_membar_sz; 3227 int error; 3228 3229 error = 0; 3230 3231 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3232 pi->pi_arg = sc; 3233 sc->nsc_pi = pi; 3234 3235 error = pci_nvme_parse_config(sc, nvl); 3236 if (error < 0) 3237 goto done; 3238 else 3239 error = 0; 3240 3241 STAILQ_INIT(&sc->ioreqs_free); 3242 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3243 for (int i = 0; i < sc->ioslots; i++) { 3244 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3245 } 3246 3247 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3248 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3249 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3250 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3251 pci_set_cfgdata8(pi, PCIR_PROGIF, 3252 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3253 3254 /* 3255 * Allocate size of NVMe registers + doorbell space for all queues. 3256 * 3257 * The specification requires a minimum memory I/O window size of 16K. 3258 * The Windows driver will refuse to start a device with a smaller 3259 * window. 3260 */ 3261 pci_membar_sz = sizeof(struct nvme_registers) + 3262 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3263 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3264 3265 DPRINTF("nvme membar size: %u", pci_membar_sz); 3266 3267 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3268 if (error) { 3269 WPRINTF("%s pci alloc mem bar failed", __func__); 3270 goto done; 3271 } 3272 3273 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3274 if (error) { 3275 WPRINTF("%s pci add msixcap failed", __func__); 3276 goto done; 3277 } 3278 3279 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3280 if (error) { 3281 WPRINTF("%s pci add Express capability failed", __func__); 3282 goto done; 3283 } 3284 3285 pthread_mutex_init(&sc->mtx, NULL); 3286 sem_init(&sc->iosemlock, 0, sc->ioslots); 3287 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3288 3289 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3290 /* 3291 * Controller data depends on Namespace data so initialize Namespace 3292 * data first. 3293 */ 3294 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3295 pci_nvme_init_ctrldata(sc); 3296 pci_nvme_init_logpages(sc); 3297 pci_nvme_init_features(sc); 3298 3299 pci_nvme_aer_init(sc); 3300 pci_nvme_aen_init(sc); 3301 3302 pci_nvme_reset(sc); 3303 3304 pci_lintr_request(pi); 3305 3306 done: 3307 return (error); 3308 } 3309 3310 static int 3311 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3312 { 3313 char *cp, *ram; 3314 3315 if (opts == NULL) 3316 return (0); 3317 3318 if (strncmp(opts, "ram=", 4) == 0) { 3319 cp = strchr(opts, ','); 3320 if (cp == NULL) { 3321 set_config_value_node(nvl, "ram", opts + 4); 3322 return (0); 3323 } 3324 ram = strndup(opts + 4, cp - opts - 4); 3325 set_config_value_node(nvl, "ram", ram); 3326 free(ram); 3327 return (pci_parse_legacy_config(nvl, cp + 1)); 3328 } else 3329 return (blockif_legacy_config(nvl, opts)); 3330 } 3331 3332 struct pci_devemu pci_de_nvme = { 3333 .pe_emu = "nvme", 3334 .pe_init = pci_nvme_init, 3335 .pe_legacy_config = pci_nvme_legacy_config, 3336 .pe_barwrite = pci_nvme_write, 3337 .pe_barread = pci_nvme_read 3338 }; 3339 PCI_EMUL_SET(pci_de_nvme); 3340