1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <semaphore.h> 70 #include <stdbool.h> 71 #include <stddef.h> 72 #include <stdint.h> 73 #include <stdio.h> 74 #include <stdlib.h> 75 #include <string.h> 76 77 #include <machine/atomic.h> 78 #include <machine/vmm.h> 79 #include <vmmapi.h> 80 81 #include <dev/nvme/nvme.h> 82 83 #include "bhyverun.h" 84 #include "block_if.h" 85 #include "config.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 89 90 static int nvme_debug = 0; 91 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 92 #define WPRINTF(fmt, args...) 
PRINTLN(fmt, ##args) 93 94 /* defaults; can be overridden */ 95 #define NVME_MSIX_BAR 4 96 97 #define NVME_IOSLOTS 8 98 99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 100 #define NVME_MMIO_SPACE_MIN (1 << 14) 101 102 #define NVME_QUEUES 16 103 #define NVME_MAX_QENTRIES 2048 104 /* Memory Page size Minimum reported in CAP register */ 105 #define NVME_MPSMIN 0 106 /* MPSMIN converted to bytes */ 107 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 108 109 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 110 #define NVME_MDTS 9 111 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 112 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 113 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 114 115 /* This is a synthetic status code to indicate there is no status */ 116 #define NVME_NO_STATUS 0xffff 117 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 118 119 /* Reported temperature in Kelvin (i.e. room temperature) */ 120 #define NVME_TEMPERATURE 296 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? 
\ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 typedef enum { 273 PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0, 274 PCI_NVME_AE_INFO_FW_ACTIVATION, 275 PCI_NVME_AE_INFO_TELEMETRY_CHANGE, 276 PCI_NVME_AE_INFO_ANA_CHANGE, 277 PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE, 278 PCI_NVME_AE_INFO_LBA_STATUS_ALERT, 279 PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE, 280 PCI_NVME_AE_INFO_MAX, 281 } pci_nvme_async_info; 282 283 /* Asynchronous Event Notifications */ 284 struct pci_nvme_aen { 285 pci_nvme_async_type atype; 286 uint32_t event_data; 287 bool posted; 288 }; 289 290 /* 291 * By default, enable all Asynchrnous Event Notifications: 292 * SMART / Health Critical Warnings 293 * Namespace Attribute Notices 294 */ 295 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f 296 297 typedef enum { 298 NVME_CNTRLTYPE_IO = 1, 299 NVME_CNTRLTYPE_DISCOVERY = 2, 300 NVME_CNTRLTYPE_ADMIN = 3, 301 } pci_nvme_cntrl_type; 302 303 struct pci_nvme_softc { 304 struct pci_devinst *nsc_pi; 305 306 pthread_mutex_t mtx; 307 308 struct nvme_registers regs; 309 310 struct nvme_namespace_data nsdata; 311 struct nvme_controller_data ctrldata; 312 struct nvme_error_information_entry err_log; 313 struct nvme_health_information_page health_log; 314 struct nvme_firmware_page fw_log; 315 struct nvme_ns_list ns_log; 316 317 struct pci_nvme_blockstore nvstore; 318 319 uint16_t max_qentries; /* max entries per queue */ 320 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 321 uint32_t num_cqueues; 322 uint32_t num_squeues; 323 bool num_q_is_set; /* Has host set Number of Queues */ 324 325 struct pci_nvme_ioreq *ioreqs; 326 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 327 uint32_t pending_ios; 328 uint32_t ioslots; 329 sem_t iosemlock; 330 331 /* 332 * Memory mapped Submission and Completion queues 333 * Each array includes both Admin and IO queues 334 */ 335 struct nvme_completion_queue *compl_queues; 336 struct nvme_submission_queue 
*submit_queues; 337 338 struct nvme_feature_obj feat[NVME_FID_MAX]; 339 340 enum nvme_dsm_type dataset_management; 341 342 /* Accounting for SMART data */ 343 __uint128_t read_data_units; 344 __uint128_t write_data_units; 345 __uint128_t read_commands; 346 __uint128_t write_commands; 347 uint32_t read_dunits_remainder; 348 uint32_t write_dunits_remainder; 349 350 STAILQ_HEAD(, pci_nvme_aer) aer_list; 351 pthread_mutex_t aer_mtx; 352 uint32_t aer_count; 353 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 354 pthread_t aen_tid; 355 pthread_mutex_t aen_mtx; 356 pthread_cond_t aen_cond; 357 }; 358 359 360 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 361 struct nvme_completion_queue *cq, 362 uint32_t cdw0, 363 uint16_t cid, 364 uint16_t sqid, 365 uint16_t status); 366 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 367 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 368 static void pci_nvme_io_done(struct blockif_req *, int); 369 370 /* Controller Configuration utils */ 371 #define NVME_CC_GET_EN(cc) \ 372 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 373 #define NVME_CC_GET_CSS(cc) \ 374 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 375 #define NVME_CC_GET_SHN(cc) \ 376 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 377 #define NVME_CC_GET_IOSQES(cc) \ 378 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 379 #define NVME_CC_GET_IOCQES(cc) \ 380 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 381 382 #define NVME_CC_WRITE_MASK \ 383 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 384 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 385 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 386 387 #define NVME_CC_NEN_WRITE_MASK \ 388 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 389 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 390 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 391 392 /* Controller Status utils */ 393 #define NVME_CSTS_GET_RDY(sts) \ 394 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 395 396 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 397 398 /* Completion Queue status word utils */ 399 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 400 #define NVME_STATUS_MASK \ 401 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 402 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 403 404 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 405 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 406 407 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 408 struct nvme_feature_obj *, 409 struct nvme_command *, 410 struct nvme_completion *); 411 static void nvme_feature_temperature(struct pci_nvme_softc *, 412 struct nvme_feature_obj *, 413 struct nvme_command *, 414 struct nvme_completion *); 415 static void nvme_feature_num_queues(struct pci_nvme_softc *, 416 struct nvme_feature_obj *, 417 struct nvme_command *, 418 struct nvme_completion *); 419 static void nvme_feature_iv_config(struct pci_nvme_softc *, 420 struct nvme_feature_obj *, 421 struct nvme_command *, 422 struct nvme_completion *); 423 static void nvme_feature_async_event(struct pci_nvme_softc *, 424 struct nvme_feature_obj *, 425 struct nvme_command *, 426 struct nvme_completion *); 427 428 static void *aen_thr(void *arg); 429 430 static __inline void 431 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 432 { 433 size_t len; 434 435 len = strnlen(src, dst_size); 436 memset(dst, pad, dst_size); 437 memcpy(dst, src, len); 438 } 439 440 static 
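/*
 * A brief note on the two helpers that follow (a sketch, assuming the
 * standard NVME_STATUS_* shift/mask definitions from <dev/nvme/nvme.h>):
 * they compose the 16-bit Completion Queue Entry Status field in place,
 * placing the Status Code (SC) in bits 8:1 and the Status Code Type (SCT)
 * in bits 11:9 while leaving the Phase Tag (bit 0) for pci_nvme_cq_update()
 * to flip. For example, pci_nvme_status_tc(&status, NVME_SCT_COMMAND_SPECIFIC,
 * NVME_SC_INVALID_QUEUE_IDENTIFIER) reports a command-specific error, and
 * pci_nvme_status_genc(&status, NVME_SC_SUCCESS) reports generic success.
 */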
__inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
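	/*
	 * As an illustration of the dsm= option handling (assuming the stock
	 * NVME_CTRLR_DATA_ONCS_DSM_* definitions): NVME_ONCS_DSM sets the
	 * Dataset Management bit in ONCS. With dsm=auto the bit is only
	 * advertised when the backing store reports that it can deallocate
	 * blocks (sc->nvstore.deallocate); dsm=enable below always advertises
	 * it, and dsm=disable leaves ONCS clear.
	 */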
565 case NVME_DATASET_MANAGEMENT_ENABLE: 566 cd->oncs |= NVME_ONCS_DSM; 567 break; 568 default: 569 break; 570 } 571 572 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << 573 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; 574 575 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; 576 577 cd->power_state[0].mp = 10; 578 } 579 580 /* 581 * Calculate the CRC-16 of the given buffer 582 * See copyright attribution at top of file 583 */ 584 static uint16_t 585 crc16(uint16_t crc, const void *buffer, unsigned int len) 586 { 587 const unsigned char *cp = buffer; 588 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 589 static uint16_t const crc16_table[256] = { 590 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 591 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 592 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 593 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 594 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 595 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 596 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 597 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 598 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 599 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 600 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 601 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 602 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 603 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 604 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 605 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 606 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 607 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 608 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 609 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 610 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 611 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 612 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 613 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 614 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 615 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 616 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 617 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 618 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 619 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 620 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 621 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 622 }; 623 624 while (len--) 625 crc = (((crc >> 8) & 0xffU) ^ 626 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 627 return crc; 628 } 629 630 static void 631 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 632 struct nvme_namespace_data *nd) 633 { 634 635 /* Get capacity and block size information from backing store */ 636 nd->nsze = nvstore->size / nvstore->sectsz; 637 nd->ncap = nd->nsze; 638 nd->nuse = nd->nsze; 639 } 640 641 static void 642 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 643 struct nvme_namespace_data *nd, uint32_t nsid, 644 struct pci_nvme_blockstore *nvstore) 645 { 646 647 pci_nvme_init_nsdata_size(nvstore, nd); 648 649 if 
(nvstore->type == NVME_STOR_BLOCKIF) 650 nvstore->deallocate = blockif_candelete(nvstore->ctx); 651 652 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 653 nd->flbas = 0; 654 655 /* Create an EUI-64 if user did not provide one */ 656 if (nvstore->eui64 == 0) { 657 char *data = NULL; 658 uint64_t eui64 = nvstore->eui64; 659 660 asprintf(&data, "%s%u%u%u", get_config_value("name"), 661 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 662 sc->nsc_pi->pi_func); 663 664 if (data != NULL) { 665 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 666 free(data); 667 } 668 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 669 } 670 be64enc(nd->eui64, nvstore->eui64); 671 672 /* LBA data-sz = 2^lbads */ 673 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 674 } 675 676 static void 677 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 678 { 679 680 memset(&sc->err_log, 0, sizeof(sc->err_log)); 681 memset(&sc->health_log, 0, sizeof(sc->health_log)); 682 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 683 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 684 685 /* Set read/write remainder to round up according to spec */ 686 sc->read_dunits_remainder = 999; 687 sc->write_dunits_remainder = 999; 688 689 /* Set nominal Health values checked by implementations */ 690 sc->health_log.temperature = NVME_TEMPERATURE; 691 sc->health_log.available_spare = 100; 692 sc->health_log.available_spare_threshold = 10; 693 } 694 695 static void 696 pci_nvme_init_features(struct pci_nvme_softc *sc) 697 { 698 enum nvme_feature fid; 699 700 for (fid = 0; fid < NVME_FID_MAX; fid++) { 701 switch (fid) { 702 case NVME_FEAT_ARBITRATION: 703 case NVME_FEAT_POWER_MANAGEMENT: 704 case NVME_FEAT_INTERRUPT_COALESCING: //XXX 705 case NVME_FEAT_WRITE_ATOMICITY: 706 /* Mandatory but no special handling required */ 707 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 708 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 709 // this returns a data buffer 710 break; 711 case NVME_FEAT_TEMPERATURE_THRESHOLD: 712 sc->feat[fid].set = nvme_feature_temperature; 713 break; 714 case NVME_FEAT_ERROR_RECOVERY: 715 sc->feat[fid].namespace_specific = true; 716 break; 717 case NVME_FEAT_NUMBER_OF_QUEUES: 718 sc->feat[fid].set = nvme_feature_num_queues; 719 break; 720 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 721 sc->feat[fid].set = nvme_feature_iv_config; 722 break; 723 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 724 sc->feat[fid].set = nvme_feature_async_event; 725 /* Enable all AENs by default */ 726 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; 727 break; 728 default: 729 sc->feat[fid].set = nvme_feature_invalid_cb; 730 sc->feat[fid].get = nvme_feature_invalid_cb; 731 } 732 } 733 } 734 735 static void 736 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 737 { 738 739 STAILQ_INIT(&sc->aer_list); 740 sc->aer_count = 0; 741 } 742 743 static void 744 pci_nvme_aer_init(struct pci_nvme_softc *sc) 745 { 746 747 pthread_mutex_init(&sc->aer_mtx, NULL); 748 pci_nvme_aer_reset(sc); 749 } 750 751 static void 752 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 753 { 754 struct pci_nvme_aer *aer = NULL; 755 756 pthread_mutex_lock(&sc->aer_mtx); 757 while (!STAILQ_EMPTY(&sc->aer_list)) { 758 aer = STAILQ_FIRST(&sc->aer_list); 759 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 760 free(aer); 761 } 762 pthread_mutex_unlock(&sc->aer_mtx); 763 764 pci_nvme_aer_reset(sc); 765 } 766 767 static bool 768 pci_nvme_aer_available(struct pci_nvme_softc *sc) 769 { 770 771 return (sc->aer_count != 0); 772 } 773 774 static bool 775 
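/*
 * AERL in the Identify Controller data is a zero-based value, so the aerl of
 * 4 set in pci_nvme_init_ctrldata() allows the host to keep up to five
 * Asynchronous Event Requests outstanding; this helper reports whether
 * aer_count has reached that limit (aerl + 1).
 */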
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 776 { 777 struct nvme_controller_data *cd = &sc->ctrldata; 778 779 /* AERL is a zero based value while aer_count is one's based */ 780 return (sc->aer_count == (cd->aerl + 1)); 781 } 782 783 /* 784 * Add an Async Event Request 785 * 786 * Stores an AER to be returned later if the Controller needs to notify the 787 * host of an event. 788 * Note that while the NVMe spec doesn't require Controllers to return AER's 789 * in order, this implementation does preserve the order. 790 */ 791 static int 792 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 793 { 794 struct pci_nvme_aer *aer = NULL; 795 796 aer = calloc(1, sizeof(struct pci_nvme_aer)); 797 if (aer == NULL) 798 return (-1); 799 800 /* Save the Command ID for use in the completion message */ 801 aer->cid = cid; 802 803 pthread_mutex_lock(&sc->aer_mtx); 804 sc->aer_count++; 805 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 806 pthread_mutex_unlock(&sc->aer_mtx); 807 808 return (0); 809 } 810 811 /* 812 * Get an Async Event Request structure 813 * 814 * Returns a pointer to an AER previously submitted by the host or NULL if 815 * no AER's exist. Caller is responsible for freeing the returned struct. 816 */ 817 static struct pci_nvme_aer * 818 pci_nvme_aer_get(struct pci_nvme_softc *sc) 819 { 820 struct pci_nvme_aer *aer = NULL; 821 822 pthread_mutex_lock(&sc->aer_mtx); 823 aer = STAILQ_FIRST(&sc->aer_list); 824 if (aer != NULL) { 825 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 826 sc->aer_count--; 827 } 828 pthread_mutex_unlock(&sc->aer_mtx); 829 830 return (aer); 831 } 832 833 static void 834 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 835 { 836 uint32_t atype; 837 838 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 839 840 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 841 sc->aen[atype].atype = atype; 842 } 843 } 844 845 static void 846 pci_nvme_aen_init(struct pci_nvme_softc *sc) 847 { 848 char nstr[80]; 849 850 pci_nvme_aen_reset(sc); 851 852 pthread_mutex_init(&sc->aen_mtx, NULL); 853 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 854 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 855 sc->nsc_pi->pi_func); 856 pthread_set_name_np(sc->aen_tid, nstr); 857 } 858 859 static void 860 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 861 { 862 863 pci_nvme_aen_reset(sc); 864 } 865 866 /* Notify the AEN thread of pending work */ 867 static void 868 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 869 { 870 871 pthread_cond_signal(&sc->aen_cond); 872 } 873 874 /* 875 * Post an Asynchronous Event Notification 876 */ 877 static int32_t 878 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 879 uint32_t event_data) 880 { 881 struct pci_nvme_aen *aen; 882 883 if (atype >= PCI_NVME_AE_TYPE_MAX) { 884 return(EINVAL); 885 } 886 887 pthread_mutex_lock(&sc->aen_mtx); 888 aen = &sc->aen[atype]; 889 890 /* Has the controller already posted an event of this type? 
*/ 891 if (aen->posted) { 892 pthread_mutex_unlock(&sc->aen_mtx); 893 return(EALREADY); 894 } 895 896 aen->event_data = event_data; 897 aen->posted = true; 898 pthread_mutex_unlock(&sc->aen_mtx); 899 900 pci_nvme_aen_notify(sc); 901 902 return(0); 903 } 904 905 static void 906 pci_nvme_aen_process(struct pci_nvme_softc *sc) 907 { 908 struct pci_nvme_aer *aer; 909 struct pci_nvme_aen *aen; 910 pci_nvme_async_type atype; 911 uint32_t mask; 912 uint16_t status; 913 uint8_t lid; 914 915 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 916 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 917 aen = &sc->aen[atype]; 918 /* Previous iterations may have depleted the available AER's */ 919 if (!pci_nvme_aer_available(sc)) { 920 DPRINTF("%s: no AER", __func__); 921 break; 922 } 923 924 if (!aen->posted) { 925 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 926 continue; 927 } 928 929 status = NVME_SC_SUCCESS; 930 931 /* Is the event masked? */ 932 mask = 933 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 934 935 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 936 switch (atype) { 937 case PCI_NVME_AE_TYPE_ERROR: 938 lid = NVME_LOG_ERROR; 939 break; 940 case PCI_NVME_AE_TYPE_SMART: 941 mask &= 0xff; 942 if ((mask & aen->event_data) == 0) 943 continue; 944 lid = NVME_LOG_HEALTH_INFORMATION; 945 break; 946 case PCI_NVME_AE_TYPE_NOTICE: 947 if (aen->event_data >= PCI_NVME_AE_INFO_MAX) { 948 EPRINTLN("%s unknown AEN notice type %u", 949 __func__, aen->event_data); 950 status = NVME_SC_INTERNAL_DEVICE_ERROR; 951 break; 952 } 953 mask >>= 8; 954 if (((1 << aen->event_data) & mask) == 0) 955 continue; 956 switch (aen->event_data) { 957 case PCI_NVME_AE_INFO_NS_ATTR_CHANGED: 958 lid = NVME_LOG_CHANGED_NAMESPACE; 959 break; 960 case PCI_NVME_AE_INFO_FW_ACTIVATION: 961 lid = NVME_LOG_FIRMWARE_SLOT; 962 break; 963 case PCI_NVME_AE_INFO_TELEMETRY_CHANGE: 964 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 965 break; 966 case PCI_NVME_AE_INFO_ANA_CHANGE: 967 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; 968 break; 969 case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE: 970 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 971 break; 972 case PCI_NVME_AE_INFO_LBA_STATUS_ALERT: 973 lid = NVME_LOG_LBA_STATUS_INFORMATION; 974 break; 975 case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE: 976 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 977 break; 978 default: 979 lid = 0; 980 } 981 break; 982 default: 983 /* bad type?!? 
*/ 984 EPRINTLN("%s unknown AEN type %u", __func__, atype); 985 status = NVME_SC_INTERNAL_DEVICE_ERROR; 986 break; 987 } 988 989 aer = pci_nvme_aer_get(sc); 990 assert(aer != NULL); 991 992 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 993 pci_nvme_cq_update(sc, &sc->compl_queues[0], 994 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 995 aer->cid, 996 0, /* SQID */ 997 status); 998 999 aen->event_data = 0; 1000 aen->posted = false; 1001 1002 pci_generate_msix(sc->nsc_pi, 0); 1003 } 1004 } 1005 1006 static void * 1007 aen_thr(void *arg) 1008 { 1009 struct pci_nvme_softc *sc; 1010 1011 sc = arg; 1012 1013 pthread_mutex_lock(&sc->aen_mtx); 1014 for (;;) { 1015 pci_nvme_aen_process(sc); 1016 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1017 } 1018 pthread_mutex_unlock(&sc->aen_mtx); 1019 1020 pthread_exit(NULL); 1021 return (NULL); 1022 } 1023 1024 static void 1025 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1026 { 1027 uint32_t i; 1028 1029 DPRINTF("%s", __func__); 1030 1031 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1032 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1033 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1034 1035 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1036 1037 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ 1038 1039 sc->regs.cc = 0; 1040 1041 assert(sc->submit_queues != NULL); 1042 1043 for (i = 0; i < sc->num_squeues + 1; i++) { 1044 sc->submit_queues[i].qbase = NULL; 1045 sc->submit_queues[i].size = 0; 1046 sc->submit_queues[i].cqid = 0; 1047 sc->submit_queues[i].tail = 0; 1048 sc->submit_queues[i].head = 0; 1049 } 1050 1051 assert(sc->compl_queues != NULL); 1052 1053 for (i = 0; i < sc->num_cqueues + 1; i++) { 1054 sc->compl_queues[i].qbase = NULL; 1055 sc->compl_queues[i].size = 0; 1056 sc->compl_queues[i].tail = 0; 1057 sc->compl_queues[i].head = 0; 1058 } 1059 1060 sc->num_q_is_set = false; 1061 1062 pci_nvme_aer_destroy(sc); 1063 pci_nvme_aen_destroy(sc); 1064 1065 /* 1066 * Clear CSTS.RDY last to prevent the host from enabling Controller 1067 * before cleanup completes 1068 */ 1069 sc->regs.csts = 0; 1070 } 1071 1072 static void 1073 pci_nvme_reset(struct pci_nvme_softc *sc) 1074 { 1075 pthread_mutex_lock(&sc->mtx); 1076 pci_nvme_reset_locked(sc); 1077 pthread_mutex_unlock(&sc->mtx); 1078 } 1079 1080 static void 1081 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1082 { 1083 uint16_t acqs, asqs; 1084 1085 DPRINTF("%s", __func__); 1086 1087 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1088 sc->submit_queues[0].size = asqs; 1089 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1090 sizeof(struct nvme_command) * asqs); 1091 1092 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1093 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1094 1095 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1096 NVME_AQA_REG_ACQS_MASK) + 1; 1097 sc->compl_queues[0].size = acqs; 1098 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1099 sizeof(struct nvme_completion) * acqs); 1100 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1101 1102 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1103 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1104 } 1105 1106 static int 1107 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1108 size_t len, enum nvme_copy_dir dir) 1109 { 1110 uint8_t *p; 1111 size_t bytes; 1112 1113 if (len > (8 * 1024)) { 1114 return (-1); 1115 } 1116 1117 /* Copy from the start of prp1 to the 
end of the physical page */ 1118 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1119 bytes = MIN(bytes, len); 1120 1121 p = vm_map_gpa(ctx, prp1, bytes); 1122 if (p == NULL) { 1123 return (-1); 1124 } 1125 1126 if (dir == NVME_COPY_TO_PRP) 1127 memcpy(p, b, bytes); 1128 else 1129 memcpy(b, p, bytes); 1130 1131 b += bytes; 1132 1133 len -= bytes; 1134 if (len == 0) { 1135 return (0); 1136 } 1137 1138 len = MIN(len, PAGE_SIZE); 1139 1140 p = vm_map_gpa(ctx, prp2, len); 1141 if (p == NULL) { 1142 return (-1); 1143 } 1144 1145 if (dir == NVME_COPY_TO_PRP) 1146 memcpy(p, b, len); 1147 else 1148 memcpy(b, p, len); 1149 1150 return (0); 1151 } 1152 1153 /* 1154 * Write a Completion Queue Entry update 1155 * 1156 * Write the completion and update the doorbell value 1157 */ 1158 static void 1159 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1160 struct nvme_completion_queue *cq, 1161 uint32_t cdw0, 1162 uint16_t cid, 1163 uint16_t sqid, 1164 uint16_t status) 1165 { 1166 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1167 struct nvme_completion *cqe; 1168 1169 assert(cq->qbase != NULL); 1170 1171 pthread_mutex_lock(&cq->mtx); 1172 1173 cqe = &cq->qbase[cq->tail]; 1174 1175 /* Flip the phase bit */ 1176 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1177 1178 cqe->cdw0 = cdw0; 1179 cqe->sqhd = sq->head; 1180 cqe->sqid = sqid; 1181 cqe->cid = cid; 1182 cqe->status = status; 1183 1184 cq->tail++; 1185 if (cq->tail >= cq->size) { 1186 cq->tail = 0; 1187 } 1188 1189 pthread_mutex_unlock(&cq->mtx); 1190 } 1191 1192 static int 1193 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1194 struct nvme_completion* compl) 1195 { 1196 uint16_t qid = command->cdw10 & 0xffff; 1197 1198 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1199 if (qid == 0 || qid > sc->num_squeues || 1200 (sc->submit_queues[qid].qbase == NULL)) { 1201 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1202 __func__, qid, sc->num_squeues); 1203 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1204 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1205 return (1); 1206 } 1207 1208 sc->submit_queues[qid].qbase = NULL; 1209 sc->submit_queues[qid].cqid = 0; 1210 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1211 return (1); 1212 } 1213 1214 static int 1215 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1216 struct nvme_completion* compl) 1217 { 1218 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1219 uint16_t qid = command->cdw10 & 0xffff; 1220 struct nvme_submission_queue *nsq; 1221 1222 if ((qid == 0) || (qid > sc->num_squeues) || 1223 (sc->submit_queues[qid].qbase != NULL)) { 1224 WPRINTF("%s queue index %u > num_squeues %u", 1225 __func__, qid, sc->num_squeues); 1226 pci_nvme_status_tc(&compl->status, 1227 NVME_SCT_COMMAND_SPECIFIC, 1228 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1229 return (1); 1230 } 1231 1232 nsq = &sc->submit_queues[qid]; 1233 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1234 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1235 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1236 /* 1237 * Queues must specify at least two entries 1238 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1239 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1240 */ 1241 pci_nvme_status_tc(&compl->status, 1242 NVME_SCT_COMMAND_SPECIFIC, 1243 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1244 return (1); 1245 } 1246 nsq->head = nsq->tail = 0; 1247 1248 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1249 if ((nsq->cqid == 
0) || (nsq->cqid > sc->num_cqueues)) { 1250 pci_nvme_status_tc(&compl->status, 1251 NVME_SCT_COMMAND_SPECIFIC, 1252 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1253 return (1); 1254 } 1255 1256 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1257 pci_nvme_status_tc(&compl->status, 1258 NVME_SCT_COMMAND_SPECIFIC, 1259 NVME_SC_COMPLETION_QUEUE_INVALID); 1260 return (1); 1261 } 1262 1263 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1264 1265 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1266 sizeof(struct nvme_command) * (size_t)nsq->size); 1267 1268 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1269 qid, nsq->size, nsq->qbase, nsq->cqid); 1270 1271 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1272 1273 DPRINTF("%s completed creating IOSQ qid %u", 1274 __func__, qid); 1275 } else { 1276 /* 1277 * Guest sent non-cont submission queue request. 1278 * This setting is unsupported by this emulation. 1279 */ 1280 WPRINTF("%s unsupported non-contig (list-based) " 1281 "create i/o submission queue", __func__); 1282 1283 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1284 } 1285 return (1); 1286 } 1287 1288 static int 1289 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1290 struct nvme_completion* compl) 1291 { 1292 uint16_t qid = command->cdw10 & 0xffff; 1293 uint16_t sqid; 1294 1295 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1296 if (qid == 0 || qid > sc->num_cqueues || 1297 (sc->compl_queues[qid].qbase == NULL)) { 1298 WPRINTF("%s queue index %u / num_cqueues %u", 1299 __func__, qid, sc->num_cqueues); 1300 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1301 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1302 return (1); 1303 } 1304 1305 /* Deleting an Active CQ is an error */ 1306 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1307 if (sc->submit_queues[sqid].cqid == qid) { 1308 pci_nvme_status_tc(&compl->status, 1309 NVME_SCT_COMMAND_SPECIFIC, 1310 NVME_SC_INVALID_QUEUE_DELETION); 1311 return (1); 1312 } 1313 1314 sc->compl_queues[qid].qbase = NULL; 1315 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1316 return (1); 1317 } 1318 1319 static int 1320 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1321 struct nvme_completion* compl) 1322 { 1323 struct nvme_completion_queue *ncq; 1324 uint16_t qid = command->cdw10 & 0xffff; 1325 1326 /* Only support Physically Contiguous queues */ 1327 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1328 WPRINTF("%s unsupported non-contig (list-based) " 1329 "create i/o completion queue", 1330 __func__); 1331 1332 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1333 return (1); 1334 } 1335 1336 if ((qid == 0) || (qid > sc->num_cqueues) || 1337 (sc->compl_queues[qid].qbase != NULL)) { 1338 WPRINTF("%s queue index %u > num_cqueues %u", 1339 __func__, qid, sc->num_cqueues); 1340 pci_nvme_status_tc(&compl->status, 1341 NVME_SCT_COMMAND_SPECIFIC, 1342 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1343 return (1); 1344 } 1345 1346 ncq = &sc->compl_queues[qid]; 1347 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1348 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1349 if (ncq->intr_vec > (sc->max_queues + 1)) { 1350 pci_nvme_status_tc(&compl->status, 1351 NVME_SCT_COMMAND_SPECIFIC, 1352 NVME_SC_INVALID_INTERRUPT_VECTOR); 1353 return (1); 1354 } 1355 1356 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1357 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1358 /* 1359 * Queues must specify at least two entries 1360 * 
NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
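/*
 * Identify handler: only the commonly used CNS values are emulated, namely
 * 0x00 (Identify Namespace), 0x01 (Identify Controller), 0x02 (Active
 * Namespace ID list) and 0x03 (Namespace Identification Descriptor list,
 * which returns the EUI-64 built in pci_nvme_init_nsdata()); any other CNS
 * value completes with Invalid Field.
 */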
1472 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1473 struct nvme_completion* compl) 1474 { 1475 void *dest; 1476 uint16_t status; 1477 1478 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1479 command->cdw10 & 0xFF, command->nsid); 1480 1481 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1482 1483 switch (command->cdw10 & 0xFF) { 1484 case 0x00: /* return Identify Namespace data structure */ 1485 /* Global NS only valid with NS Management */ 1486 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { 1487 pci_nvme_status_genc(&status, 1488 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1489 break; 1490 } 1491 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1492 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1493 NVME_COPY_TO_PRP); 1494 break; 1495 case 0x01: /* return Identify Controller data structure */ 1496 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1497 command->prp2, (uint8_t *)&sc->ctrldata, 1498 sizeof(sc->ctrldata), 1499 NVME_COPY_TO_PRP); 1500 break; 1501 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1502 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1503 sizeof(uint32_t) * 1024); 1504 /* All unused entries shall be zero */ 1505 bzero(dest, sizeof(uint32_t) * 1024); 1506 ((uint32_t *)dest)[0] = 1; 1507 break; 1508 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1509 if (command->nsid != 1) { 1510 pci_nvme_status_genc(&status, 1511 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1512 break; 1513 } 1514 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1515 sizeof(uint32_t) * 1024); 1516 /* All bytes after the descriptor shall be zero */ 1517 bzero(dest, sizeof(uint32_t) * 1024); 1518 1519 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1520 ((uint8_t *)dest)[0] = 1; 1521 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1522 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1523 break; 1524 default: 1525 DPRINTF("%s unsupported identify command requested 0x%x", 1526 __func__, command->cdw10 & 0xFF); 1527 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1528 break; 1529 } 1530 1531 compl->status = status; 1532 return (1); 1533 } 1534 1535 static const char * 1536 nvme_fid_to_name(uint8_t fid) 1537 { 1538 const char *name; 1539 1540 switch (fid) { 1541 case NVME_FEAT_ARBITRATION: 1542 name = "Arbitration"; 1543 break; 1544 case NVME_FEAT_POWER_MANAGEMENT: 1545 name = "Power Management"; 1546 break; 1547 case NVME_FEAT_LBA_RANGE_TYPE: 1548 name = "LBA Range Type"; 1549 break; 1550 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1551 name = "Temperature Threshold"; 1552 break; 1553 case NVME_FEAT_ERROR_RECOVERY: 1554 name = "Error Recovery"; 1555 break; 1556 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1557 name = "Volatile Write Cache"; 1558 break; 1559 case NVME_FEAT_NUMBER_OF_QUEUES: 1560 name = "Number of Queues"; 1561 break; 1562 case NVME_FEAT_INTERRUPT_COALESCING: 1563 name = "Interrupt Coalescing"; 1564 break; 1565 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1566 name = "Interrupt Vector Configuration"; 1567 break; 1568 case NVME_FEAT_WRITE_ATOMICITY: 1569 name = "Write Atomicity Normal"; 1570 break; 1571 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1572 name = "Asynchronous Event Configuration"; 1573 break; 1574 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1575 name = "Autonomous Power State Transition"; 1576 break; 1577 case NVME_FEAT_HOST_MEMORY_BUFFER: 1578 name = "Host Memory Buffer"; 1579 break; 1580 case NVME_FEAT_TIMESTAMP: 1581 name = "Timestamp"; 1582 break; 1583 case NVME_FEAT_KEEP_ALIVE_TIMER: 
1584 name = "Keep Alive Timer"; 1585 break; 1586 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1587 name = "Host Controlled Thermal Management"; 1588 break; 1589 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1590 name = "Non-Operation Power State Config"; 1591 break; 1592 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1593 name = "Read Recovery Level Config"; 1594 break; 1595 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1596 name = "Predictable Latency Mode Config"; 1597 break; 1598 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1599 name = "Predictable Latency Mode Window"; 1600 break; 1601 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1602 name = "LBA Status Information Report Interval"; 1603 break; 1604 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1605 name = "Host Behavior Support"; 1606 break; 1607 case NVME_FEAT_SANITIZE_CONFIG: 1608 name = "Sanitize Config"; 1609 break; 1610 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1611 name = "Endurance Group Event Configuration"; 1612 break; 1613 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1614 name = "Software Progress Marker"; 1615 break; 1616 case NVME_FEAT_HOST_IDENTIFIER: 1617 name = "Host Identifier"; 1618 break; 1619 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1620 name = "Reservation Notification Mask"; 1621 break; 1622 case NVME_FEAT_RESERVATION_PERSISTENCE: 1623 name = "Reservation Persistence"; 1624 break; 1625 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1626 name = "Namespace Write Protection Config"; 1627 break; 1628 default: 1629 name = "Unknown"; 1630 break; 1631 } 1632 1633 return (name); 1634 } 1635 1636 static void 1637 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1638 struct nvme_feature_obj *feat, 1639 struct nvme_command *command, 1640 struct nvme_completion *compl) 1641 { 1642 1643 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1644 } 1645 1646 static void 1647 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1648 struct nvme_feature_obj *feat, 1649 struct nvme_command *command, 1650 struct nvme_completion *compl) 1651 { 1652 uint32_t i; 1653 uint32_t cdw11 = command->cdw11; 1654 uint16_t iv; 1655 bool cd; 1656 1657 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1658 1659 iv = cdw11 & 0xffff; 1660 cd = cdw11 & (1 << 16); 1661 1662 if (iv > (sc->max_queues + 1)) { 1663 return; 1664 } 1665 1666 /* No Interrupt Coalescing (i.e. 
not Coalescing Disable) for Admin Q */ 1667 if ((iv == 0) && !cd) 1668 return; 1669 1670 /* Requested Interrupt Vector must be used by a CQ */ 1671 for (i = 0; i < sc->num_cqueues + 1; i++) { 1672 if (sc->compl_queues[i].intr_vec == iv) { 1673 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1674 } 1675 } 1676 } 1677 1678 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) 1679 static void 1680 nvme_feature_async_event(struct pci_nvme_softc *sc, 1681 struct nvme_feature_obj *feat, 1682 struct nvme_command *command, 1683 struct nvme_completion *compl) 1684 { 1685 1686 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) 1687 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1688 } 1689 1690 #define NVME_TEMP_THRESH_OVER 0 1691 #define NVME_TEMP_THRESH_UNDER 1 1692 static void 1693 nvme_feature_temperature(struct pci_nvme_softc *sc, 1694 struct nvme_feature_obj *feat, 1695 struct nvme_command *command, 1696 struct nvme_completion *compl) 1697 { 1698 uint16_t tmpth; /* Temperature Threshold */ 1699 uint8_t tmpsel; /* Threshold Temperature Select */ 1700 uint8_t thsel; /* Threshold Type Select */ 1701 bool set_crit = false; 1702 1703 tmpth = command->cdw11 & 0xffff; 1704 tmpsel = (command->cdw11 >> 16) & 0xf; 1705 thsel = (command->cdw11 >> 20) & 0x3; 1706 1707 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); 1708 1709 /* Check for unsupported values */ 1710 if (((tmpsel != 0) && (tmpsel != 0xf)) || 1711 (thsel > NVME_TEMP_THRESH_UNDER)) { 1712 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1713 return; 1714 } 1715 1716 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || 1717 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) 1718 set_crit = true; 1719 1720 pthread_mutex_lock(&sc->mtx); 1721 if (set_crit) 1722 sc->health_log.critical_warning |= 1723 NVME_CRIT_WARN_ST_TEMPERATURE; 1724 else 1725 sc->health_log.critical_warning &= 1726 ~NVME_CRIT_WARN_ST_TEMPERATURE; 1727 pthread_mutex_unlock(&sc->mtx); 1728 1729 if (set_crit) 1730 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1731 sc->health_log.critical_warning); 1732 1733 1734 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 
'T':'F', sc->health_log.critical_warning, compl->status); 1735 } 1736 1737 static void 1738 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1739 struct nvme_feature_obj *feat, 1740 struct nvme_command *command, 1741 struct nvme_completion *compl) 1742 { 1743 uint16_t nqr; /* Number of Queues Requested */ 1744 1745 if (sc->num_q_is_set) { 1746 WPRINTF("%s: Number of Queues already set", __func__); 1747 pci_nvme_status_genc(&compl->status, 1748 NVME_SC_COMMAND_SEQUENCE_ERROR); 1749 return; 1750 } 1751 1752 nqr = command->cdw11 & 0xFFFF; 1753 if (nqr == 0xffff) { 1754 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1755 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1756 return; 1757 } 1758 1759 sc->num_squeues = ONE_BASED(nqr); 1760 if (sc->num_squeues > sc->max_queues) { 1761 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1762 sc->max_queues); 1763 sc->num_squeues = sc->max_queues; 1764 } 1765 1766 nqr = (command->cdw11 >> 16) & 0xFFFF; 1767 if (nqr == 0xffff) { 1768 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1769 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1770 return; 1771 } 1772 1773 sc->num_cqueues = ONE_BASED(nqr); 1774 if (sc->num_cqueues > sc->max_queues) { 1775 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1776 sc->max_queues); 1777 sc->num_cqueues = sc->max_queues; 1778 } 1779 1780 /* Patch the command value which will be saved on callback's return */ 1781 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1782 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1783 1784 sc->num_q_is_set = true; 1785 } 1786 1787 static int 1788 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1789 struct nvme_completion *compl) 1790 { 1791 struct nvme_feature_obj *feat; 1792 uint32_t nsid = command->nsid; 1793 uint8_t fid = command->cdw10 & 0xFF; 1794 1795 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1796 1797 if (fid >= NVME_FID_MAX) { 1798 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1799 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1800 return (1); 1801 } 1802 feat = &sc->feat[fid]; 1803 1804 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { 1805 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1806 return (1); 1807 } 1808 1809 if (!feat->namespace_specific && 1810 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1811 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1812 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1813 return (1); 1814 } 1815 1816 compl->cdw0 = 0; 1817 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1818 1819 if (feat->set) 1820 feat->set(sc, feat, command, compl); 1821 1822 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1823 if (compl->status == NVME_SC_SUCCESS) { 1824 feat->cdw11 = command->cdw11; 1825 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1826 (command->cdw11 != 0)) 1827 pci_nvme_aen_notify(sc); 1828 } 1829 1830 return (0); 1831 } 1832 1833 #define NVME_FEATURES_SEL_SUPPORTED 0x3 1834 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) 1835 1836 static int 1837 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1838 struct nvme_completion* compl) 1839 { 1840 struct nvme_feature_obj *feat; 1841 uint8_t fid = command->cdw10 & 0xFF; 1842 uint8_t sel = (command->cdw10 >> 8) & 0x7; 1843 1844 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1845 1846 if (fid >= NVME_FID_MAX) { 1847 DPRINTF("%s invalid 
feature 0x%x", __func__, fid); 1848 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1849 return (1); 1850 } 1851 1852 compl->cdw0 = 0; 1853 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1854 1855 feat = &sc->feat[fid]; 1856 if (feat->get) { 1857 feat->get(sc, feat, command, compl); 1858 } 1859 1860 if (compl->status == NVME_SC_SUCCESS) { 1861 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) 1862 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; 1863 else 1864 compl->cdw0 = feat->cdw11; 1865 } 1866 1867 return (0); 1868 } 1869 1870 static int 1871 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1872 struct nvme_completion* compl) 1873 { 1874 uint8_t ses, lbaf, pi; 1875 1876 /* Only supports Secure Erase Setting - User Data Erase */ 1877 ses = (command->cdw10 >> 9) & 0x7; 1878 if (ses > 0x1) { 1879 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1880 return (1); 1881 } 1882 1883 /* Only supports a single LBA Format */ 1884 lbaf = command->cdw10 & 0xf; 1885 if (lbaf != 0) { 1886 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1887 NVME_SC_INVALID_FORMAT); 1888 return (1); 1889 } 1890 1891 /* Doesn't support Protection Infomation */ 1892 pi = (command->cdw10 >> 5) & 0x7; 1893 if (pi != 0) { 1894 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1895 return (1); 1896 } 1897 1898 if (sc->nvstore.type == NVME_STOR_RAM) { 1899 if (sc->nvstore.ctx) 1900 free(sc->nvstore.ctx); 1901 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1902 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1903 } else { 1904 struct pci_nvme_ioreq *req; 1905 int err; 1906 1907 req = pci_nvme_get_ioreq(sc); 1908 if (req == NULL) { 1909 pci_nvme_status_genc(&compl->status, 1910 NVME_SC_INTERNAL_DEVICE_ERROR); 1911 WPRINTF("%s: unable to allocate IO req", __func__); 1912 return (1); 1913 } 1914 req->nvme_sq = &sc->submit_queues[0]; 1915 req->sqid = 0; 1916 req->opc = command->opc; 1917 req->cid = command->cid; 1918 req->nsid = command->nsid; 1919 1920 req->io_req.br_offset = 0; 1921 req->io_req.br_resid = sc->nvstore.size; 1922 req->io_req.br_callback = pci_nvme_io_done; 1923 1924 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1925 if (err) { 1926 pci_nvme_status_genc(&compl->status, 1927 NVME_SC_INTERNAL_DEVICE_ERROR); 1928 pci_nvme_release_ioreq(sc, req); 1929 } else 1930 compl->status = NVME_NO_STATUS; 1931 } 1932 1933 return (1); 1934 } 1935 1936 static int 1937 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1938 struct nvme_completion* compl) 1939 { 1940 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1941 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1942 1943 /* TODO: search for the command ID and abort it */ 1944 1945 compl->cdw0 = 1; 1946 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1947 return (1); 1948 } 1949 1950 static int 1951 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1952 struct nvme_command* command, struct nvme_completion* compl) 1953 { 1954 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1955 sc->aer_count, sc->ctrldata.aerl, command->cid); 1956 1957 /* Don't exceed the Async Event Request Limit (AERL). 
*/ 1958 if (pci_nvme_aer_limit_reached(sc)) { 1959 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1960 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1961 return (1); 1962 } 1963 1964 if (pci_nvme_aer_add(sc, command->cid)) { 1965 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1966 NVME_SC_INTERNAL_DEVICE_ERROR); 1967 return (1); 1968 } 1969 1970 /* 1971 * Raise events when they happen based on the Set Features cmd. 1972 * These events happen async, so only set completion successful if 1973 * there is an event reflective of the request to get event. 1974 */ 1975 compl->status = NVME_NO_STATUS; 1976 pci_nvme_aen_notify(sc); 1977 1978 return (0); 1979 } 1980 1981 static void 1982 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1983 { 1984 struct nvme_completion compl; 1985 struct nvme_command *cmd; 1986 struct nvme_submission_queue *sq; 1987 struct nvme_completion_queue *cq; 1988 uint16_t sqhead; 1989 1990 DPRINTF("%s index %u", __func__, (uint32_t)value); 1991 1992 sq = &sc->submit_queues[0]; 1993 cq = &sc->compl_queues[0]; 1994 1995 pthread_mutex_lock(&sq->mtx); 1996 1997 sqhead = sq->head; 1998 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1999 2000 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2001 cmd = &(sq->qbase)[sqhead]; 2002 compl.cdw0 = 0; 2003 compl.status = 0; 2004 2005 switch (cmd->opc) { 2006 case NVME_OPC_DELETE_IO_SQ: 2007 DPRINTF("%s command DELETE_IO_SQ", __func__); 2008 nvme_opc_delete_io_sq(sc, cmd, &compl); 2009 break; 2010 case NVME_OPC_CREATE_IO_SQ: 2011 DPRINTF("%s command CREATE_IO_SQ", __func__); 2012 nvme_opc_create_io_sq(sc, cmd, &compl); 2013 break; 2014 case NVME_OPC_DELETE_IO_CQ: 2015 DPRINTF("%s command DELETE_IO_CQ", __func__); 2016 nvme_opc_delete_io_cq(sc, cmd, &compl); 2017 break; 2018 case NVME_OPC_CREATE_IO_CQ: 2019 DPRINTF("%s command CREATE_IO_CQ", __func__); 2020 nvme_opc_create_io_cq(sc, cmd, &compl); 2021 break; 2022 case NVME_OPC_GET_LOG_PAGE: 2023 DPRINTF("%s command GET_LOG_PAGE", __func__); 2024 nvme_opc_get_log_page(sc, cmd, &compl); 2025 break; 2026 case NVME_OPC_IDENTIFY: 2027 DPRINTF("%s command IDENTIFY", __func__); 2028 nvme_opc_identify(sc, cmd, &compl); 2029 break; 2030 case NVME_OPC_ABORT: 2031 DPRINTF("%s command ABORT", __func__); 2032 nvme_opc_abort(sc, cmd, &compl); 2033 break; 2034 case NVME_OPC_SET_FEATURES: 2035 DPRINTF("%s command SET_FEATURES", __func__); 2036 nvme_opc_set_features(sc, cmd, &compl); 2037 break; 2038 case NVME_OPC_GET_FEATURES: 2039 DPRINTF("%s command GET_FEATURES", __func__); 2040 nvme_opc_get_features(sc, cmd, &compl); 2041 break; 2042 case NVME_OPC_FIRMWARE_ACTIVATE: 2043 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2044 pci_nvme_status_tc(&compl.status, 2045 NVME_SCT_COMMAND_SPECIFIC, 2046 NVME_SC_INVALID_FIRMWARE_SLOT); 2047 break; 2048 case NVME_OPC_ASYNC_EVENT_REQUEST: 2049 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2050 nvme_opc_async_event_req(sc, cmd, &compl); 2051 break; 2052 case NVME_OPC_FORMAT_NVM: 2053 DPRINTF("%s command FORMAT_NVM", __func__); 2054 if ((sc->ctrldata.oacs & 2055 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2056 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2057 break; 2058 } 2059 nvme_opc_format_nvm(sc, cmd, &compl); 2060 break; 2061 case NVME_OPC_SECURITY_SEND: 2062 case NVME_OPC_SECURITY_RECEIVE: 2063 case NVME_OPC_SANITIZE: 2064 case NVME_OPC_GET_LBA_STATUS: 2065 DPRINTF("%s command OPC=%#x (unsupported)", __func__, 2066 cmd->opc); 2067 /* Valid but unsupported opcodes */ 2068 
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
			break;
		default:
			DPRINTF("%s command OPC=%#X (not implemented)",
			    __func__,
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and number of blocks
 * exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
2144 */ 2145 static bool 2146 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2147 uint32_t nblocks) 2148 { 2149 size_t offset, bytes; 2150 2151 /* Overflow check of multiplying Starting LBA by the sector size */ 2152 if (slba >> (64 - nvstore->sectsz_bits)) 2153 return (true); 2154 2155 offset = slba << nvstore->sectsz_bits; 2156 bytes = nblocks << nvstore->sectsz_bits; 2157 2158 /* Overflow check of Number of Logical Blocks */ 2159 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2160 return (true); 2161 2162 return (false); 2163 } 2164 2165 static int 2166 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2167 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 2168 { 2169 int iovidx; 2170 2171 if (req == NULL) 2172 return (-1); 2173 2174 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2175 return (-1); 2176 } 2177 2178 /* concatenate contig block-iovs to minimize number of iovs */ 2179 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 2180 iovidx = req->io_req.br_iovcnt - 1; 2181 2182 req->io_req.br_iov[iovidx].iov_base = 2183 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2184 req->prev_gpaddr, size); 2185 2186 req->prev_size += size; 2187 req->io_req.br_resid += size; 2188 2189 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2190 } else { 2191 iovidx = req->io_req.br_iovcnt; 2192 if (iovidx == 0) { 2193 req->io_req.br_offset = lba; 2194 req->io_req.br_resid = 0; 2195 req->io_req.br_param = req; 2196 } 2197 2198 req->io_req.br_iov[iovidx].iov_base = 2199 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2200 gpaddr, size); 2201 2202 req->io_req.br_iov[iovidx].iov_len = size; 2203 2204 req->prev_gpaddr = gpaddr; 2205 req->prev_size = size; 2206 req->io_req.br_resid += size; 2207 2208 req->io_req.br_iovcnt++; 2209 } 2210 2211 return (0); 2212 } 2213 2214 static void 2215 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2216 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2217 uint32_t cdw0, uint16_t status) 2218 { 2219 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2220 2221 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2222 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2223 NVME_STATUS_GET_SC(status)); 2224 2225 pci_nvme_cq_update(sc, cq, 2226 0, /* CDW0 */ 2227 cid, 2228 sqid, 2229 status); 2230 2231 if (cq->head != cq->tail) { 2232 if (cq->intr_en & NVME_CQ_INTEN) { 2233 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2234 } else { 2235 DPRINTF("%s: CQ%u interrupt disabled", 2236 __func__, sq->cqid); 2237 } 2238 } 2239 } 2240 2241 static void 2242 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2243 { 2244 req->sc = NULL; 2245 req->nvme_sq = NULL; 2246 req->sqid = 0; 2247 2248 pthread_mutex_lock(&sc->mtx); 2249 2250 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2251 sc->pending_ios--; 2252 2253 /* when no more IO pending, can set to ready if device reset/enabled */ 2254 if (sc->pending_ios == 0 && 2255 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2256 sc->regs.csts |= NVME_CSTS_RDY; 2257 2258 pthread_mutex_unlock(&sc->mtx); 2259 2260 sem_post(&sc->iosemlock); 2261 } 2262 2263 static struct pci_nvme_ioreq * 2264 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2265 { 2266 struct pci_nvme_ioreq *req = NULL; 2267 2268 sem_wait(&sc->iosemlock); 2269 pthread_mutex_lock(&sc->mtx); 2270 2271 req = STAILQ_FIRST(&sc->ioreqs_free); 2272 assert(req != NULL); 2273 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2274 2275 
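	/*
	 * The request slot is now owned by the caller and is returned to the
	 * free list by pci_nvme_release_ioreq() once the I/O completes.
	 */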
	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return req;
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	/*
	 * A guest Write moves data from the PRP entries into the RAM backing
	 * store; a Read moves data from the backing store out to the PRPs.
	 */
	if (is_write)
		dir = NVME_COPY_FROM_PRP;
	else
		dir = NVME_COPY_TO_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
pci_nvme_status_genc(&status, 2406 NVME_SC_DATA_TRANSFER_ERROR); 2407 goto out; 2408 } 2409 } else { 2410 void *vmctx = sc->nsc_pi->pi_vmctx; 2411 uint64_t *prp_list = &prp2; 2412 uint64_t *last = prp_list; 2413 2414 /* PRP2 is pointer to a physical region page list */ 2415 while (bytes) { 2416 /* Last entry in list points to the next list */ 2417 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2418 uint64_t prp = *prp_list; 2419 2420 prp_list = paddr_guest2host(vmctx, prp, 2421 PAGE_SIZE - (prp % PAGE_SIZE)); 2422 last = prp_list + (NVME_PRP2_ITEMS - 1); 2423 } 2424 2425 size = MIN(bytes, PAGE_SIZE); 2426 2427 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2428 size, is_write, offset)) { 2429 pci_nvme_status_genc(&status, 2430 NVME_SC_DATA_TRANSFER_ERROR); 2431 goto out; 2432 } 2433 2434 offset += size; 2435 bytes -= size; 2436 2437 prp_list++; 2438 } 2439 } 2440 req->io_req.br_callback = pci_nvme_io_done; 2441 if (is_write) 2442 err = blockif_write(nvstore->ctx, &req->io_req); 2443 else 2444 err = blockif_read(nvstore->ctx, &req->io_req); 2445 2446 if (err) 2447 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2448 out: 2449 return (status); 2450 } 2451 2452 static bool 2453 nvme_opc_write_read(struct pci_nvme_softc *sc, 2454 struct nvme_command *cmd, 2455 struct pci_nvme_blockstore *nvstore, 2456 struct pci_nvme_ioreq *req, 2457 uint16_t *status) 2458 { 2459 uint64_t lba, nblocks, bytes; 2460 size_t offset; 2461 bool is_write = cmd->opc == NVME_OPC_WRITE; 2462 bool pending = false; 2463 2464 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2465 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2466 2467 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2468 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2469 __func__, lba, nblocks); 2470 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2471 goto out; 2472 } 2473 2474 bytes = nblocks << nvstore->sectsz_bits; 2475 if (bytes > NVME_MAX_DATA_SIZE) { 2476 WPRINTF("%s command would exceed MDTS", __func__); 2477 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2478 goto out; 2479 } 2480 2481 offset = lba << nvstore->sectsz_bits; 2482 2483 req->bytes = bytes; 2484 req->io_req.br_offset = lba; 2485 2486 /* PRP bits 1:0 must be zero */ 2487 cmd->prp1 &= ~0x3UL; 2488 cmd->prp2 &= ~0x3UL; 2489 2490 if (nvstore->type == NVME_STOR_RAM) { 2491 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2492 cmd->prp2, offset, bytes, is_write); 2493 } else { 2494 *status = nvme_write_read_blockif(sc, nvstore, req, 2495 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2496 2497 if (*status == NVME_NO_STATUS) 2498 pending = true; 2499 } 2500 out: 2501 if (!pending) 2502 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2503 2504 return (pending); 2505 } 2506 2507 static void 2508 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2509 { 2510 struct pci_nvme_ioreq *req = br->br_param; 2511 struct pci_nvme_softc *sc = req->sc; 2512 bool done = true; 2513 uint16_t status; 2514 2515 if (err) { 2516 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2517 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2518 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2519 } else { 2520 struct iovec *iov = req->io_req.br_iov; 2521 2522 req->prev_gpaddr++; 2523 iov += req->prev_gpaddr; 2524 2525 /* The iov_* values already include the sector size */ 2526 req->io_req.br_offset = (off_t)iov->iov_base; 2527 req->io_req.br_resid = iov->iov_len; 2528 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2529 
pci_nvme_status_genc(&status, 2530 NVME_SC_INTERNAL_DEVICE_ERROR); 2531 } else 2532 done = false; 2533 } 2534 2535 if (done) { 2536 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2537 req->cid, 0, status); 2538 pci_nvme_release_ioreq(sc, req); 2539 } 2540 } 2541 2542 static bool 2543 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2544 struct nvme_command *cmd, 2545 struct pci_nvme_blockstore *nvstore, 2546 struct pci_nvme_ioreq *req, 2547 uint16_t *status) 2548 { 2549 struct nvme_dsm_range *range; 2550 uint32_t nr, r, non_zero, dr; 2551 int err; 2552 bool pending = false; 2553 2554 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2555 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2556 goto out; 2557 } 2558 2559 nr = cmd->cdw10 & 0xff; 2560 2561 /* copy locally because a range entry could straddle PRPs */ 2562 range = calloc(1, NVME_MAX_DSM_TRIM); 2563 if (range == NULL) { 2564 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2565 goto out; 2566 } 2567 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2568 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2569 2570 /* Check for invalid ranges and the number of non-zero lengths */ 2571 non_zero = 0; 2572 for (r = 0; r <= nr; r++) { 2573 if (pci_nvme_out_of_range(nvstore, 2574 range[r].starting_lba, range[r].length)) { 2575 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2576 goto out; 2577 } 2578 if (range[r].length != 0) 2579 non_zero++; 2580 } 2581 2582 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2583 size_t offset, bytes; 2584 int sectsz_bits = sc->nvstore.sectsz_bits; 2585 2586 /* 2587 * DSM calls are advisory only, and compliant controllers 2588 * may choose to take no actions (i.e. return Success). 2589 */ 2590 if (!nvstore->deallocate) { 2591 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2592 goto out; 2593 } 2594 2595 /* If all ranges have a zero length, return Success */ 2596 if (non_zero == 0) { 2597 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2598 goto out; 2599 } 2600 2601 if (req == NULL) { 2602 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2603 goto out; 2604 } 2605 2606 offset = range[0].starting_lba << sectsz_bits; 2607 bytes = range[0].length << sectsz_bits; 2608 2609 /* 2610 * If the request is for more than a single range, store 2611 * the ranges in the br_iov. Optimize for the common case 2612 * of a single range. 
2613 * 2614 * Note that NVMe Number of Ranges is a zero based value 2615 */ 2616 req->io_req.br_iovcnt = 0; 2617 req->io_req.br_offset = offset; 2618 req->io_req.br_resid = bytes; 2619 2620 if (nr == 0) { 2621 req->io_req.br_callback = pci_nvme_io_done; 2622 } else { 2623 struct iovec *iov = req->io_req.br_iov; 2624 2625 for (r = 0, dr = 0; r <= nr; r++) { 2626 offset = range[r].starting_lba << sectsz_bits; 2627 bytes = range[r].length << sectsz_bits; 2628 if (bytes == 0) 2629 continue; 2630 2631 if ((nvstore->size - offset) < bytes) { 2632 pci_nvme_status_genc(status, 2633 NVME_SC_LBA_OUT_OF_RANGE); 2634 goto out; 2635 } 2636 iov[dr].iov_base = (void *)offset; 2637 iov[dr].iov_len = bytes; 2638 dr++; 2639 } 2640 req->io_req.br_callback = pci_nvme_dealloc_sm; 2641 2642 /* 2643 * Use prev_gpaddr to track the current entry and 2644 * prev_size to track the number of entries 2645 */ 2646 req->prev_gpaddr = 0; 2647 req->prev_size = dr; 2648 } 2649 2650 err = blockif_delete(nvstore->ctx, &req->io_req); 2651 if (err) 2652 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2653 else 2654 pending = true; 2655 } 2656 out: 2657 free(range); 2658 return (pending); 2659 } 2660 2661 static void 2662 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2663 { 2664 struct nvme_submission_queue *sq; 2665 uint16_t status; 2666 uint16_t sqhead; 2667 2668 /* handle all submissions up to sq->tail index */ 2669 sq = &sc->submit_queues[idx]; 2670 2671 pthread_mutex_lock(&sq->mtx); 2672 2673 sqhead = sq->head; 2674 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2675 idx, sqhead, sq->tail, sq->qbase); 2676 2677 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2678 struct nvme_command *cmd; 2679 struct pci_nvme_ioreq *req; 2680 uint32_t nsid; 2681 bool pending; 2682 2683 pending = false; 2684 req = NULL; 2685 status = 0; 2686 2687 cmd = &sq->qbase[sqhead]; 2688 sqhead = (sqhead + 1) % sq->size; 2689 2690 nsid = le32toh(cmd->nsid); 2691 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2692 pci_nvme_status_genc(&status, 2693 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2694 status |= 2695 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2696 goto complete; 2697 } 2698 2699 req = pci_nvme_get_ioreq(sc); 2700 if (req == NULL) { 2701 pci_nvme_status_genc(&status, 2702 NVME_SC_INTERNAL_DEVICE_ERROR); 2703 WPRINTF("%s: unable to allocate IO req", __func__); 2704 goto complete; 2705 } 2706 req->nvme_sq = sq; 2707 req->sqid = idx; 2708 req->opc = cmd->opc; 2709 req->cid = cmd->cid; 2710 req->nsid = cmd->nsid; 2711 2712 switch (cmd->opc) { 2713 case NVME_OPC_FLUSH: 2714 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2715 req, &status); 2716 break; 2717 case NVME_OPC_WRITE: 2718 case NVME_OPC_READ: 2719 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2720 req, &status); 2721 break; 2722 case NVME_OPC_WRITE_ZEROES: 2723 /* TODO: write zeroes 2724 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2725 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2726 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2727 break; 2728 case NVME_OPC_DATASET_MANAGEMENT: 2729 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2730 req, &status); 2731 break; 2732 default: 2733 WPRINTF("%s unhandled io command 0x%x", 2734 __func__, cmd->opc); 2735 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2736 } 2737 complete: 2738 if (!pending) { 2739 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2740 status); 2741 if (req != NULL) 2742 pci_nvme_release_ioreq(sc, req); 2743 } 2744 } 2745 2746 sq->head = sqhead; 2747 2748 
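	/*
	 * All commands up to the observed tail have been handed off to
	 * blockif or completed, and the new head recorded above becomes
	 * visible to the next doorbell-triggered pass once the queue lock
	 * is dropped.
	 */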
pthread_mutex_unlock(&sq->mtx); 2749 } 2750 2751 static void 2752 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2753 uint64_t idx, int is_sq, uint64_t value) 2754 { 2755 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2756 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2757 2758 if (is_sq) { 2759 if (idx > sc->num_squeues) { 2760 WPRINTF("%s queue index %lu overflow from " 2761 "guest (max %u)", 2762 __func__, idx, sc->num_squeues); 2763 return; 2764 } 2765 2766 atomic_store_short(&sc->submit_queues[idx].tail, 2767 (uint16_t)value); 2768 2769 if (idx == 0) { 2770 pci_nvme_handle_admin_cmd(sc, value); 2771 } else { 2772 /* submission queue; handle new entries in SQ */ 2773 if (idx > sc->num_squeues) { 2774 WPRINTF("%s SQ index %lu overflow from " 2775 "guest (max %u)", 2776 __func__, idx, sc->num_squeues); 2777 return; 2778 } 2779 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2780 } 2781 } else { 2782 if (idx > sc->num_cqueues) { 2783 WPRINTF("%s queue index %lu overflow from " 2784 "guest (max %u)", 2785 __func__, idx, sc->num_cqueues); 2786 return; 2787 } 2788 2789 atomic_store_short(&sc->compl_queues[idx].head, 2790 (uint16_t)value); 2791 } 2792 } 2793 2794 static void 2795 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2796 { 2797 const char *s = iswrite ? "WRITE" : "READ"; 2798 2799 switch (offset) { 2800 case NVME_CR_CAP_LOW: 2801 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2802 break; 2803 case NVME_CR_CAP_HI: 2804 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2805 break; 2806 case NVME_CR_VS: 2807 DPRINTF("%s %s NVME_CR_VS", func, s); 2808 break; 2809 case NVME_CR_INTMS: 2810 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2811 break; 2812 case NVME_CR_INTMC: 2813 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2814 break; 2815 case NVME_CR_CC: 2816 DPRINTF("%s %s NVME_CR_CC", func, s); 2817 break; 2818 case NVME_CR_CSTS: 2819 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2820 break; 2821 case NVME_CR_NSSR: 2822 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2823 break; 2824 case NVME_CR_AQA: 2825 DPRINTF("%s %s NVME_CR_AQA", func, s); 2826 break; 2827 case NVME_CR_ASQ_LOW: 2828 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2829 break; 2830 case NVME_CR_ASQ_HI: 2831 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2832 break; 2833 case NVME_CR_ACQ_LOW: 2834 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2835 break; 2836 case NVME_CR_ACQ_HI: 2837 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2838 break; 2839 default: 2840 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2841 } 2842 2843 } 2844 2845 static void 2846 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2847 uint64_t offset, int size, uint64_t value) 2848 { 2849 uint32_t ccreg; 2850 2851 if (offset >= NVME_DOORBELL_OFFSET) { 2852 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2853 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2854 int is_sq = (belloffset % 8) < 4; 2855 2856 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2857 WPRINTF("guest attempted an overflow write offset " 2858 "0x%lx, val 0x%lx in %s", 2859 offset, value, __func__); 2860 return; 2861 } 2862 2863 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2864 return; 2865 } 2866 2867 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2868 offset, size, value); 2869 2870 if (size != 4) { 2871 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2872 "val 0x%lx) to bar0 in %s", 2873 size, offset, value, __func__); 2874 /* TODO: shutdown device */ 2875 return; 2876 } 2877 2878 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 
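	/*
	 * The register writes below modify shared controller state (sc->regs
	 * and the reset/init paths), so they are serialized with sc->mtx.
	 */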
2879 2880 pthread_mutex_lock(&sc->mtx); 2881 2882 switch (offset) { 2883 case NVME_CR_CAP_LOW: 2884 case NVME_CR_CAP_HI: 2885 /* readonly */ 2886 break; 2887 case NVME_CR_VS: 2888 /* readonly */ 2889 break; 2890 case NVME_CR_INTMS: 2891 /* MSI-X, so ignore */ 2892 break; 2893 case NVME_CR_INTMC: 2894 /* MSI-X, so ignore */ 2895 break; 2896 case NVME_CR_CC: 2897 ccreg = (uint32_t)value; 2898 2899 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2900 "iocqes %u", 2901 __func__, 2902 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2903 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2904 NVME_CC_GET_IOCQES(ccreg)); 2905 2906 if (NVME_CC_GET_SHN(ccreg)) { 2907 /* perform shutdown - flush out data to backend */ 2908 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2909 NVME_CSTS_REG_SHST_SHIFT); 2910 sc->regs.csts |= NVME_SHST_COMPLETE << 2911 NVME_CSTS_REG_SHST_SHIFT; 2912 } 2913 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2914 if (NVME_CC_GET_EN(ccreg) == 0) 2915 /* transition 1-> causes controller reset */ 2916 pci_nvme_reset_locked(sc); 2917 else 2918 pci_nvme_init_controller(ctx, sc); 2919 } 2920 2921 /* Insert the iocqes, iosqes and en bits from the write */ 2922 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2923 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2924 if (NVME_CC_GET_EN(ccreg) == 0) { 2925 /* Insert the ams, mps and css bit fields */ 2926 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2927 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2928 sc->regs.csts &= ~NVME_CSTS_RDY; 2929 } else if (sc->pending_ios == 0) { 2930 sc->regs.csts |= NVME_CSTS_RDY; 2931 } 2932 break; 2933 case NVME_CR_CSTS: 2934 break; 2935 case NVME_CR_NSSR: 2936 /* ignore writes; don't support subsystem reset */ 2937 break; 2938 case NVME_CR_AQA: 2939 sc->regs.aqa = (uint32_t)value; 2940 break; 2941 case NVME_CR_ASQ_LOW: 2942 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2943 (0xFFFFF000 & value); 2944 break; 2945 case NVME_CR_ASQ_HI: 2946 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2947 (value << 32); 2948 break; 2949 case NVME_CR_ACQ_LOW: 2950 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2951 (0xFFFFF000 & value); 2952 break; 2953 case NVME_CR_ACQ_HI: 2954 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2955 (value << 32); 2956 break; 2957 default: 2958 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2959 __func__, offset, value, size); 2960 } 2961 pthread_mutex_unlock(&sc->mtx); 2962 } 2963 2964 static void 2965 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2966 int baridx, uint64_t offset, int size, uint64_t value) 2967 { 2968 struct pci_nvme_softc* sc = pi->pi_arg; 2969 2970 if (baridx == pci_msix_table_bar(pi) || 2971 baridx == pci_msix_pba_bar(pi)) { 2972 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2973 " value 0x%lx", baridx, offset, size, value); 2974 2975 pci_emul_msix_twrite(pi, offset, size, value); 2976 return; 2977 } 2978 2979 switch (baridx) { 2980 case 0: 2981 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2982 break; 2983 2984 default: 2985 DPRINTF("%s unknown baridx %d, val 0x%lx", 2986 __func__, baridx, value); 2987 } 2988 } 2989 2990 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2991 uint64_t offset, int size) 2992 { 2993 uint64_t value; 2994 2995 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2996 2997 if (offset < NVME_DOORBELL_OFFSET) { 2998 void *p = &(sc->regs); 2999 pthread_mutex_lock(&sc->mtx); 3000 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3001 
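		/*
		 * The copied value is masked to the access width below so
		 * that narrow reads return only the requested register bytes.
		 */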
pthread_mutex_unlock(&sc->mtx); 3002 } else { 3003 value = 0; 3004 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3005 } 3006 3007 switch (size) { 3008 case 1: 3009 value &= 0xFF; 3010 break; 3011 case 2: 3012 value &= 0xFFFF; 3013 break; 3014 case 4: 3015 value &= 0xFFFFFFFF; 3016 break; 3017 } 3018 3019 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3020 offset, size, (uint32_t)value); 3021 3022 return (value); 3023 } 3024 3025 3026 3027 static uint64_t 3028 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 3029 uint64_t offset, int size) 3030 { 3031 struct pci_nvme_softc* sc = pi->pi_arg; 3032 3033 if (baridx == pci_msix_table_bar(pi) || 3034 baridx == pci_msix_pba_bar(pi)) { 3035 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3036 baridx, offset, size); 3037 3038 return pci_emul_msix_tread(pi, offset, size); 3039 } 3040 3041 switch (baridx) { 3042 case 0: 3043 return pci_nvme_read_bar_0(sc, offset, size); 3044 3045 default: 3046 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3047 } 3048 3049 return (0); 3050 } 3051 3052 static int 3053 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3054 { 3055 char bident[sizeof("XX:X:X")]; 3056 const char *value; 3057 uint32_t sectsz; 3058 3059 sc->max_queues = NVME_QUEUES; 3060 sc->max_qentries = NVME_MAX_QENTRIES; 3061 sc->ioslots = NVME_IOSLOTS; 3062 sc->num_squeues = sc->max_queues; 3063 sc->num_cqueues = sc->max_queues; 3064 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3065 sectsz = 0; 3066 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3067 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3068 3069 value = get_config_value_node(nvl, "maxq"); 3070 if (value != NULL) 3071 sc->max_queues = atoi(value); 3072 value = get_config_value_node(nvl, "qsz"); 3073 if (value != NULL) { 3074 sc->max_qentries = atoi(value); 3075 if (sc->max_qentries <= 0) { 3076 EPRINTLN("nvme: Invalid qsz option %d", 3077 sc->max_qentries); 3078 return (-1); 3079 } 3080 } 3081 value = get_config_value_node(nvl, "ioslots"); 3082 if (value != NULL) { 3083 sc->ioslots = atoi(value); 3084 if (sc->ioslots <= 0) { 3085 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3086 return (-1); 3087 } 3088 } 3089 value = get_config_value_node(nvl, "sectsz"); 3090 if (value != NULL) 3091 sectsz = atoi(value); 3092 value = get_config_value_node(nvl, "ser"); 3093 if (value != NULL) { 3094 /* 3095 * This field indicates the Product Serial Number in 3096 * 7-bit ASCII, unused bytes should be space characters. 3097 * Ref: NVMe v1.3c. 
3098 */ 3099 cpywithpad((char *)sc->ctrldata.sn, 3100 sizeof(sc->ctrldata.sn), value, ' '); 3101 } 3102 value = get_config_value_node(nvl, "eui64"); 3103 if (value != NULL) 3104 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3105 value = get_config_value_node(nvl, "dsm"); 3106 if (value != NULL) { 3107 if (strcmp(value, "auto") == 0) 3108 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3109 else if (strcmp(value, "enable") == 0) 3110 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3111 else if (strcmp(value, "disable") == 0) 3112 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3113 } 3114 3115 value = get_config_value_node(nvl, "ram"); 3116 if (value != NULL) { 3117 uint64_t sz = strtoull(value, NULL, 10); 3118 3119 sc->nvstore.type = NVME_STOR_RAM; 3120 sc->nvstore.size = sz * 1024 * 1024; 3121 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3122 sc->nvstore.sectsz = 4096; 3123 sc->nvstore.sectsz_bits = 12; 3124 if (sc->nvstore.ctx == NULL) { 3125 EPRINTLN("nvme: Unable to allocate RAM"); 3126 return (-1); 3127 } 3128 } else { 3129 snprintf(bident, sizeof(bident), "%d:%d", 3130 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3131 sc->nvstore.ctx = blockif_open(nvl, bident); 3132 if (sc->nvstore.ctx == NULL) { 3133 EPRINTLN("nvme: Could not open backing file: %s", 3134 strerror(errno)); 3135 return (-1); 3136 } 3137 sc->nvstore.type = NVME_STOR_BLOCKIF; 3138 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3139 } 3140 3141 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3142 sc->nvstore.sectsz = sectsz; 3143 else if (sc->nvstore.type != NVME_STOR_RAM) 3144 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3145 for (sc->nvstore.sectsz_bits = 9; 3146 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3147 sc->nvstore.sectsz_bits++); 3148 3149 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3150 sc->max_queues = NVME_QUEUES; 3151 3152 return (0); 3153 } 3154 3155 static void 3156 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 3157 { 3158 struct pci_nvme_softc *sc; 3159 struct pci_nvme_blockstore *nvstore; 3160 struct nvme_namespace_data *nd; 3161 3162 sc = arg; 3163 nvstore = &sc->nvstore; 3164 nd = &sc->nsdata; 3165 3166 nvstore->size = new_size; 3167 pci_nvme_init_nsdata_size(nvstore, nd); 3168 3169 /* Add changed NSID to list */ 3170 sc->ns_log.ns[0] = 1; 3171 sc->ns_log.ns[1] = 0; 3172 3173 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3174 PCI_NVME_AE_INFO_NS_ATTR_CHANGED); 3175 } 3176 3177 static int 3178 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3179 { 3180 struct pci_nvme_softc *sc; 3181 uint32_t pci_membar_sz; 3182 int error; 3183 3184 error = 0; 3185 3186 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3187 pi->pi_arg = sc; 3188 sc->nsc_pi = pi; 3189 3190 error = pci_nvme_parse_config(sc, nvl); 3191 if (error < 0) 3192 goto done; 3193 else 3194 error = 0; 3195 3196 STAILQ_INIT(&sc->ioreqs_free); 3197 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3198 for (int i = 0; i < sc->ioslots; i++) { 3199 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3200 } 3201 3202 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3203 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3204 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3205 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3206 pci_set_cfgdata8(pi, PCIR_PROGIF, 3207 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3208 3209 /* 3210 * Allocate size of NVMe registers + doorbell space for all queues. 
3211 * 3212 * The specification requires a minimum memory I/O window size of 16K. 3213 * The Windows driver will refuse to start a device with a smaller 3214 * window. 3215 */ 3216 pci_membar_sz = sizeof(struct nvme_registers) + 3217 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3218 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3219 3220 DPRINTF("nvme membar size: %u", pci_membar_sz); 3221 3222 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3223 if (error) { 3224 WPRINTF("%s pci alloc mem bar failed", __func__); 3225 goto done; 3226 } 3227 3228 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3229 if (error) { 3230 WPRINTF("%s pci add msixcap failed", __func__); 3231 goto done; 3232 } 3233 3234 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3235 if (error) { 3236 WPRINTF("%s pci add Express capability failed", __func__); 3237 goto done; 3238 } 3239 3240 pthread_mutex_init(&sc->mtx, NULL); 3241 sem_init(&sc->iosemlock, 0, sc->ioslots); 3242 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3243 3244 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3245 /* 3246 * Controller data depends on Namespace data so initialize Namespace 3247 * data first. 3248 */ 3249 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3250 pci_nvme_init_ctrldata(sc); 3251 pci_nvme_init_logpages(sc); 3252 pci_nvme_init_features(sc); 3253 3254 pci_nvme_aer_init(sc); 3255 pci_nvme_aen_init(sc); 3256 3257 pci_nvme_reset(sc); 3258 3259 pci_lintr_request(pi); 3260 3261 done: 3262 return (error); 3263 } 3264 3265 static int 3266 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3267 { 3268 char *cp, *ram; 3269 3270 if (opts == NULL) 3271 return (0); 3272 3273 if (strncmp(opts, "ram=", 4) == 0) { 3274 cp = strchr(opts, ','); 3275 if (cp == NULL) { 3276 set_config_value_node(nvl, "ram", opts + 4); 3277 return (0); 3278 } 3279 ram = strndup(opts + 4, cp - opts - 4); 3280 set_config_value_node(nvl, "ram", ram); 3281 free(ram); 3282 return (pci_parse_legacy_config(nvl, cp + 1)); 3283 } else 3284 return (blockif_legacy_config(nvl, opts)); 3285 } 3286 3287 struct pci_devemu pci_de_nvme = { 3288 .pe_emu = "nvme", 3289 .pe_init = pci_nvme_init, 3290 .pe_legacy_config = pci_nvme_legacy_config, 3291 .pe_barwrite = pci_nvme_write, 3292 .pe_barread = pci_nvme_read 3293 }; 3294 PCI_EMUL_SET(pci_de_nvme); 3295