/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */
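/*
 * Example invocations (illustrative only; the slot number, backing path,
 * and serial shown here are hypothetical):
 *
 *   bhyve ... -s 4,nvme,/dev/zvol/rdsk/tank/vm0,ser=NVME0001 ...
 *   bhyve ... -s 4,nvme,ram=1024,maxq=8,qsz=256,dsm=enable ...
 */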
/* TODO:
 *  - create async event for smart and log
 *  - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>
#ifndef __FreeBSD__
#include <endian.h>
#endif

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define NVME_MSIX_BAR		4

#define NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define NVME_MMIO_SPACE_MIN	(1 << 14)

#define NVME_QUEUES		16
#define NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define NVME_NO_STATUS		0xffff
#define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)

#define NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define NVME_CQ_INTEN	0x01
#define NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
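/*
 * Worked example of the limits above: with NVME_MDTS = 9 and a 4KiB
 * minimum page size (NVME_MPSMIN = 0), NVME_MAX_DATA_SIZE is
 * (1 << 9) * 4096 = 2MiB per command, described by at most
 * NVME_MAX_IOVEC = 513 page descriptors (512 pages plus one extra in
 * case the first PRP entry is not page aligned).
 */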
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	uint32_t	aer_count;
};


static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
/* Controller Configuration utils */
#define NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
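/*
 * For illustration, a command failing with a generic "invalid field"
 * error is completed as:
 *
 *	uint16_t status = 0;
 *	pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
 *
 * which packs SCT=NVME_SCT_GENERIC and SC=NVME_SC_INVALID_FIELD into the
 * CQE status word; the Phase bit is folded in later by pci_nvme_cq_update().
 */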
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = 0x03;

	cd->power_state[0].mp = 10;
}
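/*
 * Note on the encodings above: SQES/CQES report queue entry sizes as
 * powers of two, so 6 corresponds to 64-byte Submission Queue Entries and
 * 4 to 16-byte Completion Queue Entries, the sizes fixed by the NVMe spec.
 * Similarly, RAB=4 advertises an arbitration burst of 2^4 commands.
 */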
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
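/*
 * Populate the Identify Namespace data structure: the capacity fields
 * (NSZE, NCAP, NUSE) are expressed in logical blocks, and a unique EUI-64
 * is synthesized from the VM name and PCI address via crc16() when the
 * user does not supply one.
 */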
static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = 310;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}

	pci_nvme_aer_init(sc);
}

#ifdef __FreeBSD__
static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (!STAILQ_EMPTY(&sc->aer_list));
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero based value while aer_count is one's based */
	return (sc->aer_count == (cd->aerl + 1));
}
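/*
 * Example of the limit check above: pci_nvme_init_ctrldata() sets AERL
 * to 4, a zero-based value, so at most 5 Async Event Requests may be
 * outstanding at once.
 */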
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))
		return (-1);

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	sc->aer_count++;

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
#ifdef __FreeBSD__
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}

	return (aer);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
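/*
 * A sketch of the PRP handling in nvme_prp_memcpy() below: the copy is
 * split at the first page boundary. E.g. with 4KiB pages, copying 4KiB
 * from prp1 = 0x...200 takes 0xe00 bytes from prp1 and the remaining
 * 0x200 bytes from prp2. Transfers larger than two pages (8KiB) would
 * require a PRP list and are rejected by this helper.
 */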
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
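/*
 * Create I/O Submission Queue. As decoded below, CDW10 carries the QID in
 * bits 15:0 and the zero-based queue size in bits 31:16; CDW11 carries the
 * PC flag in bit 0, the queue priority in bits 2:1, and the target CQID in
 * bits 31:16.
 */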
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint32_t logsize = 0;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	void *dest;
	uint16_t status = 0;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
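	/*
	 * Case 0x03 below returns the Namespace Identification Descriptor
	 * list; this emulation reports a single NIDT=1 (EUI-64) descriptor:
	 * a type byte, a length byte (8), two reserved bytes, then the
	 * 8-byte identifier.
	 */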
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
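/*
 * Note: the value returned in CDW0 above uses the NVME_FEATURE_NUM_QUEUES
 * encoding, i.e. zero-based counts with NSQA in bits 15:0 and NCQA in
 * bits 31:16, so a guest reading CDW0 = 0x000f000f was allocated 16
 * Submission and 16 Completion Queues.
 */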
static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
	struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}
	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;

	return (0);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
	struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
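/*
 * In nvme_opc_abort() above, CDW0 bit 0 set to 1 indicates the target
 * command was not aborted, which is a spec-compliant response for a
 * controller that treats Abort as advisory.
 */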
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
	struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return (1);
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return (1);
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;

	return (0);
}
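/*
 * Note that returning NVME_NO_STATUS above leaves the AER without a CQE:
 * pci_nvme_handle_admin_cmd() below checks NVME_COMPLETION_VALID() and
 * skips the completion update, so the request stays outstanding until an
 * event fires.
 */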
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
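/*
 * Worked example of the rounding above: the remainders start at 999, so
 * the first successful 4KiB write adds 8 half-KiB blocks, crosses 1000,
 * and immediately counts as one full data unit; thereafter one data unit
 * accrues per 1,000 blocks.
 */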
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
			gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
	uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
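/*
 * pci_nvme_get_ioreq() below draws from the fixed pool of ioslots:
 * iosemlock counts free slots, so sem_wait() blocks a submission until
 * pci_nvme_release_ioreq() returns a slot to the free list and posts.
 */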
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status = 0;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status = 0;

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
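/*
 * Build a blockif request from the command's PRPs. PRP1 maps the first
 * (possibly unaligned) chunk; for transfers beyond two pages, PRP2 points
 * to a PRP list whose final entry chains to the next list page, which is
 * the traversal implemented below.
 */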

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}

static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes = 0;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
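
/*
 * Sizing example: with NVME_MDTS = 9, the controller advertises transfers
 * of up to 2^9 minimum-sized (4 KiB) pages, i.e. NVME_MAX_DATA_SIZE =
 * 2 MiB. At a 512 byte sector size that is 4096 blocks, so an NLB field of
 * 4096 or more (4097+ blocks, NLB being zero based) fails the MDTS check
 * above with Invalid Field.
 */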

static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
#ifdef __FreeBSD__
	uint16_t status;
#else
	uint16_t status = 0;
#endif

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
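
/*
 * Trim state-machine example (assuming all ranges are non-zero): for a
 * DSM command with three ranges, nvme_opc_dataset_mgmt() below issues
 * blockif_delete() for iovec 0 and sets prev_size = 3;
 * pci_nvme_dealloc_sm() then re-issues the delete for iovecs 1 and 2 from
 * the completion callback and only posts the NVMe completion once the
 * final range has finished.
 */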

static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}

static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status = 0;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
				__func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
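
/*
 * Queue-wrap example: the loop above consumes submissions until head
 * catches up with the doorbell-written tail, wrapping modulo sq->size.
 * With a 16-entry queue, head 14 and a tail write of 2 process four
 * commands, in slots 14, 15, 0, and 1.
 */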

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}

static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
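
/*
 * Doorbell decode example (assuming the standard 4-byte doorbell stride,
 * i.e. CAP.DSTRD = 0): a write at NVME_DOORBELL_OFFSET + 0x0 is the admin
 * SQ tail, +0x4 the admin CQ head, +0x8 the I/O SQ 1 tail, and +0xc the
 * I/O CQ 1 head; hence idx = belloffset / 8 and
 * is_sq = (belloffset % 8) < 4 in pci_nvme_write_bar_0() below.
 */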

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		/* doorbell stride: 4-byte SQ tail, then 4-byte CQ head */
		uint64_t idx = belloffset / 8;
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
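
/*
 * Register-write example: a CC.EN 1->0 transition resets the controller
 * and clears CSTS.RDY, while 0->1 initializes the admin queues from
 * AQA/ASQ/ACQ. Note that the ASQ/ACQ low-word writes above mask the value
 * with 0xFFFFF000, keeping the admin queue base addresses 4 KiB aligned.
 */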

static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);

		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
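
/*
 * Parsing example (hypothetical option string): given
 * "/dev/zvol/tank/vol0,maxq=4,ioslots=16", strtok() yields the devpath
 * first (optidx 0, opened via blockif_open()), then each "key=value"
 * token is split at '=' and matched by name in the chain below.
 */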
2668 */ 2669 cpywithpad((char *)sc->ctrldata.sn, 2670 sizeof(sc->ctrldata.sn), config, ' '); 2671 } else if (!strcmp("ram", xopts)) { 2672 uint64_t sz = strtoull(&xopts[4], NULL, 10); 2673 2674 sc->nvstore.type = NVME_STOR_RAM; 2675 sc->nvstore.size = sz * 1024 * 1024; 2676 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 2677 sc->nvstore.sectsz = 4096; 2678 sc->nvstore.sectsz_bits = 12; 2679 if (sc->nvstore.ctx == NULL) { 2680 perror("Unable to allocate RAM"); 2681 free(uopt); 2682 return (-1); 2683 } 2684 } else if (!strcmp("eui64", xopts)) { 2685 sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0)); 2686 } else if (!strcmp("dsm", xopts)) { 2687 if (!strcmp("auto", config)) 2688 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2689 else if (!strcmp("enable", config)) 2690 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 2691 else if (!strcmp("disable", config)) 2692 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 2693 } else if (optidx == 0) { 2694 snprintf(bident, sizeof(bident), "%d:%d", 2695 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2696 sc->nvstore.ctx = blockif_open(xopts, bident); 2697 if (sc->nvstore.ctx == NULL) { 2698 perror("Could not open backing file"); 2699 free(uopt); 2700 return (-1); 2701 } 2702 sc->nvstore.type = NVME_STOR_BLOCKIF; 2703 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 2704 } else { 2705 EPRINTLN("Invalid option %s", xopts); 2706 free(uopt); 2707 return (-1); 2708 } 2709 2710 optidx++; 2711 } 2712 free(uopt); 2713 2714 if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { 2715 EPRINTLN("backing store not specified"); 2716 return (-1); 2717 } 2718 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 2719 sc->nvstore.sectsz = sectsz; 2720 else if (sc->nvstore.type != NVME_STOR_RAM) 2721 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 2722 for (sc->nvstore.sectsz_bits = 9; 2723 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 2724 sc->nvstore.sectsz_bits++); 2725 2726 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 2727 sc->max_queues = NVME_QUEUES; 2728 2729 if (sc->max_qentries <= 0) { 2730 EPRINTLN("Invalid qsz option"); 2731 return (-1); 2732 } 2733 if (sc->ioslots <= 0) { 2734 EPRINTLN("Invalid ioslots option"); 2735 return (-1); 2736 } 2737 2738 return (0); 2739 } 2740 2741 static int 2742 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 2743 { 2744 struct pci_nvme_softc *sc; 2745 uint32_t pci_membar_sz; 2746 int error; 2747 2748 error = 0; 2749 2750 sc = calloc(1, sizeof(struct pci_nvme_softc)); 2751 pi->pi_arg = sc; 2752 sc->nsc_pi = pi; 2753 2754 error = pci_nvme_parse_opts(sc, opts); 2755 if (error < 0) 2756 goto done; 2757 else 2758 error = 0; 2759 2760 STAILQ_INIT(&sc->ioreqs_free); 2761 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 2762 for (int i = 0; i < sc->ioslots; i++) { 2763 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 2764 } 2765 2766 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 2767 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 2768 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 2769 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 2770 pci_set_cfgdata8(pi, PCIR_PROGIF, 2771 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 2772 2773 /* 2774 * Allocate size of NVMe registers + doorbell space for all queues. 2775 * 2776 * The specification requires a minimum memory I/O window size of 16K. 2777 * The Windows driver will refuse to start a device with a smaller 2778 * window. 
2779 */ 2780 pci_membar_sz = sizeof(struct nvme_registers) + 2781 2 * sizeof(uint32_t) * (sc->max_queues + 1); 2782 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 2783 2784 DPRINTF("nvme membar size: %u", pci_membar_sz); 2785 2786 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 2787 if (error) { 2788 WPRINTF("%s pci alloc mem bar failed", __func__); 2789 goto done; 2790 } 2791 2792 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 2793 if (error) { 2794 WPRINTF("%s pci add msixcap failed", __func__); 2795 goto done; 2796 } 2797 2798 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 2799 if (error) { 2800 WPRINTF("%s pci add Express capability failed", __func__); 2801 goto done; 2802 } 2803 2804 pthread_mutex_init(&sc->mtx, NULL); 2805 sem_init(&sc->iosemlock, 0, sc->ioslots); 2806 2807 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 2808 /* 2809 * Controller data depends on Namespace data so initialize Namespace 2810 * data first. 2811 */ 2812 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 2813 pci_nvme_init_ctrldata(sc); 2814 pci_nvme_init_logpages(sc); 2815 pci_nvme_init_features(sc); 2816 2817 pci_nvme_aer_init(sc); 2818 2819 pci_nvme_reset(sc); 2820 2821 pci_lintr_request(pi); 2822 2823 done: 2824 return (error); 2825 } 2826 2827 2828 struct pci_devemu pci_de_nvme = { 2829 .pe_emu = "nvme", 2830 .pe_init = pci_nvme_init, 2831 .pe_barwrite = pci_nvme_write, 2832 .pe_barread = pci_nvme_read 2833 }; 2834 PCI_EMUL_SET(pci_de_nvme); 2835