/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2019 Western Digital Corporation.
 * Copyright 2020 Racktop Systems.
 * Copyright 2022 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver targets and is designed to support all NVMe 1.x devices.
 * Features are added to the driver as we encounter devices that require them
 * and as our needs dictate, so some commands or log pages may not take
 * advantage of newer features that devices support at this time. When you
 * encounter such a case, it is generally fine to add that support to the
 * driver as long as you take care to ensure that the requisite device
 * version is met before using it.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an
 * interrupt vector and will post them to a taskq for completion processing.
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding
 * up to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot to still
 * be used by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length. Queue overrun is prevented by the semaphore,
 * so a command submission may block if the queue is full.
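 *
 * As an illustration (a simplified sketch only; the authoritative logic
 * lives in nvme_submit_cmd_common() below), the CID allocation amounts to a
 * linear probe of the command array:
 *
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;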
 *
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts
 * are turned off while dumping the driver will just submit a command in the
 * regular way, and then repeatedly attempt a command retrieval until it gets
 * the command back.
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev
 * interface for each namespace found. Namespaces can have various attributes
 * to support protection information. This driver does not support any of this
 * and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev to use in the device node names. As this is currently
 * untested, namespaces with EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller
 * and one minor node for each namespace. The only operations supported by
 * those minor nodes are open(9E), close(9E), and ioctl(9E). This serves as
 * the interface for the nvmeadm(1M) utility.
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted
 * to an I/O queue. The queue is selected by taking the CPU id modulo the
 * number of queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off,
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous
 * event requests are posted to allow quicker reception of error information.
 * When an asynchronous event is posted by the hardware the driver will parse
 * the error status fields and log information or fault the device, depending
 * on the severity of the asynchronous event. The asynchronous event request
 * is then reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out too the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In all other cases the aborted command should return immediately with a
 * status indicating it was aborted, and the driver will wait indefinitely for
 * that to happen. No timeout handling of normal I/O commands is presently
 * done.
 *
 * Any command that times out due to the controller dropping dead will be put
 * on the nvme_lost_cmds list if it references DMA memory. This will prevent
 * the DMA memory from being reused by the system and later being written to
 * by a "dead" NVMe controller.
 *
 *
 * Locking:
 *
 * Each queue pair has an nq_mutex and an ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers; the ncq_mutex
 * is held when accessing completion queue state and registers.
 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this
 * case, the nc_mutex of the command to be aborted must be held across the
 * call to nvme_abort_cmd() to prevent the command from completing while the
 * abort is in progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
 * acquired first. More than one nq_mutex is never held by a single thread.
 * The ncq_mutex is only held by nvme_retrieve_cmd() and
 * nvme_process_iocq(). nvme_process_iocq() is only called from the
 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
 * mutex is non-contentious but is required for implementation completeness
 * and safety.
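 *
 * To summarize the ordering rules above (an added sketch, not part of the
 * original comment), a thread needing several of these locks must acquire
 * them in this order, skipping any it does not need:
 *
 *	nc_mutex (at most one, except when aborting)
 *	  -> ncq_mutex
 *	    -> nq_mutex (at most one)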
 *
 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 * and exclusive-open flag nm_oexcl.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * NVMe Hotplug:
 *
 * The driver supports hot removal. The driver uses the NDI event framework
 * to register a callback, nvme_remove_callback, to clean up when a disk is
 * removed. In particular, the driver will unqueue outstanding I/O commands and
 * set n_dead on the softstate to true so that other operations, such as ioctls
 * and command submissions, fail as well.
 *
 * While the callback registration relies on the NDI event framework, the
 * removal event itself is kicked off in the PCIe hotplug framework, when the
 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
 * device was removed from the slot.
 *
 * The NVMe driver instance itself will remain until the final close of the
 * device.
 *
 *
 * DDI UFM Support
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of
 * the driver's operation (see the example below):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues.
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared.
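 *
 * As an example (illustrative only; the values are arbitrary), some of these
 * properties could be set from an nvme.conf driver configuration file:
 *
 *	strict-version=0;
 *	io-squeue-len=1024;
 *	io-cqueue-len=1024;
 *	volatile-write-cache-enable=0;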
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/dkio.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);


/* NVMe spec version supported */
static const int nvme_version_major = 1;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;

/* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
int nvme_commit_save_cmd_timeout = 15;

/*
 * tunable for the size of arbitrary vendor specific admin commands,
 * default is 16MiB.
 */
uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;

/*
 * tunable for the max timeout of arbitrary vendor specific admin commands,
 * default is 60s.
 */
uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
    uint8_t, boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
    ...);
static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t,
    uint32_t *, void **, size_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_bd_free_space(void *, bd_xfer_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
    ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
    ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static ddi_ufm_ops_t nvme_ufm_ops = {
	NULL,
	nvme_ufm_fill_image,
	nvme_ufm_fill_slot,
	nvme_ufm_getcaps
};

#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
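
/*
 * Illustrative compile-time sanity checks for the minor number encoding
 * above (added as a sketch; they are not in the original source, but only
 * use the macros just defined): instance and namespace ID must round-trip.
 */
CTASSERT(NVME_MINOR_INST(NVME_MINOR(3, 2)) == 3);
CTASSERT(NVME_MINOR_NSID(NVME_MINOR(3, 2)) == 2);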
498 */ 499 static ddi_dma_attr_t nvme_queue_dma_attr = { 500 .dma_attr_version = DMA_ATTR_V0, 501 .dma_attr_addr_lo = 0, 502 .dma_attr_addr_hi = 0xffffffffffffffffULL, 503 .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1, 504 .dma_attr_align = 0x1000, 505 .dma_attr_burstsizes = 0x7ff, 506 .dma_attr_minxfer = 0x1000, 507 .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), 508 .dma_attr_seg = 0xffffffffffffffffULL, 509 .dma_attr_sgllen = 1, 510 .dma_attr_granular = 1, 511 .dma_attr_flags = 0, 512 }; 513 514 /* 515 * DMA attributes for transfers using Physical Region Page (PRP) entries 516 * 517 * A PRP entry describes one page of DMA memory using the page size specified 518 * in the controller configuration's memory page size register (CC.MPS). It uses 519 * a 64bit base address aligned to this page size. There is no limitation on 520 * chaining PRPs together for arbitrarily large DMA transfers. 521 */ 522 static ddi_dma_attr_t nvme_prp_dma_attr = { 523 .dma_attr_version = DMA_ATTR_V0, 524 .dma_attr_addr_lo = 0, 525 .dma_attr_addr_hi = 0xffffffffffffffffULL, 526 .dma_attr_count_max = 0xfff, 527 .dma_attr_align = 0x1000, 528 .dma_attr_burstsizes = 0x7ff, 529 .dma_attr_minxfer = 0x1000, 530 .dma_attr_maxxfer = 0x1000, 531 .dma_attr_seg = 0xfff, 532 .dma_attr_sgllen = -1, 533 .dma_attr_granular = 1, 534 .dma_attr_flags = 0, 535 }; 536 537 /* 538 * DMA attributes for transfers using scatter/gather lists 539 * 540 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a 541 * 32bit length field. SGL Segment and SGL Last Segment entries require the 542 * length to be a multiple of 16 bytes. 543 */ 544 static ddi_dma_attr_t nvme_sgl_dma_attr = { 545 .dma_attr_version = DMA_ATTR_V0, 546 .dma_attr_addr_lo = 0, 547 .dma_attr_addr_hi = 0xffffffffffffffffULL, 548 .dma_attr_count_max = 0xffffffffUL, 549 .dma_attr_align = 1, 550 .dma_attr_burstsizes = 0x7ff, 551 .dma_attr_minxfer = 0x10, 552 .dma_attr_maxxfer = 0xfffffffffULL, 553 .dma_attr_seg = 0xffffffffffffffffULL, 554 .dma_attr_sgllen = -1, 555 .dma_attr_granular = 0x10, 556 .dma_attr_flags = 0 557 }; 558 559 static ddi_device_acc_attr_t nvme_reg_acc_attr = { 560 .devacc_attr_version = DDI_DEVICE_ATTR_V0, 561 .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, 562 .devacc_attr_dataorder = DDI_STRICTORDER_ACC 563 }; 564 565 static struct cb_ops nvme_cb_ops = { 566 .cb_open = nvme_open, 567 .cb_close = nvme_close, 568 .cb_strategy = nodev, 569 .cb_print = nodev, 570 .cb_dump = nodev, 571 .cb_read = nodev, 572 .cb_write = nodev, 573 .cb_ioctl = nvme_ioctl, 574 .cb_devmap = nodev, 575 .cb_mmap = nodev, 576 .cb_segmap = nodev, 577 .cb_chpoll = nochpoll, 578 .cb_prop_op = ddi_prop_op, 579 .cb_str = 0, 580 .cb_flag = D_NEW | D_MP, 581 .cb_rev = CB_REV, 582 .cb_aread = nodev, 583 .cb_awrite = nodev 584 }; 585 586 static struct dev_ops nvme_dev_ops = { 587 .devo_rev = DEVO_REV, 588 .devo_refcnt = 0, 589 .devo_getinfo = ddi_no_info, 590 .devo_identify = nulldev, 591 .devo_probe = nulldev, 592 .devo_attach = nvme_attach, 593 .devo_detach = nvme_detach, 594 .devo_reset = nodev, 595 .devo_cb_ops = &nvme_cb_ops, 596 .devo_bus_ops = NULL, 597 .devo_power = NULL, 598 .devo_quiesce = nvme_quiesce, 599 }; 600 601 static struct modldrv nvme_modldrv = { 602 .drv_modops = &mod_driverops, 603 .drv_linkinfo = "NVMe v1.1b", 604 .drv_dev_ops = &nvme_dev_ops 605 }; 606 607 static struct modlinkage nvme_modlinkage = { 608 .ml_rev = MODREV_1, 609 .ml_linkage = { &nvme_modldrv, NULL } 610 }; 611 612 static bd_ops_t nvme_bd_ops = { 613 

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xffffffffUL,
	.dma_attr_align		= 1,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x10,
	.dma_attr_maxxfer	= 0xfffffffffULL,
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 0x10,
	.dma_attr_flags		= 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};

static struct cb_ops nvme_cb_ops = {
	.cb_open	= nvme_open,
	.cb_close	= nvme_close,
	.cb_strategy	= nodev,
	.cb_print	= nodev,
	.cb_dump	= nodev,
	.cb_read	= nodev,
	.cb_write	= nodev,
	.cb_ioctl	= nvme_ioctl,
	.cb_devmap	= nodev,
	.cb_mmap	= nodev,
	.cb_segmap	= nodev,
	.cb_chpoll	= nochpoll,
	.cb_prop_op	= ddi_prop_op,
	.cb_str		= 0,
	.cb_flag	= D_NEW | D_MP,
	.cb_rev		= CB_REV,
	.cb_aread	= nodev,
	.cb_awrite	= nodev
};

static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= &nvme_cb_ops,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.1b",
	.drv_dev_ops	= &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_CURRENT_VERSION,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
	.o_free_space	= nvme_bd_free_space,
};

/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them with mdb.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;

int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
	    offsetof(nvme_cmd_t, nc_list));

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	if (!list_is_empty(&nvme_lost_cmds))
		return (DDI_FAILURE);

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

	return (val);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		*ret = NULL;
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}

	bzero(dma->nd_memp, dma->nd_len);

	*ret = dma;
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	ASSERT(dma->nd_ncookie == 1);

	dma->nd_cached = B_TRUE;

	return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}

static void
nvme_free_cq(nvme_cq_t *cq)
{
	mutex_destroy(&cq->ncq_mutex);

	if (cq->ncq_cmd_taskq != NULL)
		taskq_destroy(cq->ncq_cmd_taskq);

	if (cq->ncq_dma != NULL)
		nvme_free_dma(cq->ncq_dma);

	kmem_free(cq, sizeof (*cq));
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);
	sema_destroy(&qp->nq_sema);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);

	if (qp->nq_active_cmds > 0)
		for (i = 0; i != qp->nq_nentry; i++)
			if (qp->nq_cmd[i] != NULL)
				nvme_free_cmd(qp->nq_cmd[i]);

	if (qp->nq_cmd != NULL)
		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

	kmem_free(qp, sizeof (nvme_qpair_t));
}

/*
 * Destroy the pre-allocated cq array, but only free individual completion
 * queues from the given starting index.
 */
static void
nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
{
	uint_t i;

	for (i = start; i < nvme->n_cq_count; i++)
		if (nvme->n_cq[i] != NULL)
			nvme_free_cq(nvme->n_cq[i]);

	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
}

static int
nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
    uint_t nthr)
{
	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
	char name[64];		/* large enough for the taskq name */

	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
		goto fail;

	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
	cq->ncq_nentry = nentry;
	cq->ncq_id = idx;
	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);

	/*
	 * Each completion queue has its own command taskq.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);

	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
	    TASKQ_PREPOPULATE);

	if (cq->ncq_cmd_taskq == NULL) {
		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
		    "taskq for cq %u", idx);
		goto fail;
	}

	*cqp = cq;
	return (DDI_SUCCESS);

fail:
	nvme_free_cq(cq);
	*cqp = NULL;

	return (DDI_FAILURE);
}

/*
 * Create the n_cq array big enough to hold "ncq" completion queues.
 * If the array already exists it will be re-sized (but only larger).
 * The admin queue is included in this array, which boosts the
 * max number of entries to UINT16_MAX + 1.
 */
static int
nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
{
	nvme_cq_t **cq;
	uint_t i, cq_count;

	ASSERT3U(ncq, >, nvme->n_cq_count);

	cq = nvme->n_cq;
	cq_count = nvme->n_cq_count;

	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
	nvme->n_cq_count = ncq;

	for (i = 0; i < cq_count; i++)
		nvme->n_cq[i] = cq[i];

	for (; i < nvme->n_cq_count; i++)
		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
		    DDI_SUCCESS)
			goto fail;

	if (cq != NULL)
		kmem_free(cq, sizeof (*cq) * cq_count);

	return (DDI_SUCCESS);

fail:
	nvme_destroy_cq_array(nvme, cq_count);
	/*
	 * Restore the original array
	 */
	nvme->n_cq_count = cq_count;
	nvme->n_cq = cq;

	return (DDI_FAILURE);
}

static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
    uint_t idx)
{
	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
	uint_t cq_idx;

	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));

	/*
	 * The NVMe spec defines that a full queue has one empty (unused) slot;
	 * initialize the semaphore accordingly.
	 */
	sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
		goto fail;

	/*
	 * idx == 0 is adminq, those above 0 are shared io completion queues.
	 */
	cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
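	/*
	 * Worked example (illustrative comment, not from the original
	 * source): with n_cq_count == 5 (the admin cq plus four I/O cqs),
	 * submission queues 1 through 4 map to cqs 1 through 4, and
	 * submission queue 5 wraps around to share cq 1.
	 */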
	qp->nq_cq = nvme->n_cq[cq_idx];
	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
	qp->nq_nentry = nentry;

	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);

	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
	qp->nq_next_cmd = 0;

	*nqp = qp;
	return (DDI_SUCCESS);

fail:
	nvme_free_qpair(qp);
	*nqp = NULL;

	return (DDI_FAILURE);
}

static nvme_cmd_t *
nvme_alloc_cmd(nvme_t *nvme, int kmflag)
{
	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);

	if (cmd == NULL)
		return (cmd);

	bzero(cmd, sizeof (nvme_cmd_t));

	cmd->nc_nvme = nvme;

	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));
	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);

	return (cmd);
}

static void
nvme_free_cmd(nvme_cmd_t *cmd)
{
	/* Don't free commands on the lost commands list. */
	if (list_link_active(&cmd->nc_list))
		return;

	if (cmd->nc_dma) {
		nvme_free_dma(cmd->nc_dma);
		cmd->nc_dma = NULL;
	}

	if (cmd->nc_prp) {
		kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
		cmd->nc_prp = NULL;
	}

	cv_destroy(&cmd->nc_cv);
	mutex_destroy(&cmd->nc_mutex);

	kmem_cache_free(nvme_cmd_cache, cmd);
}

static void
nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	sema_p(&qp->nq_sema);
	nvme_submit_cmd_common(qp, cmd);
}

static int
nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	if (cmd->nc_nvme->n_dead) {
		return (EIO);
	}

	if (sema_tryp(&qp->nq_sema) == 0)
		return (EAGAIN);

	nvme_submit_cmd_common(qp, cmd);
	return (0);
}

static void
nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	nvme_reg_sqtdbl_t tail = { 0 };

	mutex_enter(&qp->nq_mutex);
	cmd->nc_completed = B_FALSE;

	/*
	 * Now that we hold the queue pair lock, we must check whether or not
	 * the controller has been listed as dead (e.g. was removed due to
	 * hotplug). This is necessary as otherwise we could race with
	 * nvme_remove_callback(). Because this has not been enqueued, we don't
	 * call nvme_unqueue_cmd(), which is why we must manually decrement the
	 * semaphore.
	 */
	if (cmd->nc_nvme->n_dead) {
		taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
		    cmd, TQ_NOSLEEP, &cmd->nc_tqent);
		sema_v(&qp->nq_sema);
		mutex_exit(&qp->nq_mutex);
		return;
	}

	/*
	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
	 * slot. If the slot is already occupied advance to the next slot and
	 * try again. This can happen for long running commands like async
	 * event requests.
	 */
	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
	qp->nq_cmd[qp->nq_next_cmd] = cmd;

	qp->nq_active_cmds++;

	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;

	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);

	mutex_exit(&qp->nq_mutex);
}

static nvme_cmd_t *
nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
{
	nvme_cmd_t *cmd;

	ASSERT(mutex_owned(&qp->nq_mutex));
	ASSERT3S(cid, <, qp->nq_nentry);

	cmd = qp->nq_cmd[cid];
	qp->nq_cmd[cid] = NULL;
	ASSERT3U(qp->nq_active_cmds, >, 0);
	qp->nq_active_cmds--;
	sema_v(&qp->nq_sema);

	ASSERT3P(cmd, !=, NULL);
	ASSERT3P(cmd->nc_nvme, ==, nvme);
	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);

	return (cmd);
}

/*
 * Get the command tied to the next completed cqe and bump along completion
 * queue head counter.
 */
static nvme_cmd_t *
nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
{
	nvme_qpair_t *qp;
	nvme_cqe_t *cqe;
	nvme_cmd_t *cmd;

	ASSERT(mutex_owned(&cq->ncq_mutex));

	cqe = &cq->ncq_cq[cq->ncq_head];

	/* Check phase tag of CQE. Hardware inverts it for new entries. */
	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
		return (NULL);

	qp = nvme->n_ioq[cqe->cqe_sqid];

	mutex_enter(&qp->nq_mutex);
	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
	mutex_exit(&qp->nq_mutex);

	ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

	qp->nq_sqhead = cqe->cqe_sqhd;

	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;

	/* Toggle phase on wrap-around. */
	if (cq->ncq_head == 0)
		cq->ncq_phase = cq->ncq_phase ? 0 : 1;

	return (cmd);
}
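
/*
 * An illustration of the phase tag handling in nvme_get_completed() (added
 * note, not from the original source): ncq_phase starts out as 0 while the
 * zeroed queue memory has all phase bits 0, so no entry looks new. The
 * hardware writes first-pass entries with the phase bit set to 1, making
 * them detectable. When the head wraps to slot 0 the driver toggles
 * ncq_phase to 1, and second-pass entries are written with the phase bit
 * cleared again.
 */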
*cmd) 1356 { 1357 nvme_cqe_t *cqe = &cmd->nc_cqe; 1358 1359 switch (cqe->cqe_sf.sf_sc) { 1360 case NVME_CQE_SC_INT_NVM_WRITE: 1361 /* write fail */ 1362 /* TODO: post ereport */ 1363 if (cmd->nc_xfer != NULL) 1364 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1365 return (EIO); 1366 1367 case NVME_CQE_SC_INT_NVM_READ: 1368 /* read fail */ 1369 /* TODO: post ereport */ 1370 if (cmd->nc_xfer != NULL) 1371 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1372 return (EIO); 1373 1374 default: 1375 return (nvme_check_unknown_cmd_status(cmd)); 1376 } 1377 } 1378 1379 static int 1380 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 1381 { 1382 nvme_cqe_t *cqe = &cmd->nc_cqe; 1383 1384 switch (cqe->cqe_sf.sf_sc) { 1385 case NVME_CQE_SC_GEN_SUCCESS: 1386 return (0); 1387 1388 /* 1389 * Errors indicating a bug in the driver should cause a panic. 1390 */ 1391 case NVME_CQE_SC_GEN_INV_OPC: 1392 /* Invalid Command Opcode */ 1393 if (!cmd->nc_dontpanic) 1394 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1395 "programming error: invalid opcode in cmd %p", 1396 (void *)cmd); 1397 return (EINVAL); 1398 1399 case NVME_CQE_SC_GEN_INV_FLD: 1400 /* Invalid Field in Command */ 1401 if (!cmd->nc_dontpanic) 1402 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1403 "programming error: invalid field in cmd %p", 1404 (void *)cmd); 1405 return (EIO); 1406 1407 case NVME_CQE_SC_GEN_ID_CNFL: 1408 /* Command ID Conflict */ 1409 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1410 "cmd ID conflict in cmd %p", (void *)cmd); 1411 return (0); 1412 1413 case NVME_CQE_SC_GEN_INV_NS: 1414 /* Invalid Namespace or Format */ 1415 if (!cmd->nc_dontpanic) 1416 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1417 "programming error: invalid NS/format in cmd %p", 1418 (void *)cmd); 1419 return (EINVAL); 1420 1421 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 1422 /* LBA Out Of Range */ 1423 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1424 "LBA out of range in cmd %p", (void *)cmd); 1425 return (0); 1426 1427 /* 1428 * Non-fatal errors, handle gracefully. 1429 */ 1430 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 1431 /* Data Transfer Error (DMA) */ 1432 /* TODO: post ereport */ 1433 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); 1434 if (cmd->nc_xfer != NULL) 1435 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1436 return (EIO); 1437 1438 case NVME_CQE_SC_GEN_INTERNAL_ERR: 1439 /* 1440 * Internal Error. The spec (v1.0, section 4.5.1.2) says 1441 * detailed error information is returned as async event, 1442 * so we pretty much ignore the error here and handle it 1443 * in the async event handler. 1444 */ 1445 atomic_inc_32(&cmd->nc_nvme->n_internal_err); 1446 if (cmd->nc_xfer != NULL) 1447 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1448 return (EIO); 1449 1450 case NVME_CQE_SC_GEN_ABORT_REQUEST: 1451 /* 1452 * Command Abort Requested. This normally happens only when a 1453 * command times out. 1454 */ 1455 /* TODO: post ereport or change blkdev to handle this? 
*/ 1456 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); 1457 return (ECANCELED); 1458 1459 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 1460 /* Command Aborted due to Power Loss Notification */ 1461 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1462 cmd->nc_nvme->n_dead = B_TRUE; 1463 return (EIO); 1464 1465 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 1466 /* Command Aborted due to SQ Deletion */ 1467 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); 1468 return (EIO); 1469 1470 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 1471 /* Capacity Exceeded */ 1472 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); 1473 if (cmd->nc_xfer != NULL) 1474 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1475 return (EIO); 1476 1477 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 1478 /* Namespace Not Ready */ 1479 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); 1480 if (cmd->nc_xfer != NULL) 1481 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1482 return (EIO); 1483 1484 default: 1485 return (nvme_check_unknown_cmd_status(cmd)); 1486 } 1487 } 1488 1489 static int 1490 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 1491 { 1492 nvme_cqe_t *cqe = &cmd->nc_cqe; 1493 1494 switch (cqe->cqe_sf.sf_sc) { 1495 case NVME_CQE_SC_SPC_INV_CQ: 1496 /* Completion Queue Invalid */ 1497 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 1498 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); 1499 return (EINVAL); 1500 1501 case NVME_CQE_SC_SPC_INV_QID: 1502 /* Invalid Queue Identifier */ 1503 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1504 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 1505 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 1506 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1507 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); 1508 return (EINVAL); 1509 1510 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 1511 /* Max Queue Size Exceeded */ 1512 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1513 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1514 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); 1515 return (EINVAL); 1516 1517 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 1518 /* Abort Command Limit Exceeded */ 1519 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 1520 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1521 "abort command limit exceeded in cmd %p", (void *)cmd); 1522 return (0); 1523 1524 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 1525 /* Async Event Request Limit Exceeded */ 1526 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 1527 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1528 "async event request limit exceeded in cmd %p", 1529 (void *)cmd); 1530 return (0); 1531 1532 case NVME_CQE_SC_SPC_INV_INT_VECT: 1533 /* Invalid Interrupt Vector */ 1534 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1535 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); 1536 return (EINVAL); 1537 1538 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 1539 /* Invalid Log Page */ 1540 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 1541 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); 1542 return (EINVAL); 1543 1544 case NVME_CQE_SC_SPC_INV_FORMAT: 1545 /* Invalid Format */ 1546 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); 1547 atomic_inc_32(&cmd->nc_nvme->n_inv_format); 1548 if (cmd->nc_xfer != NULL) 1549 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1550 return (EINVAL); 1551 1552 case NVME_CQE_SC_SPC_INV_Q_DEL: 1553 /* Invalid Queue Deletion */ 1554 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1555 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); 1556 return (EINVAL); 1557 1558 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 1559 /* Conflicting Attributes */ 1560 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 1561 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1562 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1563 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); 1564 if (cmd->nc_xfer != NULL) 1565 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1566 return (EINVAL); 1567 1568 case NVME_CQE_SC_SPC_NVM_INV_PROT: 1569 /* Invalid Protection Information */ 1570 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 1571 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1572 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1573 atomic_inc_32(&cmd->nc_nvme->n_inv_prot); 1574 if (cmd->nc_xfer != NULL) 1575 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1576 return (EINVAL); 1577 1578 case NVME_CQE_SC_SPC_NVM_READONLY: 1579 /* Write to Read Only Range */ 1580 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1581 atomic_inc_32(&cmd->nc_nvme->n_readonly); 1582 if (cmd->nc_xfer != NULL) 1583 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1584 return (EROFS); 1585 1586 case NVME_CQE_SC_SPC_INV_FW_SLOT: 1587 /* Invalid Firmware Slot */ 1588 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1589 return (EINVAL); 1590 1591 case NVME_CQE_SC_SPC_INV_FW_IMG: 1592 /* Invalid Firmware Image */ 1593 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1594 return (EINVAL); 1595 1596 case NVME_CQE_SC_SPC_FW_RESET: 1597 /* Conventional Reset Required */ 1598 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1599 return (0); 1600 1601 case NVME_CQE_SC_SPC_FW_NSSR: 1602 /* NVMe Subsystem Reset Required */ 1603 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1604 return (0); 1605 1606 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 1607 /* Activation Requires Reset */ 1608 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1609 return (0); 1610 1611 case NVME_CQE_SC_SPC_FW_MTFA: 1612 /* Activation Requires Maximum Time Violation */ 1613 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1614 return (EAGAIN); 1615 1616 case NVME_CQE_SC_SPC_FW_PROHIBITED: 1617 /* Activation Prohibited */ 1618 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1619 return (EINVAL); 1620 1621 case NVME_CQE_SC_SPC_FW_OVERLAP: 1622 /* Overlapping Firmware Ranges */ 1623 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD); 1624 return (EINVAL); 1625 1626 default: 1627 return (nvme_check_unknown_cmd_status(cmd)); 1628 } 1629 } 1630 1631 static inline int 1632 nvme_check_cmd_status(nvme_cmd_t *cmd) 1633 { 1634 nvme_cqe_t *cqe = &cmd->nc_cqe; 1635 1636 /* 1637 * Take a shortcut if the controller is dead, or if 1638 * command status indicates no error. 
1639 */ 1640 if (cmd->nc_nvme->n_dead) 1641 return (EIO); 1642 1643 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1644 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 1645 return (0); 1646 1647 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 1648 return (nvme_check_generic_cmd_status(cmd)); 1649 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 1650 return (nvme_check_specific_cmd_status(cmd)); 1651 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 1652 return (nvme_check_integrity_cmd_status(cmd)); 1653 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 1654 return (nvme_check_vendor_cmd_status(cmd)); 1655 1656 return (nvme_check_unknown_cmd_status(cmd)); 1657 } 1658 1659 static int 1660 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec) 1661 { 1662 nvme_t *nvme = abort_cmd->nc_nvme; 1663 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1664 nvme_abort_cmd_t ac = { 0 }; 1665 int ret = 0; 1666 1667 sema_p(&nvme->n_abort_sema); 1668 1669 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid; 1670 ac.b.ac_sqid = abort_cmd->nc_sqid; 1671 1672 cmd->nc_sqid = 0; 1673 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 1674 cmd->nc_callback = nvme_wakeup_cmd; 1675 cmd->nc_sqe.sqe_cdw10 = ac.r; 1676 1677 /* 1678 * Send the ABORT to the hardware. The ABORT command will return _after_ 1679 * the aborted command has completed (aborted or otherwise), but since 1680 * we still hold the aborted command's mutex its callback hasn't been 1681 * processed yet. 1682 */ 1683 nvme_admin_cmd(cmd, sec); 1684 sema_v(&nvme->n_abort_sema); 1685 1686 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1687 dev_err(nvme->n_dip, CE_WARN, 1688 "!ABORT failed with sct = %x, sc = %x", 1689 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1690 atomic_inc_32(&nvme->n_abort_failed); 1691 } else { 1692 dev_err(nvme->n_dip, CE_WARN, 1693 "!ABORT of command %d/%d %ssuccessful", 1694 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid, 1695 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : ""); 1696 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0) 1697 atomic_inc_32(&nvme->n_cmd_aborted); 1698 } 1699 1700 nvme_free_cmd(cmd); 1701 return (ret); 1702 } 1703 1704 /* 1705 * nvme_wait_cmd -- wait for command completion or timeout 1706 * 1707 * In case of a serious error or a timeout of the abort command the hardware 1708 * will be declared dead and FMA will be notified. 1709 */ 1710 static void 1711 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec) 1712 { 1713 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC); 1714 nvme_t *nvme = cmd->nc_nvme; 1715 nvme_reg_csts_t csts; 1716 nvme_qpair_t *qp; 1717 1718 ASSERT(mutex_owned(&cmd->nc_mutex)); 1719 1720 while (!cmd->nc_completed) { 1721 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) 1722 break; 1723 } 1724 1725 if (cmd->nc_completed) 1726 return; 1727 1728 /* 1729 * The command timed out. 1730 * 1731 * Check controller for fatal status, any errors associated with the 1732 * register or DMA handle, or for a double timeout (abort command timed 1733 * out). If necessary log a warning and call FMA. 
1734 */ 1735 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1736 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 1737 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 1738 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 1739 atomic_inc_32(&nvme->n_cmd_timeout); 1740 1741 if (csts.b.csts_cfs || 1742 nvme_check_regs_hdl(nvme) || 1743 nvme_check_dma_hdl(cmd->nc_dma) || 1744 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 1745 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1746 nvme->n_dead = B_TRUE; 1747 } else if (nvme_abort_cmd(cmd, sec) == 0) { 1748 /* 1749 * If the abort succeeded the command should complete 1750 * immediately with an appropriate status. 1751 */ 1752 while (!cmd->nc_completed) 1753 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 1754 1755 return; 1756 } 1757 1758 qp = nvme->n_ioq[cmd->nc_sqid]; 1759 1760 mutex_enter(&qp->nq_mutex); 1761 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 1762 mutex_exit(&qp->nq_mutex); 1763 1764 /* 1765 * As we don't know what the presumed dead hardware might still do with 1766 * the DMA memory, we'll put the command on the lost commands list if it 1767 * has any DMA memory. 1768 */ 1769 if (cmd->nc_dma != NULL) { 1770 mutex_enter(&nvme_lc_mutex); 1771 list_insert_head(&nvme_lost_cmds, cmd); 1772 mutex_exit(&nvme_lc_mutex); 1773 } 1774 } 1775 1776 static void 1777 nvme_wakeup_cmd(void *arg) 1778 { 1779 nvme_cmd_t *cmd = arg; 1780 1781 mutex_enter(&cmd->nc_mutex); 1782 cmd->nc_completed = B_TRUE; 1783 cv_signal(&cmd->nc_cv); 1784 mutex_exit(&cmd->nc_mutex); 1785 } 1786 1787 static void 1788 nvme_async_event_task(void *arg) 1789 { 1790 nvme_cmd_t *cmd = arg; 1791 nvme_t *nvme = cmd->nc_nvme; 1792 nvme_error_log_entry_t *error_log = NULL; 1793 nvme_health_log_t *health_log = NULL; 1794 nvme_nschange_list_t *nslist = NULL; 1795 size_t logsize = 0; 1796 nvme_async_event_t event; 1797 1798 /* 1799 * Check for errors associated with the async request itself. The only 1800 * command-specific error is "async event limit exceeded", which 1801 * indicates a programming error in the driver and causes a panic in 1802 * nvme_check_cmd_status(). 1803 * 1804 * Other possible errors are various scenarios where the async request 1805 * was aborted, or internal errors in the device. Internal errors are 1806 * reported to FMA, the command aborts need no special handling here. 1807 * 1808 * And finally, at least qemu nvme does not support async events, 1809 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 1810 * will avoid posting async events. 1811 */ 1812 1813 if (nvme_check_cmd_status(cmd) != 0) { 1814 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1815 "!async event request returned failure, sct = %x, " 1816 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 1817 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 1818 cmd->nc_cqe.cqe_sf.sf_m); 1819 1820 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1821 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 1822 cmd->nc_nvme->n_dead = B_TRUE; 1823 ddi_fm_service_impact(cmd->nc_nvme->n_dip, 1824 DDI_SERVICE_LOST); 1825 } 1826 1827 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1828 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 1829 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 1830 nvme->n_async_event_supported = B_FALSE; 1831 } 1832 1833 nvme_free_cmd(cmd); 1834 return; 1835 } 1836 1837 event.r = cmd->nc_cqe.cqe_dw0; 1838 1839 /* Clear CQE and re-submit the async request. 
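 * The same pre-allocated command is reused for the next async event
 * request; only its completion queue entry needs to be cleared.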
*/ 1840 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 1841 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1842 1843 switch (event.b.ae_type) { 1844 case NVME_ASYNC_TYPE_ERROR: 1845 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 1846 (void) nvme_get_logpage(nvme, B_FALSE, 1847 (void **)&error_log, &logsize, event.b.ae_logpage); 1848 } else { 1849 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1850 "async event reply: %d", event.b.ae_logpage); 1851 atomic_inc_32(&nvme->n_wrong_logpage); 1852 } 1853 1854 switch (event.b.ae_info) { 1855 case NVME_ASYNC_ERROR_INV_SQ: 1856 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1857 "invalid submission queue"); 1858 return; 1859 1860 case NVME_ASYNC_ERROR_INV_DBL: 1861 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1862 "invalid doorbell write value"); 1863 return; 1864 1865 case NVME_ASYNC_ERROR_DIAGFAIL: 1866 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 1867 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1868 nvme->n_dead = B_TRUE; 1869 atomic_inc_32(&nvme->n_diagfail_event); 1870 break; 1871 1872 case NVME_ASYNC_ERROR_PERSISTENT: 1873 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 1874 "device error"); 1875 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1876 nvme->n_dead = B_TRUE; 1877 atomic_inc_32(&nvme->n_persistent_event); 1878 break; 1879 1880 case NVME_ASYNC_ERROR_TRANSIENT: 1881 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 1882 "device error"); 1883 /* TODO: send ereport */ 1884 atomic_inc_32(&nvme->n_transient_event); 1885 break; 1886 1887 case NVME_ASYNC_ERROR_FW_LOAD: 1888 dev_err(nvme->n_dip, CE_WARN, 1889 "!firmware image load error"); 1890 atomic_inc_32(&nvme->n_fw_load_event); 1891 break; 1892 } 1893 break; 1894 1895 case NVME_ASYNC_TYPE_HEALTH: 1896 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 1897 (void) nvme_get_logpage(nvme, B_FALSE, 1898 (void **)&health_log, &logsize, event.b.ae_logpage, 1899 -1); 1900 } else { 1901 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1902 "async event reply: %d", event.b.ae_logpage); 1903 atomic_inc_32(&nvme->n_wrong_logpage); 1904 } 1905 1906 switch (event.b.ae_info) { 1907 case NVME_ASYNC_HEALTH_RELIABILITY: 1908 dev_err(nvme->n_dip, CE_WARN, 1909 "!device reliability compromised"); 1910 /* TODO: send ereport */ 1911 atomic_inc_32(&nvme->n_reliability_event); 1912 break; 1913 1914 case NVME_ASYNC_HEALTH_TEMPERATURE: 1915 dev_err(nvme->n_dip, CE_WARN, 1916 "!temperature above threshold"); 1917 /* TODO: send ereport */ 1918 atomic_inc_32(&nvme->n_temperature_event); 1919 break; 1920 1921 case NVME_ASYNC_HEALTH_SPARE: 1922 dev_err(nvme->n_dip, CE_WARN, 1923 "!spare space below threshold"); 1924 /* TODO: send ereport */ 1925 atomic_inc_32(&nvme->n_spare_event); 1926 break; 1927 } 1928 break; 1929 1930 case NVME_ASYNC_TYPE_NOTICE: 1931 switch (event.b.ae_info) { 1932 case NVME_ASYNC_NOTICE_NS_CHANGE: 1933 dev_err(nvme->n_dip, CE_NOTE, 1934 "namespace attribute change event, " 1935 "logpage = %x", event.b.ae_logpage); 1936 atomic_inc_32(&nvme->n_notice_event); 1937 1938 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) 1939 break; 1940 1941 if (nvme_get_logpage(nvme, B_FALSE, (void **)&nslist, 1942 &logsize, event.b.ae_logpage, -1) != 0) { 1943 break; 1944 } 1945 1946 if (nslist->nscl_ns[0] == UINT32_MAX) { 1947 dev_err(nvme->n_dip, CE_CONT, 1948 "more than %u namespaces have changed.\n", 1949 NVME_NSCHANGE_LIST_SIZE); 1950 break; 1951 } 1952 1953 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) { 1954 uint32_t nsid = nslist->nscl_ns[i]; 1955 1956 
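/* NSIDs are 1-based, hence the n_ns[nsid - 1] indexing below. */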
if (nsid == 0) /* end of list */ 1957 break; 1958 1959 dev_err(nvme->n_dip, CE_CONT, 1960 "namespace %u (%s) has changed.\n", 1961 nsid, nvme->n_ns[nsid - 1].ns_name); 1962 /* TODO: handle namespace resize. */ 1963 } 1964 1965 break; 1966 1967 case NVME_ASYNC_NOTICE_FW_ACTIVATE: 1968 dev_err(nvme->n_dip, CE_NOTE, 1969 "firmware activation starting, " 1970 "logpage = %x", event.b.ae_logpage); 1971 atomic_inc_32(&nvme->n_notice_event); 1972 break; 1973 1974 case NVME_ASYNC_NOTICE_TELEMETRY: 1975 dev_err(nvme->n_dip, CE_NOTE, 1976 "telemetry log changed, " 1977 "logpage = %x", event.b.ae_logpage); 1978 atomic_inc_32(&nvme->n_notice_event); 1979 break; 1980 1981 case NVME_ASYNC_NOTICE_NS_ASYMM: 1982 dev_err(nvme->n_dip, CE_NOTE, 1983 "asymmetric namespace access change, " 1984 "logpage = %x", event.b.ae_logpage); 1985 atomic_inc_32(&nvme->n_notice_event); 1986 break; 1987 1988 case NVME_ASYNC_NOTICE_LATENCYLOG: 1989 dev_err(nvme->n_dip, CE_NOTE, 1990 "predictable latency event aggregate log change, " 1991 "logpage = %x", event.b.ae_logpage); 1992 atomic_inc_32(&nvme->n_notice_event); 1993 break; 1994 1995 case NVME_ASYNC_NOTICE_LBASTATUS: 1996 dev_err(nvme->n_dip, CE_NOTE, 1997 "LBA status information alert, " 1998 "logpage = %x", event.b.ae_logpage); 1999 atomic_inc_32(&nvme->n_notice_event); 2000 break; 2001 2002 case NVME_ASYNC_NOTICE_ENDURANCELOG: 2003 dev_err(nvme->n_dip, CE_NOTE, 2004 "endurance group event aggregate log page change, " 2005 "logpage = %x", event.b.ae_logpage); 2006 atomic_inc_32(&nvme->n_notice_event); 2007 break; 2008 2009 default: 2010 dev_err(nvme->n_dip, CE_WARN, 2011 "!unknown notice async event received, " 2012 "info = %x, logpage = %x", event.b.ae_info, 2013 event.b.ae_logpage); 2014 atomic_inc_32(&nvme->n_unknown_event); 2015 break; 2016 } 2017 break; 2018 2019 case NVME_ASYNC_TYPE_VENDOR: 2020 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 2021 "received, info = %x, logpage = %x", event.b.ae_info, 2022 event.b.ae_logpage); 2023 atomic_inc_32(&nvme->n_vendor_event); 2024 break; 2025 2026 default: 2027 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 2028 "type = %x, info = %x, logpage = %x", event.b.ae_type, 2029 event.b.ae_info, event.b.ae_logpage); 2030 atomic_inc_32(&nvme->n_unknown_event); 2031 break; 2032 } 2033 2034 if (error_log != NULL) 2035 kmem_free(error_log, logsize); 2036 2037 if (health_log != NULL) 2038 kmem_free(health_log, logsize); 2039 2040 if (nslist != NULL) 2041 kmem_free(nslist, logsize); 2042 } 2043 2044 static void 2045 nvme_admin_cmd(nvme_cmd_t *cmd, int sec) 2046 { 2047 mutex_enter(&cmd->nc_mutex); 2048 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd); 2049 nvme_wait_cmd(cmd, sec); 2050 mutex_exit(&cmd->nc_mutex); 2051 } 2052 2053 static void 2054 nvme_async_event(nvme_t *nvme) 2055 { 2056 nvme_cmd_t *cmd; 2057 2058 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2059 cmd->nc_sqid = 0; 2060 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 2061 cmd->nc_callback = nvme_async_event_task; 2062 cmd->nc_dontpanic = B_TRUE; 2063 2064 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 2065 } 2066 2067 static int 2068 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf, 2069 boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses) 2070 { 2071 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2072 nvme_format_nvm_t format_nvm = { 0 }; 2073 int ret; 2074 2075 format_nvm.b.fm_lbaf = lbaf & 0xf; 2076 format_nvm.b.fm_ms = ms ? 1 : 0; 2077 format_nvm.b.fm_pi = pi & 0x7; 2078 format_nvm.b.fm_pil = pil ? 
1 : 0; 2079 format_nvm.b.fm_ses = ses & 0x7; 2080 2081 cmd->nc_sqid = 0; 2082 cmd->nc_callback = nvme_wakeup_cmd; 2083 cmd->nc_sqe.sqe_nsid = nsid; 2084 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 2085 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 2086 2087 /* 2088 * Some devices like Samsung SM951 don't allow formatting of all 2089 * namespaces in one command. Handle that gracefully. 2090 */ 2091 if (nsid == (uint32_t)-1) 2092 cmd->nc_dontpanic = B_TRUE; 2093 /* 2094 * If this format request was initiated by the user, then don't allow a 2095 * programmer error to panic the system. 2096 */ 2097 if (user) 2098 cmd->nc_dontpanic = B_TRUE; 2099 2100 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 2101 2102 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2103 dev_err(nvme->n_dip, CE_WARN, 2104 "!FORMAT failed with sct = %x, sc = %x", 2105 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2106 } 2107 2108 nvme_free_cmd(cmd); 2109 return (ret); 2110 } 2111 2112 /* 2113 * The `bufsize` parameter is usually an output parameter, set by this routine 2114 * when filling in the supported types of logpages from the device. However, for 2115 * vendor-specific pages, it is an input parameter, and must be set 2116 * appropriately by callers. 2117 */ 2118 static int 2119 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 2120 uint8_t logpage, ...) 2121 { 2122 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2123 nvme_getlogpage_t getlogpage = { 0 }; 2124 va_list ap; 2125 int ret; 2126 2127 va_start(ap, logpage); 2128 2129 cmd->nc_sqid = 0; 2130 cmd->nc_callback = nvme_wakeup_cmd; 2131 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 2132 2133 if (user) 2134 cmd->nc_dontpanic = B_TRUE; 2135 2136 getlogpage.b.lp_lid = logpage; 2137 2138 switch (logpage) { 2139 case NVME_LOGPAGE_ERROR: 2140 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2141 *bufsize = MIN(NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE, 2142 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); 2143 break; 2144 2145 case NVME_LOGPAGE_HEALTH: 2146 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2147 *bufsize = sizeof (nvme_health_log_t); 2148 break; 2149 2150 case NVME_LOGPAGE_FWSLOT: 2151 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2152 *bufsize = sizeof (nvme_fwslot_log_t); 2153 break; 2154 2155 case NVME_LOGPAGE_NSCHANGE: 2156 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2157 *bufsize = sizeof (nvme_nschange_list_t); 2158 break; 2159 2160 default: 2161 /* 2162 * This intentionally only checks against the minimum valid 2163 * log page ID. `logpage` is a uint8_t, and `0xFF` is a valid 2164 * page ID, so this one-sided check avoids a compiler error 2165 * about a check that's always true. 
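 * As an aside on the Get Log Page encoding used below: NUMD is a
 * zero-based dword count, so e.g. a 512 byte page is requested as
 * 512 / 4 - 1 = 127.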
2166 */ 2167 if (logpage < NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) { 2168 dev_err(nvme->n_dip, CE_WARN, 2169 "!unknown log page requested: %d", logpage); 2170 atomic_inc_32(&nvme->n_unknown_logpage); 2171 ret = EINVAL; 2172 goto fail; 2173 } 2174 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2175 } 2176 2177 va_end(ap); 2178 2179 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; 2180 2181 cmd->nc_sqe.sqe_cdw10 = getlogpage.r; 2182 2183 if (nvme_zalloc_dma(nvme, *bufsize, 2184 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2185 dev_err(nvme->n_dip, CE_WARN, 2186 "!nvme_zalloc_dma failed for GET LOG PAGE"); 2187 ret = ENOMEM; 2188 goto fail; 2189 } 2190 2191 if ((ret = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 2192 goto fail; 2193 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2194 2195 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2196 dev_err(nvme->n_dip, CE_WARN, 2197 "!GET LOG PAGE failed with sct = %x, sc = %x", 2198 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2199 goto fail; 2200 } 2201 2202 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2203 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2204 2205 fail: 2206 nvme_free_cmd(cmd); 2207 2208 return (ret); 2209 } 2210 2211 static int 2212 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf) 2213 { 2214 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2215 int ret; 2216 2217 if (buf == NULL) 2218 return (EINVAL); 2219 2220 cmd->nc_sqid = 0; 2221 cmd->nc_callback = nvme_wakeup_cmd; 2222 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 2223 cmd->nc_sqe.sqe_nsid = nsid; 2224 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; 2225 2226 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 2227 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2228 dev_err(nvme->n_dip, CE_WARN, 2229 "!nvme_zalloc_dma failed for IDENTIFY"); 2230 ret = ENOMEM; 2231 goto fail; 2232 } 2233 2234 if (cmd->nc_dma->nd_ncookie > 2) { 2235 dev_err(nvme->n_dip, CE_WARN, 2236 "!too many DMA cookies for IDENTIFY"); 2237 atomic_inc_32(&nvme->n_too_many_cookies); 2238 ret = ENOMEM; 2239 goto fail; 2240 } 2241 2242 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2243 if (cmd->nc_dma->nd_ncookie > 1) { 2244 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2245 &cmd->nc_dma->nd_cookie); 2246 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2247 cmd->nc_dma->nd_cookie.dmac_laddress; 2248 } 2249 2250 if (user) 2251 cmd->nc_dontpanic = B_TRUE; 2252 2253 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2254 2255 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2256 dev_err(nvme->n_dip, CE_WARN, 2257 "!IDENTIFY failed with sct = %x, sc = %x", 2258 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2259 goto fail; 2260 } 2261 2262 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 2263 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 2264 2265 fail: 2266 nvme_free_cmd(cmd); 2267 2268 return (ret); 2269 } 2270 2271 static int 2272 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2273 uint32_t val, uint32_t *res) 2274 { 2275 _NOTE(ARGUNUSED(nsid)); 2276 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2277 int ret = EINVAL; 2278 2279 ASSERT(res != NULL); 2280 2281 cmd->nc_sqid = 0; 2282 cmd->nc_callback = nvme_wakeup_cmd; 2283 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 2284 cmd->nc_sqe.sqe_cdw10 = feature; 2285 cmd->nc_sqe.sqe_cdw11 = val; 2286 2287 if (user) 2288 cmd->nc_dontpanic = B_TRUE; 2289 2290 switch (feature) { 2291 case NVME_FEAT_WRITE_CACHE: 2292 if 
(!nvme->n_write_cache_present) 2293 goto fail; 2294 break; 2295 2296 case NVME_FEAT_NQUEUES: 2297 break; 2298 2299 default: 2300 goto fail; 2301 } 2302 2303 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2304 2305 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2306 dev_err(nvme->n_dip, CE_WARN, 2307 "!SET FEATURES %d failed with sct = %x, sc = %x", 2308 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2309 cmd->nc_cqe.cqe_sf.sf_sc); 2310 goto fail; 2311 } 2312 2313 *res = cmd->nc_cqe.cqe_dw0; 2314 2315 fail: 2316 nvme_free_cmd(cmd); 2317 return (ret); 2318 } 2319 2320 static int 2321 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2322 uint32_t *res, void **buf, size_t *bufsize) 2323 { 2324 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2325 int ret = EINVAL; 2326 2327 ASSERT(res != NULL); 2328 2329 if (bufsize != NULL) 2330 *bufsize = 0; 2331 2332 cmd->nc_sqid = 0; 2333 cmd->nc_callback = nvme_wakeup_cmd; 2334 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; 2335 cmd->nc_sqe.sqe_cdw10 = feature; 2336 cmd->nc_sqe.sqe_cdw11 = *res; 2337 2338 /* 2339 * For some of the optional features there doesn't seem to be a method 2340 * of detecting whether they are supported other than by using them. 2341 * This will cause an "Invalid Field in Command" error, which is 2342 * normally considered a programming error. Set the nc_dontpanic flag 2343 * to override the panic in nvme_check_generic_cmd_status(). 2344 */ 2345 switch (feature) { 2346 case NVME_FEAT_ARBITRATION: 2347 case NVME_FEAT_POWER_MGMT: 2348 case NVME_FEAT_TEMPERATURE: 2349 case NVME_FEAT_ERROR: 2350 case NVME_FEAT_NQUEUES: 2351 case NVME_FEAT_INTR_COAL: 2352 case NVME_FEAT_INTR_VECT: 2353 case NVME_FEAT_WRITE_ATOM: 2354 case NVME_FEAT_ASYNC_EVENT: 2355 break; 2356 2357 case NVME_FEAT_WRITE_CACHE: 2358 if (!nvme->n_write_cache_present) 2359 goto fail; 2360 break; 2361 2362 case NVME_FEAT_LBA_RANGE: 2363 if (!nvme->n_lba_range_supported) 2364 goto fail; 2365 2366 cmd->nc_dontpanic = B_TRUE; 2367 cmd->nc_sqe.sqe_nsid = nsid; 2368 ASSERT(bufsize != NULL); 2369 *bufsize = NVME_LBA_RANGE_BUFSIZE; 2370 break; 2371 2372 case NVME_FEAT_AUTO_PST: 2373 if (!nvme->n_auto_pst_supported) 2374 goto fail; 2375 2376 ASSERT(bufsize != NULL); 2377 *bufsize = NVME_AUTO_PST_BUFSIZE; 2378 break; 2379 2380 case NVME_FEAT_PROGRESS: 2381 if (!nvme->n_progress_supported) 2382 goto fail; 2383 2384 cmd->nc_dontpanic = B_TRUE; 2385 break; 2386 2387 default: 2388 goto fail; 2389 } 2390 2391 if (user) 2392 cmd->nc_dontpanic = B_TRUE; 2393 2394 if (bufsize != NULL && *bufsize != 0) { 2395 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 2396 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2397 dev_err(nvme->n_dip, CE_WARN, 2398 "!nvme_zalloc_dma failed for GET FEATURES"); 2399 ret = ENOMEM; 2400 goto fail; 2401 } 2402 2403 if (cmd->nc_dma->nd_ncookie > 2) { 2404 dev_err(nvme->n_dip, CE_WARN, 2405 "!too many DMA cookies for GET FEATURES"); 2406 atomic_inc_32(&nvme->n_too_many_cookies); 2407 ret = ENOMEM; 2408 goto fail; 2409 } 2410 2411 cmd->nc_sqe.sqe_dptr.d_prp[0] = 2412 cmd->nc_dma->nd_cookie.dmac_laddress; 2413 if (cmd->nc_dma->nd_ncookie > 1) { 2414 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2415 &cmd->nc_dma->nd_cookie); 2416 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2417 cmd->nc_dma->nd_cookie.dmac_laddress; 2418 } 2419 } 2420 2421 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2422 2423 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2424 boolean_t known = B_TRUE; 2425 2426 /* Check if this is an unsupported optional feature */ 2427 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2428 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) { 2429 switch (feature) { 2430 case NVME_FEAT_LBA_RANGE: 2431 nvme->n_lba_range_supported = B_FALSE; 2432 break; 2433 case NVME_FEAT_PROGRESS: 2434 nvme->n_progress_supported = B_FALSE; 2435 break; 2436 default: 2437 known = B_FALSE; 2438 break; 2439 } 2440 } else { 2441 known = B_FALSE; 2442 } 2443 2444 /* Report the error otherwise */ 2445 if (!known) { 2446 dev_err(nvme->n_dip, CE_WARN, 2447 "!GET FEATURES %d failed with sct = %x, sc = %x", 2448 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2449 cmd->nc_cqe.cqe_sf.sf_sc); 2450 } 2451 2452 goto fail; 2453 } 2454 2455 if (bufsize != NULL && *bufsize != 0) { 2456 ASSERT(buf != NULL); 2457 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2458 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2459 } 2460 2461 *res = cmd->nc_cqe.cqe_dw0; 2462 2463 fail: 2464 nvme_free_cmd(cmd); 2465 return (ret); 2466 } 2467 2468 static int 2469 nvme_write_cache_set(nvme_t *nvme, boolean_t enable) 2470 { 2471 nvme_write_cache_t nwc = { 0 }; 2472 2473 if (enable) 2474 nwc.b.wc_wce = 1; 2475 2476 return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE, 2477 nwc.r, &nwc.r)); 2478 } 2479 2480 static int 2481 nvme_set_nqueues(nvme_t *nvme) 2482 { 2483 nvme_nqueues_t nq = { 0 }; 2484 int ret; 2485 2486 /* 2487 * The default is to allocate one completion queue per vector. 2488 */ 2489 if (nvme->n_completion_queues == -1) 2490 nvme->n_completion_queues = nvme->n_intr_cnt; 2491 2492 /* 2493 * There is no point in having more completion queues than 2494 * interrupt vectors. 2495 */ 2496 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2497 nvme->n_intr_cnt); 2498 2499 /* 2500 * The default is to use one submission queue per completion queue. 2501 */ 2502 if (nvme->n_submission_queues == -1) 2503 nvme->n_submission_queues = nvme->n_completion_queues; 2504 2505 /* 2506 * There is no point in having more completion queues than 2507 * submission queues. 2508 */ 2509 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2510 nvme->n_submission_queues); 2511 2512 ASSERT(nvme->n_submission_queues > 0); 2513 ASSERT(nvme->n_completion_queues > 0); 2514 2515 nq.b.nq_nsq = nvme->n_submission_queues - 1; 2516 nq.b.nq_ncq = nvme->n_completion_queues - 1; 2517 2518 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r, 2519 &nq.r); 2520 2521 if (ret == 0) { 2522 /* 2523 * Never use more than the requested number of queues.
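 * The controller reports the number of queues actually allocated as
 * zero-based values in dword 0 of the completion queue entry, hence
 * the nq_nsq + 1 and nq_ncq + 1 below.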
2524 */ 2525 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 2526 nq.b.nq_nsq + 1); 2527 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2528 nq.b.nq_ncq + 1); 2529 } 2530 2531 return (ret); 2532 } 2533 2534 static int 2535 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 2536 { 2537 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2538 nvme_create_queue_dw10_t dw10 = { 0 }; 2539 nvme_create_cq_dw11_t c_dw11 = { 0 }; 2540 int ret; 2541 2542 dw10.b.q_qid = cq->ncq_id; 2543 dw10.b.q_qsize = cq->ncq_nentry - 1; 2544 2545 c_dw11.b.cq_pc = 1; 2546 c_dw11.b.cq_ien = 1; 2547 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 2548 2549 cmd->nc_sqid = 0; 2550 cmd->nc_callback = nvme_wakeup_cmd; 2551 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 2552 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2553 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 2554 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 2555 2556 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2557 2558 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2559 dev_err(nvme->n_dip, CE_WARN, 2560 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2561 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2562 } 2563 2564 nvme_free_cmd(cmd); 2565 2566 return (ret); 2567 } 2568 2569 static int 2570 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 2571 { 2572 nvme_cq_t *cq = qp->nq_cq; 2573 nvme_cmd_t *cmd; 2574 nvme_create_queue_dw10_t dw10 = { 0 }; 2575 nvme_create_sq_dw11_t s_dw11 = { 0 }; 2576 int ret; 2577 2578 /* 2579 * It is possible to have more qpairs than completion queues, 2580 * and when the idx > ncq_id, that completion queue is shared 2581 * and has already been created. 2582 */ 2583 if (idx <= cq->ncq_id && 2584 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 2585 return (DDI_FAILURE); 2586 2587 dw10.b.q_qid = idx; 2588 dw10.b.q_qsize = qp->nq_nentry - 1; 2589 2590 s_dw11.b.sq_pc = 1; 2591 s_dw11.b.sq_cqid = cq->ncq_id; 2592 2593 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2594 cmd->nc_sqid = 0; 2595 cmd->nc_callback = nvme_wakeup_cmd; 2596 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2597 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2598 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2599 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2600 2601 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2602 2603 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2604 dev_err(nvme->n_dip, CE_WARN, 2605 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2606 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2607 } 2608 2609 nvme_free_cmd(cmd); 2610 2611 return (ret); 2612 } 2613 2614 static boolean_t 2615 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2616 { 2617 nvme_reg_csts_t csts; 2618 int i; 2619 2620 nvme_put32(nvme, NVME_REG_CC, 0); 2621 2622 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2623 if (csts.b.csts_rdy == 1) { 2624 nvme_put32(nvme, NVME_REG_CC, 0); 2625 for (i = 0; i != nvme->n_timeout * 10; i++) { 2626 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2627 if (csts.b.csts_rdy == 0) 2628 break; 2629 2630 if (quiesce) 2631 drv_usecwait(50000); 2632 else 2633 delay(drv_usectohz(50000)); 2634 } 2635 } 2636 2637 nvme_put32(nvme, NVME_REG_AQA, 0); 2638 nvme_put32(nvme, NVME_REG_ASQ, 0); 2639 nvme_put32(nvme, NVME_REG_ACQ, 0); 2640 2641 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2642 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 2643 } 2644 2645 static void 2646 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2647 { 2648 nvme_reg_cc_t cc; 2649 nvme_reg_csts_t csts; 2650 int i; 2651 2652 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2653 2654 cc.r = nvme_get32(nvme, NVME_REG_CC); 2655 cc.b.cc_shn = mode & 0x3; 2656 nvme_put32(nvme, NVME_REG_CC, cc.r); 2657 2658 for (i = 0; i != 10; i++) { 2659 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2660 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2661 break; 2662 2663 if (quiesce) 2664 drv_usecwait(100000); 2665 else 2666 delay(drv_usectohz(100000)); 2667 } 2668 } 2669 2670 2671 static void 2672 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2673 { 2674 /* 2675 * Section 7.7 of the spec describes how to get a unique ID for 2676 * the controller: the vendor ID, the model name and the serial 2677 * number shall be unique when combined. 2678 * 2679 * If a namespace has no EUI64 we use the above and add the hex 2680 * namespace ID to get a unique ID for the namespace. 2681 */ 2682 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2683 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2684 2685 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2686 bcopy(nvme->n_idctl->id_serial, serial, 2687 sizeof (nvme->n_idctl->id_serial)); 2688 2689 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2690 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2691 2692 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2693 nvme->n_idctl->id_vid, model, serial, nsid); 2694 } 2695 2696 static int 2697 nvme_init_ns(nvme_t *nvme, int nsid) 2698 { 2699 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2700 nvme_identify_nsid_t *idns; 2701 boolean_t was_ignored; 2702 int last_rp; 2703 2704 ns->ns_nvme = nvme; 2705 2706 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2707 dev_err(nvme->n_dip, CE_WARN, 2708 "!failed to identify namespace %d", nsid); 2709 return (DDI_FAILURE); 2710 } 2711 2712 ns->ns_idns = idns; 2713 ns->ns_id = nsid; 2714 ns->ns_block_count = idns->id_nsize; 2715 ns->ns_block_size = 2716 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2717 ns->ns_best_block_size = ns->ns_block_size; 2718 2719 /* 2720 * Get the EUI64 if present. Use it for devid and device node names. 2721 */ 2722 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2723 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2724 2725 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2726 if (*(uint64_t *)ns->ns_eui64 != 0) { 2727 uint8_t *eui64 = ns->ns_eui64; 2728 2729 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2730 "%02x%02x%02x%02x%02x%02x%02x%02x", 2731 eui64[0], eui64[1], eui64[2], eui64[3], 2732 eui64[4], eui64[5], eui64[6], eui64[7]); 2733 } else { 2734 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2735 ns->ns_id); 2736 2737 nvme_prepare_devid(nvme, ns->ns_id); 2738 } 2739 2740 /* 2741 * Find the LBA format with no metadata and the best relative 2742 * performance. A value of 3 means "degraded", 0 is best. 
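 * E.g. given formats {lbads = 9, rp = 2} and {lbads = 12, rp = 0},
 * the loop below selects the 4k format and ns_best_block_size
 * becomes 4096.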
2743 */ 2744 last_rp = 3; 2745 for (int j = 0; j <= idns->id_nlbaf; j++) { 2746 if (idns->id_lbaf[j].lbaf_lbads == 0) 2747 break; 2748 if (idns->id_lbaf[j].lbaf_ms != 0) 2749 continue; 2750 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2751 continue; 2752 last_rp = idns->id_lbaf[j].lbaf_rp; 2753 ns->ns_best_block_size = 2754 1 << idns->id_lbaf[j].lbaf_lbads; 2755 } 2756 2757 if (ns->ns_best_block_size < nvme->n_min_block_size) 2758 ns->ns_best_block_size = nvme->n_min_block_size; 2759 2760 was_ignored = ns->ns_ignore; 2761 2762 /* 2763 * We currently don't support namespaces that use either: 2764 * - protection information 2765 * - illegal block size (< 512) 2766 */ 2767 if (idns->id_dps.dp_pinfo) { 2768 dev_err(nvme->n_dip, CE_WARN, 2769 "!ignoring namespace %d, unsupported feature: " 2770 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 2771 ns->ns_ignore = B_TRUE; 2772 } else if (ns->ns_block_size < 512) { 2773 dev_err(nvme->n_dip, CE_WARN, 2774 "!ignoring namespace %d, unsupported block size %"PRIu64, 2775 nsid, (uint64_t)ns->ns_block_size); 2776 ns->ns_ignore = B_TRUE; 2777 } else { 2778 ns->ns_ignore = B_FALSE; 2779 } 2780 2781 /* 2782 * Keep a count of namespaces which are attachable. 2783 * See comments in nvme_bd_driveinfo() to understand its effect. 2784 */ 2785 if (was_ignored) { 2786 /* 2787 * Previously ignored, but now not. Count it. 2788 */ 2789 if (!ns->ns_ignore) 2790 nvme->n_namespaces_attachable++; 2791 } else { 2792 /* 2793 * Wasn't ignored previously, but now needs to be. 2794 * Discount it. 2795 */ 2796 if (ns->ns_ignore) 2797 nvme->n_namespaces_attachable--; 2798 } 2799 2800 return (DDI_SUCCESS); 2801 } 2802 2803 static int 2804 nvme_init(nvme_t *nvme) 2805 { 2806 nvme_reg_cc_t cc = { 0 }; 2807 nvme_reg_aqa_t aqa = { 0 }; 2808 nvme_reg_asq_t asq = { 0 }; 2809 nvme_reg_acq_t acq = { 0 }; 2810 nvme_reg_cap_t cap; 2811 nvme_reg_vs_t vs; 2812 nvme_reg_csts_t csts; 2813 int i = 0; 2814 uint16_t nqueues; 2815 uint_t tq_threads; 2816 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2817 char *vendor, *product; 2818 2819 /* Check controller version */ 2820 vs.r = nvme_get32(nvme, NVME_REG_VS); 2821 nvme->n_version.v_major = vs.b.vs_mjr; 2822 nvme->n_version.v_minor = vs.b.vs_mnr; 2823 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 2824 nvme->n_version.v_major, nvme->n_version.v_minor); 2825 2826 if (nvme->n_version.v_major > nvme_version_major) { 2827 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 2828 nvme_version_major); 2829 if (nvme->n_strict_version) 2830 goto fail; 2831 } 2832 2833 /* retrieve controller configuration */ 2834 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2835 2836 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2837 dev_err(nvme->n_dip, CE_WARN, 2838 "!NVM command set not supported by hardware"); 2839 goto fail; 2840 } 2841 2842 nvme->n_nssr_supported = cap.b.cap_nssrs; 2843 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2844 nvme->n_timeout = cap.b.cap_to; 2845 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2846 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2847 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2848 2849 /* 2850 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2851 * the base page size of 4k (1<<12), so add 12 here to get the real 2852 * page size value. 2853 */ 2854 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2855 cap.b.cap_mpsmax + 12); 2856 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2857 2858 /* 2859 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 
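 * Queue base addresses must be page aligned; note the ASSERTs on
 * ASQ/ACQ alignment below.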
2860 */ 2861 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2862 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2863 2864 /* 2865 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2866 * Maxxfer may be increased after we have identified the controller limits. 2867 */ 2868 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2869 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2870 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2871 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2872 2873 /* 2874 * Reset the controller if it's still in ready state. 2875 */ 2876 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2877 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2878 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2879 nvme->n_dead = B_TRUE; 2880 goto fail; 2881 } 2882 2883 /* 2884 * Create the cq array with one completion queue to be assigned 2885 * to the admin queue pair and a limited number of taskqs (4). 2886 */ 2887 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 2888 DDI_SUCCESS) { 2889 dev_err(nvme->n_dip, CE_WARN, 2890 "!failed to pre-allocate admin completion queue"); 2891 goto fail; 2892 } 2893 /* 2894 * Create the admin queue pair. 2895 */ 2896 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2897 != DDI_SUCCESS) { 2898 dev_err(nvme->n_dip, CE_WARN, 2899 "!unable to allocate admin qpair"); 2900 goto fail; 2901 } 2902 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2903 nvme->n_ioq[0] = nvme->n_adminq; 2904 2905 nvme->n_progress |= NVME_ADMIN_QUEUE; 2906 2907 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2908 "admin-queue-len", nvme->n_admin_queue_len); 2909 2910 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 2911 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 2912 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 2913 2914 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 2915 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 2916 2917 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 2918 nvme_put64(nvme, NVME_REG_ASQ, asq); 2919 nvme_put64(nvme, NVME_REG_ACQ, acq); 2920 2921 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 2922 cc.b.cc_css = 0; /* use NVM command set */ 2923 cc.b.cc_mps = nvme->n_pageshift - 12; 2924 cc.b.cc_shn = 0; /* no shutdown in progress */ 2925 cc.b.cc_en = 1; /* enable controller */ 2926 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 2927 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 2928 2929 nvme_put32(nvme, NVME_REG_CC, cc.r); 2930 2931 /* 2932 * Wait for the controller to become ready. 2933 */ 2934 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2935 if (csts.b.csts_rdy == 0) { 2936 for (i = 0; i != nvme->n_timeout * 10; i++) { 2937 delay(drv_usectohz(50000)); 2938 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2939 2940 if (csts.b.csts_cfs == 1) { 2941 dev_err(nvme->n_dip, CE_WARN, 2942 "!controller fatal status at init"); 2943 ddi_fm_service_impact(nvme->n_dip, 2944 DDI_SERVICE_LOST); 2945 nvme->n_dead = B_TRUE; 2946 goto fail; 2947 } 2948 2949 if (csts.b.csts_rdy == 1) 2950 break; 2951 } 2952 } 2953 2954 if (csts.b.csts_rdy == 0) { 2955 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2956 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2957 nvme->n_dead = B_TRUE; 2958 goto fail; 2959 } 2960 2961 /* 2962 * Assume an abort command limit of 1. We'll destroy and re-init 2963 * that later when we know the true abort command limit.
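 * A semaphore is used rather than a mutex so that up to the abort
 * command limit of ABORT commands may be outstanding at any time.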
2964 */ 2965 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2966 2967 /* 2968 * Set up initial interrupt for admin queue. 2969 */ 2970 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 2971 != DDI_SUCCESS) && 2972 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 2973 != DDI_SUCCESS) && 2974 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 2975 != DDI_SUCCESS)) { 2976 dev_err(nvme->n_dip, CE_WARN, 2977 "!failed to setup initial interrupt"); 2978 goto fail; 2979 } 2980 2981 /* 2982 * Post an asynchronous event command to catch errors. 2983 * We assume the asynchronous events are supported as required by the 2984 * specification (Figure 40 in section 5 of NVMe 1.2). 2985 * However, since at least qemu does not follow the specification, 2986 * we need a mechanism to protect ourselves. 2987 */ 2988 nvme->n_async_event_supported = B_TRUE; 2989 nvme_async_event(nvme); 2990 2991 /* 2992 * Identify Controller 2993 */ 2994 if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) { 2995 dev_err(nvme->n_dip, CE_WARN, 2996 "!failed to identify controller"); 2997 goto fail; 2998 } 2999 3000 /* 3001 * Get Vendor & Product ID 3002 */ 3003 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 3004 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 3005 sata_split_model(model, &vendor, &product); 3006 3007 if (vendor == NULL) 3008 nvme->n_vendor = strdup("NVMe"); 3009 else 3010 nvme->n_vendor = strdup(vendor); 3011 3012 nvme->n_product = strdup(product); 3013 3014 /* 3015 * Get controller limits. 3016 */ 3017 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 3018 MIN(nvme->n_admin_queue_len / 10, 3019 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 3020 3021 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3022 "async-event-limit", nvme->n_async_event_limit); 3023 3024 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 3025 3026 /* 3027 * Reinitialize the semaphore with the true abort command limit 3028 * supported by the hardware. It's not necessary to disable interrupts 3029 * as only command aborts use the semaphore, and no commands are 3030 * executed or aborted while we're here. 3031 */ 3032 sema_destroy(&nvme->n_abort_sema); 3033 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 3034 SEMA_DRIVER, NULL); 3035 3036 nvme->n_progress |= NVME_CTRL_LIMITS; 3037 3038 if (nvme->n_idctl->id_mdts == 0) 3039 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 3040 else 3041 nvme->n_max_data_transfer_size = 3042 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 3043 3044 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 3045 3046 /* 3047 * Limit n_max_data_transfer_size to what we can handle in one PRP. 3048 * Chained PRPs are currently unsupported. 3049 * 3050 * This is a no-op on hardware which doesn't support a transfer size 3051 * big enough to require chained PRPs. 3052 */ 3053 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 3054 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 3055 3056 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 3057 3058 /* 3059 * Make sure the controller's minimum required queue entry sizes are 3060 * no larger, and its maximum allowed sizes no smaller, than the entry sizes we use.
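 * The driver hard-codes 64 byte (2^6) submission and 16 byte (2^4)
 * completion queue entries via CC.IOSQES/IOCQES above, so reject any
 * controller that cannot accept those sizes.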
3061 */ 3062 3063 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 3064 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 3065 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 3066 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 3067 goto fail; 3068 3069 /* 3070 * Check for the presence of a Volatile Write Cache. If present, 3071 * enable or disable based on the value of the property 3072 * volatile-write-cache-enable (default is enabled). 3073 */ 3074 nvme->n_write_cache_present = 3075 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 3076 3077 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3078 "volatile-write-cache-present", 3079 nvme->n_write_cache_present ? 1 : 0); 3080 3081 if (!nvme->n_write_cache_present) { 3082 nvme->n_write_cache_enabled = B_FALSE; 3083 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 3084 != 0) { 3085 dev_err(nvme->n_dip, CE_WARN, 3086 "!failed to %sable volatile write cache", 3087 nvme->n_write_cache_enabled ? "en" : "dis"); 3088 /* 3089 * Assume the cache is (still) enabled. 3090 */ 3091 nvme->n_write_cache_enabled = B_TRUE; 3092 } 3093 3094 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3095 "volatile-write-cache-enable", 3096 nvme->n_write_cache_enabled ? 1 : 0); 3097 3098 /* 3099 * Assume LBA Range Type feature is supported. If it isn't this 3100 * will be set to B_FALSE by nvme_get_features(). 3101 */ 3102 nvme->n_lba_range_supported = B_TRUE; 3103 3104 /* 3105 * Check support for Autonomous Power State Transition. 3106 */ 3107 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 3108 nvme->n_auto_pst_supported = 3109 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 3110 3111 /* 3112 * Assume Software Progress Marker feature is supported. If it isn't 3113 * this will be set to B_FALSE by nvme_get_features(). 3114 */ 3115 nvme->n_progress_supported = B_TRUE; 3116 3117 /* 3118 * Identify Namespaces 3119 */ 3120 nvme->n_namespace_count = nvme->n_idctl->id_nn; 3121 3122 if (nvme->n_namespace_count == 0) { 3123 dev_err(nvme->n_dip, CE_WARN, 3124 "!controllers without namespaces are not supported"); 3125 goto fail; 3126 } 3127 3128 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 3129 dev_err(nvme->n_dip, CE_WARN, 3130 "!too many namespaces: %d, limiting to %d\n", 3131 nvme->n_namespace_count, NVME_MINOR_MAX); 3132 nvme->n_namespace_count = NVME_MINOR_MAX; 3133 } 3134 3135 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 3136 nvme->n_namespace_count, KM_SLEEP); 3137 3138 for (i = 0; i != nvme->n_namespace_count; i++) { 3139 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, 3140 NULL); 3141 nvme->n_ns[i].ns_ignore = B_TRUE; 3142 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) 3143 goto fail; 3144 } 3145 3146 /* 3147 * Try to set up MSI/MSI-X interrupts. 3148 */ 3149 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 3150 != 0) { 3151 nvme_release_interrupts(nvme); 3152 3153 nqueues = MIN(UINT16_MAX, ncpus); 3154 3155 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 3156 nqueues) != DDI_SUCCESS) && 3157 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 3158 nqueues) != DDI_SUCCESS)) { 3159 dev_err(nvme->n_dip, CE_WARN, 3160 "!failed to setup MSI/MSI-X interrupts"); 3161 goto fail; 3162 } 3163 } 3164 3165 /* 3166 * Create I/O queue pairs. 
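 * This happens in several steps: negotiate the queue count via Set
 * Features, reallocate the queue array, clamp the queue lengths to the
 * controller's limits, pre-allocate the completion queues with their
 * taskqs, release surplus interrupt vectors, and finally create each
 * qpair in hardware.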
3167 */ 3168 3169 if (nvme_set_nqueues(nvme) != 0) { 3170 dev_err(nvme->n_dip, CE_WARN, 3171 "!failed to set number of I/O queues to %d", 3172 nvme->n_intr_cnt); 3173 goto fail; 3174 } 3175 3176 /* 3177 * Reallocate I/O queue array 3178 */ 3179 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 3180 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 3181 (nvme->n_submission_queues + 1), KM_SLEEP); 3182 nvme->n_ioq[0] = nvme->n_adminq; 3183 3184 /* 3185 * There should always be at least as many submission queues 3186 * as completion queues. 3187 */ 3188 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 3189 3190 nvme->n_ioq_count = nvme->n_submission_queues; 3191 3192 nvme->n_io_squeue_len = 3193 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 3194 3195 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 3196 nvme->n_io_squeue_len); 3197 3198 /* 3199 * Pre-allocate completion queues. 3200 * When there are the same number of submission and completion 3201 * queues there is no value in having a larger completion 3202 * queue length. 3203 */ 3204 if (nvme->n_submission_queues == nvme->n_completion_queues) 3205 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3206 nvme->n_io_squeue_len); 3207 3208 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3209 nvme->n_max_queue_entries); 3210 3211 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len", 3212 nvme->n_io_cqueue_len); 3213 3214 /* 3215 * Assign an equal number of taskq threads to each completion 3216 * queue, capping the total number of threads at the number 3217 * of CPUs. 3218 */ 3219 tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues; 3220 3221 /* 3222 * In case the calculation above is zero, we need at least one 3223 * thread per completion queue. 3224 */ 3225 tq_threads = MAX(1, tq_threads); 3226 3227 if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1, 3228 nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) { 3229 dev_err(nvme->n_dip, CE_WARN, 3230 "!failed to pre-allocate completion queues"); 3231 goto fail; 3232 } 3233 3234 /* 3235 * If we use fewer completion queues than interrupt vectors, return 3236 * some of the interrupt vectors back to the system. 3237 */ 3238 if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) { 3239 nvme_release_interrupts(nvme); 3240 3241 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 3242 nvme->n_completion_queues + 1) != DDI_SUCCESS) { 3243 dev_err(nvme->n_dip, CE_WARN, 3244 "!failed to reduce number of interrupts"); 3245 goto fail; 3246 } 3247 } 3248 3249 /* 3250 * Alloc & register I/O queue pairs 3251 */ 3252 3253 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3254 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len, 3255 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 3256 dev_err(nvme->n_dip, CE_WARN, 3257 "!unable to allocate I/O qpair %d", i); 3258 goto fail; 3259 } 3260 3261 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) { 3262 dev_err(nvme->n_dip, CE_WARN, 3263 "!unable to create I/O qpair %d", i); 3264 goto fail; 3265 } 3266 } 3267 3268 /* 3269 * Post more asynchronous event commands to reduce event reporting 3270 * latency as suggested by the spec.
3271 */ 3272 if (nvme->n_async_event_supported) { 3273 for (i = 1; i != nvme->n_async_event_limit; i++) 3274 nvme_async_event(nvme); 3275 } 3276 3277 return (DDI_SUCCESS); 3278 3279 fail: 3280 (void) nvme_reset(nvme, B_FALSE); 3281 return (DDI_FAILURE); 3282 } 3283 3284 static uint_t 3285 nvme_intr(caddr_t arg1, caddr_t arg2) 3286 { 3287 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 3288 nvme_t *nvme = (nvme_t *)arg1; 3289 int inum = (int)(uintptr_t)arg2; 3290 int ccnt = 0; 3291 int qnum; 3292 3293 if (inum >= nvme->n_intr_cnt) 3294 return (DDI_INTR_UNCLAIMED); 3295 3296 if (nvme->n_dead) 3297 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ? 3298 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); 3299 3300 /* 3301 * The interrupt vector a queue uses is calculated as queue_idx % 3302 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 3303 * in steps of n_intr_cnt to process all queues using this vector. 3304 */ 3305 for (qnum = inum; 3306 qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL; 3307 qnum += nvme->n_intr_cnt) { 3308 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]); 3309 } 3310 3311 return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 3312 } 3313 3314 static void 3315 nvme_release_interrupts(nvme_t *nvme) 3316 { 3317 int i; 3318 3319 for (i = 0; i < nvme->n_intr_cnt; i++) { 3320 if (nvme->n_inth[i] == NULL) 3321 break; 3322 3323 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3324 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 3325 else 3326 (void) ddi_intr_disable(nvme->n_inth[i]); 3327 3328 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 3329 (void) ddi_intr_free(nvme->n_inth[i]); 3330 } 3331 3332 kmem_free(nvme->n_inth, nvme->n_inth_sz); 3333 nvme->n_inth = NULL; 3334 nvme->n_inth_sz = 0; 3335 3336 nvme->n_progress &= ~NVME_INTERRUPTS; 3337 } 3338 3339 static int 3340 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 3341 { 3342 int nintrs, navail, count; 3343 int ret; 3344 int i; 3345 3346 if (nvme->n_intr_types == 0) { 3347 ret = ddi_intr_get_supported_types(nvme->n_dip, 3348 &nvme->n_intr_types); 3349 if (ret != DDI_SUCCESS) { 3350 dev_err(nvme->n_dip, CE_WARN, 3351 "!%s: ddi_intr_get_supported_types failed", 3352 __func__); 3353 return (ret); 3354 } 3355 #ifdef __x86 3356 if (get_hwenv() == HW_VMWARE) 3357 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 3358 #endif 3359 } 3360 3361 if ((nvme->n_intr_types & intr_type) == 0) 3362 return (DDI_FAILURE); 3363 3364 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 3365 if (ret != DDI_SUCCESS) { 3366 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 3367 __func__); 3368 return (ret); 3369 } 3370 3371 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 3372 if (ret != DDI_SUCCESS) { 3373 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 3374 __func__); 3375 return (ret); 3376 } 3377 3378 /* We want at most one interrupt per queue pair.
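 * Extra vectors would go unused, as queues are mapped onto vectors by
 * qid % n_intr_cnt (see nvme_intr() above).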
*/ 3379 if (navail > nqpairs) 3380 navail = nqpairs; 3381 3382 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 3383 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 3384 3385 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 3386 &count, 0); 3387 if (ret != DDI_SUCCESS) { 3388 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 3389 __func__); 3390 goto fail; 3391 } 3392 3393 nvme->n_intr_cnt = count; 3394 3395 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 3396 if (ret != DDI_SUCCESS) { 3397 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 3398 __func__); 3399 goto fail; 3400 } 3401 3402 for (i = 0; i < count; i++) { 3403 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 3404 (void *)nvme, (void *)(uintptr_t)i); 3405 if (ret != DDI_SUCCESS) { 3406 dev_err(nvme->n_dip, CE_WARN, 3407 "!%s: ddi_intr_add_handler failed", __func__); 3408 goto fail; 3409 } 3410 } 3411 3412 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 3413 3414 for (i = 0; i < count; i++) { 3415 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3416 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 3417 else 3418 ret = ddi_intr_enable(nvme->n_inth[i]); 3419 3420 if (ret != DDI_SUCCESS) { 3421 dev_err(nvme->n_dip, CE_WARN, 3422 "!%s: enabling interrupt %d failed", __func__, i); 3423 goto fail; 3424 } 3425 } 3426 3427 nvme->n_intr_type = intr_type; 3428 3429 nvme->n_progress |= NVME_INTERRUPTS; 3430 3431 return (DDI_SUCCESS); 3432 3433 fail: 3434 nvme_release_interrupts(nvme); 3435 3436 return (ret); 3437 } 3438 3439 static int 3440 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 3441 { 3442 _NOTE(ARGUNUSED(arg)); 3443 3444 pci_ereport_post(dip, fm_error, NULL); 3445 return (fm_error->fme_status); 3446 } 3447 3448 static void 3449 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, 3450 void *b) 3451 { 3452 nvme_t *nvme = a; 3453 3454 nvme->n_dead = B_TRUE; 3455 3456 /* 3457 * Fail all outstanding commands, including those in the admin queue 3458 * (queue 0). 3459 */ 3460 for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { 3461 nvme_qpair_t *qp = nvme->n_ioq[i]; 3462 3463 mutex_enter(&qp->nq_mutex); 3464 for (size_t j = 0; j < qp->nq_nentry; j++) { 3465 nvme_cmd_t *cmd = qp->nq_cmd[j]; 3466 nvme_cmd_t *u_cmd; 3467 3468 if (cmd == NULL) { 3469 continue; 3470 } 3471 3472 /* 3473 * Since we have the queue lock held the entire time we 3474 * iterate over it, it's not possible for the queue to 3475 * change underneath us. Thus, we don't need to check 3476 * that the return value of nvme_unqueue_cmd matches the 3477 * requested cmd to unqueue. 3478 */ 3479 u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 3480 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, 3481 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 3482 3483 ASSERT3P(u_cmd, ==, cmd); 3484 } 3485 mutex_exit(&qp->nq_mutex); 3486 } 3487 } 3488 3489 static int 3490 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3491 { 3492 nvme_t *nvme; 3493 int instance; 3494 int nregs; 3495 off_t regsize; 3496 int i; 3497 char name[32]; 3498 bd_ops_t ops = nvme_bd_ops; 3499 3500 if (cmd != DDI_ATTACH) 3501 return (DDI_FAILURE); 3502 3503 instance = ddi_get_instance(dip); 3504 3505 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 3506 return (DDI_FAILURE); 3507 3508 nvme = ddi_get_soft_state(nvme_state, instance); 3509 ddi_set_driver_private(dip, nvme); 3510 nvme->n_dip = dip; 3511 3512 /* Set up event handlers for hot removal. 
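 * On hot removal nvme_remove_callback() marks the controller dead and
 * fails all commands still outstanding on any queue.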
*/ 3513 if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, 3514 &nvme->n_rm_cookie) != DDI_SUCCESS) { 3515 goto fail; 3516 } 3517 if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, 3518 nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != 3519 DDI_SUCCESS) { 3520 goto fail; 3521 } 3522 3523 mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); 3524 3525 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3526 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 3527 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 3528 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 3529 B_TRUE : B_FALSE; 3530 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3531 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 3532 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3533 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 3534 /* 3535 * Double up the default for completion queues in case of 3536 * queue sharing. 3537 */ 3538 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3539 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 3540 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3541 DDI_PROP_DONTPASS, "async-event-limit", 3542 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 3543 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3544 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 3545 B_TRUE : B_FALSE; 3546 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3547 DDI_PROP_DONTPASS, "min-phys-block-size", 3548 NVME_DEFAULT_MIN_BLOCK_SIZE); 3549 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3550 DDI_PROP_DONTPASS, "max-submission-queues", -1); 3551 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3552 DDI_PROP_DONTPASS, "max-completion-queues", -1); 3553 3554 if (!ISP2(nvme->n_min_block_size) || 3555 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 3556 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 3557 "using default %d", ISP2(nvme->n_min_block_size) ? 3558 "too low" : "not a power of 2", 3559 NVME_DEFAULT_MIN_BLOCK_SIZE); 3560 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3561 } 3562 3563 if (nvme->n_submission_queues != -1 && 3564 (nvme->n_submission_queues < 1 || 3565 nvme->n_submission_queues > UINT16_MAX)) { 3566 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 3567 "valid. Must be [1..%d]", nvme->n_submission_queues, 3568 UINT16_MAX); 3569 nvme->n_submission_queues = -1; 3570 } 3571 3572 if (nvme->n_completion_queues != -1 && 3573 (nvme->n_completion_queues < 1 || 3574 nvme->n_completion_queues > UINT16_MAX)) { 3575 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 3576 "valid. 
Must be [1..%d]", nvme->n_completion_queues, 3577 UINT16_MAX); 3578 nvme->n_completion_queues = -1; 3579 } 3580 3581 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 3582 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 3583 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 3584 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 3585 3586 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 3587 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 3588 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 3589 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 3590 3591 if (nvme->n_async_event_limit < 1) 3592 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 3593 3594 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 3595 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 3596 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 3597 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 3598 3599 /* 3600 * Set up FMA support. 3601 */ 3602 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 3603 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 3604 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 3605 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 3606 3607 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 3608 3609 if (nvme->n_fm_cap) { 3610 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 3611 nvme->n_reg_acc_attr.devacc_attr_access = 3612 DDI_FLAGERR_ACC; 3613 3614 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 3615 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3616 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3617 } 3618 3619 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3620 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3621 pci_ereport_setup(dip); 3622 3623 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3624 ddi_fm_handler_register(dip, nvme_fm_errcb, 3625 (void *)nvme); 3626 } 3627 3628 nvme->n_progress |= NVME_FMA_INIT; 3629 3630 /* 3631 * The spec defines several register sets. Only the controller 3632 * registers (set 1) are currently used. 3633 */ 3634 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 3635 nregs < 2 || 3636 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 3637 goto fail; 3638 3639 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 3640 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 3641 dev_err(dip, CE_WARN, "!failed to map regset 1"); 3642 goto fail; 3643 } 3644 3645 nvme->n_progress |= NVME_REGS_MAPPED; 3646 3647 /* 3648 * Create PRP DMA cache 3649 */ 3650 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 3651 ddi_driver_name(dip), ddi_get_instance(dip)); 3652 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 3653 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 3654 NULL, (void *)nvme, NULL, 0); 3655 3656 if (nvme_init(nvme) != DDI_SUCCESS) 3657 goto fail; 3658 3659 if (!nvme->n_idctl->id_oncs.on_dset_mgmt) 3660 ops.o_free_space = NULL; 3661 3662 /* 3663 * Initialize the driver with the UFM subsystem 3664 */ 3665 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 3666 &nvme->n_ufmh, nvme) != 0) { 3667 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 3668 goto fail; 3669 } 3670 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 3671 ddi_ufm_update(nvme->n_ufmh); 3672 nvme->n_progress |= NVME_UFM_INIT; 3673 3674 /* 3675 * Attach the blkdev driver for each namespace. 
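 * Namespaces marked ns_ignore still get their minor node created
 * below, but are not attached to blkdev.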
	/*
	 * Attach the blkdev driver for each namespace.
	 */
	for (i = 0; i != nvme->n_namespace_count; i++) {
		if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name,
		    S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1),
		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to create minor node for namespace %d",
			    i);
			goto fail;
		}

		if (nvme->n_ns[i].ns_ignore)
			continue;

		nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
		    &ops, &nvme->n_prp_dma_attr, KM_SLEEP);

		if (nvme->n_ns[i].ns_bd_hdl == NULL) {
			dev_err(dip, CE_WARN,
			    "!failed to get blkdev handle for namespace %d",
			    i);
			goto fail;
		}

		if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
		    != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to attach blkdev handle for namespace %d",
			    i);
			goto fail;
		}
	}

	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
	    != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "nvme_attach: "
		    "cannot create devctl minor node");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	/* attach successful anyway so that FMA can retire the device */
	if (nvme->n_dead)
		return (DDI_SUCCESS);

	(void) nvme_detach(dip, DDI_DETACH);

	return (DDI_FAILURE);
}

static int
nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance, i;
	nvme_t *nvme;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	ddi_remove_minor_node(dip, "devctl");
	mutex_destroy(&nvme->n_minor.nm_mutex);

	if (nvme->n_ns) {
		for (i = 0; i != nvme->n_namespace_count; i++) {
			ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name);
			mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex);

			if (nvme->n_ns[i].ns_bd_hdl) {
				(void) bd_detach_handle(
				    nvme->n_ns[i].ns_bd_hdl);
				bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
			}

			if (nvme->n_ns[i].ns_idns)
				kmem_free(nvme->n_ns[i].ns_idns,
				    sizeof (nvme_identify_nsid_t));
			if (nvme->n_ns[i].ns_devid)
				strfree(nvme->n_ns[i].ns_devid);
		}

		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
		    nvme->n_namespace_count);
	}
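	/*
	 * The remaining teardown is gated on the NVME_* progress flags set
	 * during attach, so this function is also safe to call from the
	 * attach failure path where only part of the state exists.
	 */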
	if (nvme->n_progress & NVME_UFM_INIT) {
		ddi_ufm_fini(nvme->n_ufmh);
		mutex_destroy(&nvme->n_fwslot_mutex);
	}

	if (nvme->n_progress & NVME_INTERRUPTS)
		nvme_release_interrupts(nvme);

	for (i = 0; i < nvme->n_cq_count; i++) {
		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
	}

	if (nvme->n_ioq_count > 0) {
		for (i = 1; i != nvme->n_ioq_count + 1; i++) {
			if (nvme->n_ioq[i] != NULL) {
				/* TODO: send destroy queue commands */
				nvme_free_qpair(nvme->n_ioq[i]);
			}
		}

		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
		    (nvme->n_ioq_count + 1));
	}

	if (nvme->n_prp_cache != NULL) {
		kmem_cache_destroy(nvme->n_prp_cache);
	}

	if (nvme->n_progress & NVME_REGS_MAPPED) {
		nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
		(void) nvme_reset(nvme, B_FALSE);
	}

	if (nvme->n_progress & NVME_CTRL_LIMITS)
		sema_destroy(&nvme->n_abort_sema);

	if (nvme->n_progress & NVME_ADMIN_QUEUE)
		nvme_free_qpair(nvme->n_adminq);

	if (nvme->n_cq_count > 0) {
		nvme_destroy_cq_array(nvme, 0);
		nvme->n_cq = NULL;
		nvme->n_cq_count = 0;
	}

	if (nvme->n_idctl)
		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_progress & NVME_REGS_MAPPED)
		ddi_regs_map_free(&nvme->n_regh);

	if (nvme->n_progress & NVME_FMA_INIT) {
		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_unregister(nvme->n_dip);

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_teardown(nvme->n_dip);

		ddi_fm_fini(nvme->n_dip);
	}

	if (nvme->n_vendor != NULL)
		strfree(nvme->n_vendor);

	if (nvme->n_product != NULL)
		strfree(nvme->n_product);

	/* Clean up hot removal event handler. */
	if (nvme->n_ev_rm_cb_id != NULL) {
		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
	}
	nvme->n_ev_rm_cb_id = NULL;

	ddi_soft_state_free(nvme_state, instance);

	return (DDI_SUCCESS);
}

static int
nvme_quiesce(dev_info_t *dip)
{
	int instance;
	nvme_t *nvme;

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);

	(void) nvme_reset(nvme, B_TRUE);

	return (DDI_SUCCESS);
}
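/*
 * Fill the given command's PRP (Physical Region Page) entries from the DMA
 * cookies of a transfer. Per the NVMe spec, PRP1 points at the first page of
 * the transfer, while PRP2 either points at the second page (for transfers
 * spanning at most two pages) or at a PRP list page holding the remaining
 * page addresses. As an illustration, a page-aligned 8k transfer with a 4k
 * page size yields two cookies and thus needs no PRP list.
 */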
static int
nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
{
	nvme_t *nvme = cmd->nc_nvme;
	uint_t nprp_per_page, nprp;
	uint64_t *prp;
	const ddi_dma_cookie_t *cookie;
	uint_t idx;
	uint_t ncookies = ddi_dma_ncookies(dma);

	if (ncookies == 0)
		return (DDI_FAILURE);

	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
		return (DDI_FAILURE);
	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;

	if (ncookies == 1) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
		return (DDI_SUCCESS);
	} else if (ncookies == 2) {
		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
			return (DDI_FAILURE);
		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
		return (DDI_SUCCESS);
	}

	/*
	 * At this point, we're always operating on cookies at
	 * index >= 1 and writing the addresses of those cookies
	 * into a new page. The address of that page is stored
	 * as the second PRP entry.
	 */
	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
	ASSERT(nprp_per_page > 0);

	/*
	 * We currently don't support chained PRPs and set up our DMA
	 * attributes to reflect that. If we still get an I/O request
	 * that needs a chained PRP something is very wrong. Account
	 * for the first cookie here, which we've placed in d_prp[0].
	 */
	nprp = howmany(ncookies - 1, nprp_per_page);
	VERIFY(nprp == 1);

	/*
	 * Allocate a page of pointers, in which we'll write the
	 * addresses of cookies 1 to `ncookies`.
	 */
	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;

	prp = (uint64_t *)cmd->nc_prp->nd_memp;
	for (idx = 1; idx < ncookies; idx++) {
		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
			return (DDI_FAILURE);
		*prp++ = cookie->dmac_laddress;
	}

	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
	    DDI_DMA_SYNC_FORDEV);
	return (DDI_SUCCESS);
}

/*
 * The maximum number of requests supported for a deallocate request is
 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
 * unchanged through at least 1.4a). The definition of nvme_range_t is also
 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
 * a deallocate request will fit into the smallest supported namespace page
 * (4k).
 */
CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);

static int
nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
    int allocflag)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	nvme_t *nvme = cmd->nc_nvme;
	nvme_range_t *ranges = NULL;
	uint_t i;

	/*
	 * The number of ranges in the request is 0s based (that is
	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
	 * in blkdev.
	 */
	VERIFY3U(dfl->dfl_num_exts, >, 0);
	VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
	cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;

	cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;

	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
	if (cmd->nc_prp == NULL)
		return (DDI_FAILURE);

	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
	ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;

	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
	cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;

	for (i = 0; i < dfl->dfl_num_exts; i++) {
		uint64_t lba, len;

		lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
		len = exts[i].dfle_length / blocksize;

		VERIFY3U(len, <=, UINT32_MAX);

		/* No context attributes for a deallocate request */
		ranges[i].nr_ctxattr = 0;
		ranges[i].nr_len = len;
		ranges[i].nr_lba = lba;
	}

	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
	    DDI_DMA_SYNC_FORDEV);

	return (DDI_SUCCESS);
}
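/*
 * Translate a blkdev transfer into an NVM command: reads and writes carry
 * the starting LBA in CDW10/CDW11 and a 0's based block count in CDW12,
 * while flush and dataset management commands only need the namespace ID.
 */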
static nvme_cmd_t *
nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;
	int allocflag;

	/*
	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
	 */
	allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
	cmd = nvme_alloc_cmd(nvme, allocflag);

	if (cmd == NULL)
		return (NULL);

	cmd->nc_sqe.sqe_opc = opc;
	cmd->nc_callback = nvme_bd_xfer_done;
	cmd->nc_xfer = xfer;

	switch (opc) {
	case NVME_OPC_NVM_WRITE:
	case NVME_OPC_NVM_READ:
		VERIFY(xfer->x_nblks <= 0x10000);

		cmd->nc_sqe.sqe_nsid = ns->ns_id;

		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);

		if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
			goto fail;
		break;

	case NVME_OPC_NVM_FLUSH:
		cmd->nc_sqe.sqe_nsid = ns->ns_id;
		break;

	case NVME_OPC_NVM_DSET_MGMT:
		cmd->nc_sqe.sqe_nsid = ns->ns_id;

		if (nvme_fill_ranges(cmd, xfer,
		    (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
			goto fail;
		break;

	default:
		goto fail;
	}

	return (cmd);

fail:
	nvme_free_cmd(cmd);
	return (NULL);
}

static void
nvme_bd_xfer_done(void *arg)
{
	nvme_cmd_t *cmd = arg;
	bd_xfer_t *xfer = cmd->nc_xfer;
	int error = 0;

	error = nvme_check_cmd_status(cmd);
	nvme_free_cmd(cmd);

	bd_xfer_done(xfer, error);
}
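/*
 * Illustrative example for the d_qsize calculation below: assuming an I/O
 * submission queue length of 1024 and 4 attachable namespaces, each
 * namespace's blkdev instance gets a d_qsize of 256, so the per-namespace
 * shares together match the full submission queue length.
 */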
static void
nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;
	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);

	/*
	 * Set the blkdev qcount to the number of submission queues.
	 * It will then create one waitq/runq pair for each submission
	 * queue and spread I/O requests across the queues.
	 */
	drive->d_qcount = nvme->n_ioq_count;

	/*
	 * I/O activity to individual namespaces is distributed across
	 * each of the d_qcount blkdev queues (which has been set to
	 * the number of nvme submission queues). d_qsize is the number
	 * of submitted and not completed I/Os within each queue that blkdev
	 * will allow before it starts holding them in the waitq.
	 *
	 * Each namespace will create a child blkdev instance, for each one
	 * we try to set the d_qsize so that each namespace gets an
	 * equal portion of the submission queue.
	 *
	 * If n_namespaces_attachable changes after the nvme device has been
	 * instantiated and another namespace is attached, that namespace may
	 * calculate a different d_qsize. It may even be that the sum of the
	 * d_qsizes then exceeds the submission queue size. Should that be the
	 * case and the I/O rate is such that blkdev attempts to submit more
	 * I/Os than the size of the submission queue, the excess I/Os
	 * will be held behind the semaphore nq_sema.
	 */
	drive->d_qsize = nvme->n_io_squeue_len / ns_count;

	/*
	 * Don't let the queue size drop below the minimum, though.
	 */
	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);

	/*
	 * d_maxxfer is not set, which means the value is taken from the DMA
	 * attributes specified to bd_alloc_handle.
	 */

	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_FALSE;

	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
	drive->d_target = ns->ns_id;
	drive->d_lun = 0;

	drive->d_model = nvme->n_idctl->id_model;
	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
	drive->d_vendor = nvme->n_vendor;
	drive->d_vendor_len = strlen(nvme->n_vendor);
	drive->d_product = nvme->n_product;
	drive->d_product_len = strlen(nvme->n_product);
	drive->d_serial = nvme->n_idctl->id_serial;
	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
	drive->d_revision = nvme->n_idctl->id_fwrev;
	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);

	/*
	 * If we support the dataset management command, the only restriction
	 * on a discard request is the maximum number of ranges (segments)
	 * per single request.
	 */
	if (nvme->n_idctl->id_oncs.on_dset_mgmt)
		drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
}

static int
nvme_bd_mediainfo(void *arg, bd_media_t *media)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;

	if (nvme->n_dead) {
		return (EIO);
	}

	media->m_nblks = ns->ns_block_count;
	media->m_blksize = ns->ns_block_size;
	media->m_readonly = B_FALSE;
	media->m_solidstate = B_TRUE;

	media->m_pblksize = ns->ns_best_block_size;

	return (0);
}

static int
nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;
	nvme_qpair_t *ioq;
	boolean_t poll;
	int ret;

	if (nvme->n_dead) {
		return (EIO);
	}

	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
	if (cmd == NULL)
		return (ENOMEM);

	cmd->nc_sqid = xfer->x_qnum + 1;
	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
	ioq = nvme->n_ioq[cmd->nc_sqid];

	/*
	 * Get the polling flag before submitting the command. The command may
	 * complete immediately after it was submitted, which means we must
	 * treat both cmd and xfer as if they have been freed already.
	 */
	poll = (xfer->x_flags & BD_XFER_POLL) != 0;

	ret = nvme_submit_io_cmd(ioq, cmd);

	if (ret != 0)
		return (ret);

	if (!poll)
		return (0);

	do {
		cmd = nvme_retrieve_cmd(nvme, ioq);
		if (cmd != NULL)
			cmd->nc_callback(cmd);
		else
			drv_usecwait(10);
	} while (ioq->nq_active_cmds != 0);

	return (0);
}

static int
nvme_bd_read(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
}

static int
nvme_bd_write(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
}
static int
nvme_bd_sync(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	if (ns->ns_nvme->n_dead)
		return (EIO);

	/*
	 * If the volatile write cache is not present or not enabled the FLUSH
	 * command is a no-op, so we can take a shortcut here.
	 */
	if (!ns->ns_nvme->n_write_cache_present) {
		bd_xfer_done(xfer, ENOTSUP);
		return (0);
	}

	if (!ns->ns_nvme->n_write_cache_enabled) {
		bd_xfer_done(xfer, 0);
		return (0);
	}

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
}

static int
nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;

	if (nvme->n_dead) {
		return (EIO);
	}

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 != 0) {
		return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN,
		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
	} else {
		return (ddi_devid_init(devinfo, DEVID_ENCAP,
		    strlen(ns->ns_devid), ns->ns_devid, devid));
	}
}

static int
nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	if (xfer->x_dfl == NULL)
		return (EINVAL);

	if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
		return (ENOTSUP);

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
}
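/*
 * The minor device number encodes both the controller instance and the
 * namespace: NVME_MINOR_INST() extracts the instance and NVME_MINOR_NSID()
 * the namespace ID, with nsid 0 denoting the controller (devctl) node
 * rather than a namespace.
 */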
static int
nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(cred_p));
#endif
	minor_t minor = getminor(*devp);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;
	int rv = 0;

	if (otyp != OTYP_CHR)
		return (EINVAL);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	if (nvme->n_dead)
		return (EIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl) {
		rv = EBUSY;
		goto out;
	}

	if (flag & FEXCL) {
		if (nm->nm_ocnt != 0) {
			rv = EBUSY;
			goto out;
		}
		nm->nm_oexcl = B_TRUE;
	}

	nm->nm_ocnt++;

out:
	mutex_exit(&nm->nm_mutex);
	return (rv);
}

static int
nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(cred_p));
	_NOTE(ARGUNUSED(flag));
#endif
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;

	if (otyp != OTYP_CHR)
		return (ENXIO);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl)
		nm->nm_oexcl = B_FALSE;

	ASSERT(nm->nm_ocnt > 0);
	nm->nm_ocnt--;
	mutex_exit(&nm->nm_mutex);

	return (0);
}

static int
nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	int rv = 0;
	void *idctl;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
		return (EINVAL);

	if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0)
		return (rv);

	if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
	    != 0)
		rv = EFAULT;

	kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);

	return (rv);
}

/*
 * Execute commands on behalf of the various ioctls.
 */
static int
nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr,
    uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout)
{
	nvme_cmd_t *cmd;
	nvme_qpair_t *ioq;
	int rv = 0;

	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	if (is_admin) {
		cmd->nc_sqid = 0;
		ioq = nvme->n_adminq;
	} else {
		cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
		ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
		ioq = nvme->n_ioq[cmd->nc_sqid];
	}

	/*
	 * This function is used to facilitate requests from
	 * userspace, so don't panic if the command fails. This
	 * is especially true for admin passthru commands, where
	 * the actual command data structure is entirely defined
	 * by userspace.
	 */
	cmd->nc_dontpanic = B_TRUE;

	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe = *sqe;

	if ((rwk & (FREAD | FWRITE)) != 0) {
		if (data_addr == NULL) {
			rv = EINVAL;
			goto free_cmd;
		}

		if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ,
		    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!nvme_zalloc_dma failed for nvme_ioc_cmd()");

			rv = ENOMEM;
			goto free_cmd;
		}

		if ((rv = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0)
			goto free_cmd;

		if ((rwk & FWRITE) != 0) {
			if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp,
			    data_len, rwk & FKIOCTL) != 0) {
				rv = EFAULT;
				goto free_cmd;
			}
		}
	}

	if (is_admin) {
		nvme_admin_cmd(cmd, timeout);
	} else {
		mutex_enter(&cmd->nc_mutex);

		rv = nvme_submit_io_cmd(ioq, cmd);

		if (rv == EAGAIN) {
			mutex_exit(&cmd->nc_mutex);
			dev_err(cmd->nc_nvme->n_dip, CE_WARN,
			    "!nvme_ioc_cmd() failed, I/O Q full");
			goto free_cmd;
		}

		nvme_wait_cmd(cmd, timeout);

		mutex_exit(&cmd->nc_mutex);
	}

	if (cqe != NULL)
		*cqe = cmd->nc_cqe;

	if ((rv = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_ioc_cmd() failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);

		goto free_cmd;
	}

	if ((rwk & FREAD) != 0) {
		if (ddi_copyout(cmd->nc_dma->nd_memp,
		    data_addr, data_len, rwk & FKIOCTL) != 0)
			rv = EFAULT;
	}

free_cmd:
	nvme_free_cmd(cmd);

	return (rv);
}
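/*
 * Worked example for the page size calculation below: a controller
 * reporting cap_mpsmin == 0 supports a minimum page size of
 * 1 << (12 + 0) == 4096 bytes, while cap_mpsmax == 4 would correspond to
 * 1 << (12 + 4) == 64k.
 */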
static int
nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;
	nvme_reg_cap_t cap = { 0 };
	nvme_capabilities_t nc;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nc))
		return (EINVAL);

	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
	 * specify the base page size of 4k (1<<12), so add 12 here to
	 * get the real page size value.
	 */
	nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax);
	nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin);

	if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0)
		rv = EFAULT;

	return (rv);
}

static int
nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *log = NULL;
	size_t bufsize = 0;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	switch (nioc->n_arg) {
	case NVME_LOGPAGE_ERROR:
		if (nsid != 0)
			return (EINVAL);
		break;
	case NVME_LOGPAGE_HEALTH:
		if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0)
			return (EINVAL);

		if (nsid == 0)
			nsid = (uint32_t)-1;

		break;
	case NVME_LOGPAGE_FWSLOT:
		if (nsid != 0)
			return (EINVAL);
		break;
	default:
		if (!NVME_IS_VENDOR_SPECIFIC_LOGPAGE(nioc->n_arg))
			return (EINVAL);
		if (nioc->n_len > NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE) {
			dev_err(nvme->n_dip, CE_NOTE, "!Vendor-specific log "
			    "page size exceeds maximum supported size: "
			    "%lu", NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE);
			return (EINVAL);
		}
		if (nioc->n_len == 0)
			return (EINVAL);
		bufsize = nioc->n_len;
		if (nsid == 0)
			nsid = (uint32_t)-1;
	}

	if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid)
	    != DDI_SUCCESS)
		return (EIO);

	if (nioc->n_len < bufsize) {
		kmem_free(log, bufsize);
		return (EINVAL);
	}

	if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	nioc->n_len = bufsize;
	kmem_free(log, bufsize);

	return (rv);
}

static int
nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *buf = NULL;
	size_t bufsize = 0;
	uint32_t res = 0;
	uint8_t feature;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if ((nioc->n_arg >> 32) > 0xff)
		return (EINVAL);

	feature = (uint8_t)(nioc->n_arg >> 32);

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
	case NVME_FEAT_POWER_MGMT:
	case NVME_FEAT_ERROR:
	case NVME_FEAT_NQUEUES:
	case NVME_FEAT_INTR_COAL:
	case NVME_FEAT_WRITE_ATOM:
	case NVME_FEAT_ASYNC_EVENT:
	case NVME_FEAT_PROGRESS:
		if (nsid != 0)
			return (EINVAL);
		break;

	case NVME_FEAT_TEMPERATURE:
		if (nsid != 0)
			return (EINVAL);
		res = nioc->n_arg & 0xffffffffUL;
		if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) {
			nvme_temp_threshold_t tt;

			tt.r = res;
			if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER &&
			    tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) {
				return (EINVAL);
			}

			if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) {
				return (EINVAL);
			}
		} else if (res != 0) {
			return (EINVAL);
		}
		break;
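	/*
	 * Where a feature takes an argument (e.g. the temperature threshold
	 * above or the interrupt vector below), it is passed in the lower
	 * 32 bits of n_arg and validated before the command is issued.
	 */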
	case NVME_FEAT_INTR_VECT:
		if (nsid != 0)
			return (EINVAL);

		res = nioc->n_arg & 0xffffffffUL;
		if (res >= nvme->n_intr_cnt)
			return (EINVAL);
		break;

	case NVME_FEAT_LBA_RANGE:
		if (nvme->n_lba_range_supported == B_FALSE)
			return (EINVAL);

		if (nsid == 0 ||
		    nsid > nvme->n_namespace_count)
			return (EINVAL);

		break;

	case NVME_FEAT_WRITE_CACHE:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_write_cache_present)
			return (EINVAL);

		break;

	case NVME_FEAT_AUTO_PST:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_auto_pst_supported)
			return (EINVAL);

		break;

	default:
		return (EINVAL);
	}

	rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf,
	    &bufsize);
	if (rv != 0)
		return (rv);

	if (nioc->n_len < bufsize) {
		kmem_free(buf, bufsize);
		return (EINVAL);
	}

	if (buf && ddi_copyout(buf, (void *)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	kmem_free(buf, bufsize);
	nioc->n_arg = res;
	nioc->n_len = bufsize;

	return (rv);
}

static int
nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));

	if ((mode & FREAD) == 0)
		return (EPERM);

	nioc->n_arg = nvme->n_intr_cnt;
	return (0);
}

static int
nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nvme->n_version))
		return (ENOMEM);

	if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf,
	    sizeof (nvme->n_version), mode) != 0)
		rv = EFAULT;

	return (rv);
}
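/*
 * The lower 32 bits of n_arg are interpreted as CDW10 of the Format NVM
 * command: the LBA format index, the metadata, protection information,
 * and secure erase settings. All of them are validated below before the
 * command is sent to the device.
 */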
static int
nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	nvme_format_nvm_t frmt = { 0 };
	int c_nsid = nsid != 0 ? nsid - 1 : 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	frmt.r = nioc->n_arg & 0xffffffff;

	/*
	 * Check whether the FORMAT NVM command is supported.
	 */
	if (nvme->n_idctl->id_oacs.oa_format == 0)
		return (EINVAL);

	/*
	 * Don't allow format or secure erase of individual namespace if that
	 * would cause a format or secure erase of all namespaces.
	 */
	if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0)
		return (EINVAL);

	if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE &&
	    nvme->n_idctl->id_fna.fn_sec_erase != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting with Protection Information.
	 */
	if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal LBA format, or any LBA format
	 * that uses metadata.
	 */
	if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf ||
	    nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal Secure Erase setting.
	 */
	if (frmt.b.fm_ses > NVME_FRMT_MAX_SES ||
	    (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO &&
	    nvme->n_idctl->id_fna.fn_crypt_erase == 0))
		return (EINVAL);

	if (nsid == 0)
		nsid = (uint32_t)-1;

	return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0,
	    B_FALSE, frmt.b.fm_ses));
}

static int
nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc));
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	if (nvme->n_ns[nsid - 1].ns_ignore)
		return (0);

	rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}

static int
nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc));
	nvme_identify_nsid_t *idns;
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	/*
	 * Identify namespace again, free old identify data.
	 */
	idns = nvme->n_ns[nsid - 1].ns_idns;
	if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
		return (EIO);

	kmem_free(idns, sizeof (nvme_identify_nsid_t));

	if (nvme->n_ns[nsid - 1].ns_ignore)
		return (ENOTSUP);

	if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL)
		nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle(
		    &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr,
		    KM_SLEEP);

	rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}

static void
nvme_ufm_update(nvme_t *nvme)
{
	mutex_enter(&nvme->n_fwslot_mutex);
	ddi_ufm_update(nvme->n_ufmh);
	if (nvme->n_fwslot != NULL) {
		kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
		nvme->n_fwslot = NULL;
	}
	mutex_exit(&nvme->n_fwslot_mutex);
}
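/*
 * Illustrative example for the chunking logic below: with a 4k page size,
 * a 256k firmware image is transferred in 8k chunks, i.e. 32 Firmware
 * Image Download commands. Each command carries the chunk size in CDW10
 * as a 0's based DWORD count and the image offset in CDW11 in DWORDs.
 */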
static int
nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	int rv = 0;
	size_t len, copylen;
	offset_t offset;
	uintptr_t buf;
	nvme_sqe_t sqe = {
	    .sqe_opc	= NVME_OPC_FW_IMAGE_LOAD
	};

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid != 0)
		return (EINVAL);

	/*
	 * The offset (in n_len) is restricted to the number of DWORDs in
	 * 32 bits.
	 */
	if (nioc->n_len > NVME_FW_OFFSETB_MAX)
		return (EINVAL);

	/* Confirm that both offset and length are a multiple of DWORD bytes */
	if ((nioc->n_len & NVME_DWORD_MASK) != 0 ||
	    (nioc->n_arg & NVME_DWORD_MASK) != 0)
		return (EINVAL);

	len = nioc->n_len;
	offset = nioc->n_arg;
	buf = (uintptr_t)nioc->n_buf;
	while (len > 0 && rv == 0) {
		/*
		 * nvme_ioc_cmd() does not use SGLs or PRP lists.
		 * It is limited to 2 PRPs per NVM command, so limit
		 * the size of the data to 2 pages.
		 */
		copylen = MIN(2 * nvme->n_pagesize, len);

		sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
		sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);

		rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen,
		    FWRITE, NULL, nvme_admin_cmd_timeout);

		buf += copylen;
		offset += copylen;
		len -= copylen;
	}

	/*
	 * Let the DDI UFM subsystem know that the firmware information for
	 * this device has changed.
	 */
	nvme_ufm_update(nvme);

	return (rv);
}

static int
nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
	uint32_t slot = nioc->n_arg & 0xffffffff;
	uint32_t action = nioc->n_arg >> 32;
	nvme_cqe_t cqe = { 0 };
	nvme_sqe_t sqe = {
	    .sqe_opc	= NVME_OPC_FW_ACTIVATE
	};
	int timeout;
	int rv;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid != 0)
		return (EINVAL);

	/* Validate slot is in range. */
	if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX)
		return (EINVAL);

	switch (action) {
	case NVME_FWC_SAVE:
	case NVME_FWC_SAVE_ACTIVATE:
		timeout = nvme_commit_save_cmd_timeout;
		break;
	case NVME_FWC_ACTIVATE:
	case NVME_FWC_ACTIVATE_IMMED:
		timeout = nvme_admin_cmd_timeout;
		break;
	default:
		return (EINVAL);
	}

	fc_dw10.b.fc_slot = slot;
	fc_dw10.b.fc_action = action;
	sqe.sqe_cdw10 = fc_dw10.r;

	rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout);

	nioc->n_arg = ((uint64_t)cqe.cqe_sf.sf_sct << 16) | cqe.cqe_sf.sf_sc;

	/*
	 * Let the DDI UFM subsystem know that the firmware information for
	 * this device has changed.
	 */
	nvme_ufm_update(nvme);

	return (rv);
}

/*
 * Helper to copy in a passthru command from userspace, handling
 * different data models.
 */
static int
nvme_passthru_copy_cmd_in(const void *buf, nvme_passthru_cmd_t *cmd, int mode)
{
#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_passthru_cmd32_t cmd32;

		if (ddi_copyin(buf, (void *)&cmd32, sizeof (cmd32), mode) != 0)
			return (-1);

		cmd->npc_opcode = cmd32.npc_opcode;
		cmd->npc_timeout = cmd32.npc_timeout;
		cmd->npc_flags = cmd32.npc_flags;
		cmd->npc_cdw12 = cmd32.npc_cdw12;
		cmd->npc_cdw13 = cmd32.npc_cdw13;
		cmd->npc_cdw14 = cmd32.npc_cdw14;
		cmd->npc_cdw15 = cmd32.npc_cdw15;
		cmd->npc_buflen = cmd32.npc_buflen;
		cmd->npc_buf = cmd32.npc_buf;
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_passthru_cmd_t),
		    mode) != 0)
			return (-1);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif
	return (0);
}
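/*
 * The ILP32 variants exist so that a 32-bit userland can talk to the
 * 64-bit kernel: nvme_passthru_cmd32_t uses size32_t and uintptr32_t for
 * the buffer fields, and ddi_model_convert_from() selects the layout that
 * matches the calling process.
 */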
/*
 * Helper to copy out a passthru command result to userspace, handling
 * different data models.
 */
static int
nvme_passthru_copy_cmd_out(const nvme_passthru_cmd_t *cmd, void *buf, int mode)
{
#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_passthru_cmd32_t cmd32;

		bzero(&cmd32, sizeof (cmd32));

		cmd32.npc_opcode = cmd->npc_opcode;
		cmd32.npc_status = cmd->npc_status;
		cmd32.npc_err = cmd->npc_err;
		cmd32.npc_timeout = cmd->npc_timeout;
		cmd32.npc_flags = cmd->npc_flags;
		cmd32.npc_cdw0 = cmd->npc_cdw0;
		cmd32.npc_cdw12 = cmd->npc_cdw12;
		cmd32.npc_cdw13 = cmd->npc_cdw13;
		cmd32.npc_cdw14 = cmd->npc_cdw14;
		cmd32.npc_cdw15 = cmd->npc_cdw15;
		cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
		cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;

		if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
			return (-1);
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyout(cmd, buf, sizeof (nvme_passthru_cmd_t),
		    mode) != 0)
			return (-1);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif
	return (0);
}
/*
 * Run an arbitrary vendor-specific admin command on the device.
 */
static int
nvme_ioctl_passthru(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	int rv = 0;
	uint_t timeout = 0;
	int rwk = 0;
	nvme_passthru_cmd_t cmd;
	size_t expected_passthru_size = 0;
	nvme_sqe_t sqe;
	nvme_cqe_t cqe;

	bzero(&cmd, sizeof (cmd));
	bzero(&sqe, sizeof (sqe));
	bzero(&cqe, sizeof (cqe));

	/*
	 * Basic checks: permissions, data model, argument size.
	 */
	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	/*
	 * Compute the expected size of the argument buffer
	 */
#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32:
		expected_passthru_size = sizeof (nvme_passthru_cmd32_t);
		break;
	case DDI_MODEL_NONE:
#endif
		expected_passthru_size = sizeof (nvme_passthru_cmd_t);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif

	if (nioc->n_len != expected_passthru_size) {
		cmd.npc_err = NVME_PASSTHRU_ERR_CMD_SIZE;
		rv = EINVAL;
		goto out;
	}

	/*
	 * Ensure the device supports the standard vendor specific
	 * admin command format.
	 */
	if (!nvme->n_idctl->id_nvscc.nv_spec) {
		cmd.npc_err = NVME_PASSTHRU_ERR_NOT_SUPPORTED;
		rv = ENOTSUP;
		goto out;
	}

	if (nvme_passthru_copy_cmd_in((const void *)nioc->n_buf, &cmd, mode))
		return (EFAULT);

	if (!NVME_IS_VENDOR_SPECIFIC_CMD(cmd.npc_opcode)) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_OPCODE;
		rv = EINVAL;
		goto out;
	}

	/*
	 * This restriction is not mandated by the spec, so future work
	 * could relax it if it's necessary to support commands that both
	 * read and write.
	 */
	if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0 &&
	    (cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) {
		cmd.npc_err = NVME_PASSTHRU_ERR_READ_AND_WRITE;
		rv = EINVAL;
		goto out;
	}

	if (cmd.npc_timeout > nvme_vendor_specific_admin_cmd_max_timeout) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_TIMEOUT;
		rv = EINVAL;
		goto out;
	}
	timeout = cmd.npc_timeout;

	/*
	 * Passed-thru command buffer verification:
	 * - Size is a multiple of DWords
	 * - Non-null iff the length is non-zero
	 * - Null if neither reading nor writing data
	 * - Non-null if reading or writing
	 * - Maximum buffer size
	 */
	if ((cmd.npc_buflen % sizeof (uint32_t)) != 0) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	if (((void *)cmd.npc_buf != NULL && cmd.npc_buflen == 0) ||
	    ((void *)cmd.npc_buf == NULL && cmd.npc_buflen != 0)) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	if (cmd.npc_flags == 0 && (void *)cmd.npc_buf != NULL) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	if ((cmd.npc_flags != 0) && ((void *)cmd.npc_buf == NULL)) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	if (cmd.npc_buflen > nvme_vendor_specific_admin_cmd_size) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	if ((cmd.npc_buflen >> NVME_DWORD_SHIFT) > UINT32_MAX) {
		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
		rv = EINVAL;
		goto out;
	}

	sqe.sqe_opc = cmd.npc_opcode;
	sqe.sqe_nsid = nsid;
	sqe.sqe_cdw10 = (uint32_t)(cmd.npc_buflen >> NVME_DWORD_SHIFT);
	sqe.sqe_cdw12 = cmd.npc_cdw12;
	sqe.sqe_cdw13 = cmd.npc_cdw13;
	sqe.sqe_cdw14 = cmd.npc_cdw14;
	sqe.sqe_cdw15 = cmd.npc_cdw15;

	if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0)
		rwk = FREAD;
	else if ((cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0)
		rwk = FWRITE;

	rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)cmd.npc_buf,
	    cmd.npc_buflen, rwk, &cqe, timeout);

	cmd.npc_status = cqe.cqe_sf.sf_sc;
	cmd.npc_cdw0 = cqe.cqe_dw0;

out:
	if (nvme_passthru_copy_cmd_out(&cmd, (void *)nioc->n_buf, mode))
		rv = EFAULT;
	return (rv);
}
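/*
 * The ioctl entry point dispatches through the nvme_ioctl[] table below,
 * indexed by NVME_IOC_CMD(cmd). The order of the entries must therefore
 * match the numbering of the NVME_IOC_* commands; NULL entries and
 * out-of-range commands fail with EINVAL.
 */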
static int
nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
    int *rval_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(rval_p));
#endif
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	int rv = 0;
	nvme_ioctl_t nioc;

	int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = {
		NULL,
		nvme_ioctl_identify,
		nvme_ioctl_identify,
		nvme_ioctl_capabilities,
		nvme_ioctl_get_logpage,
		nvme_ioctl_get_features,
		nvme_ioctl_intr_cnt,
		nvme_ioctl_version,
		nvme_ioctl_format,
		nvme_ioctl_detach,
		nvme_ioctl_attach,
		nvme_ioctl_firmware_download,
		nvme_ioctl_firmware_commit,
		nvme_ioctl_passthru
	};

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	if (IS_DEVCTL(cmd))
		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;

		if (ddi_copyin((void *)arg, &nioc32, sizeof (nvme_ioctl32_t),
		    mode) != 0)
			return (EFAULT);

		nioc.n_len = nioc32.n_len;
		nioc.n_buf = nioc32.n_buf;
		nioc.n_arg = nioc32.n_arg;
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyin((void *)arg, &nioc, sizeof (nvme_ioctl_t), mode)
		    != 0)
			return (EFAULT);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif

	if (nvme->n_dead && cmd != NVME_IOC_DETACH)
		return (EIO);

	if (cmd == NVME_IOC_IDENTIFY_CTRL) {
		/*
		 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl
		 * and attachment point nodes.
		 */
		nsid = 0;
	} else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
		/*
		 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node; it
		 * will always return identify data for namespace 1.
		 */
		nsid = 1;
	}

	if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
		rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
		    cred_p);
	else
		rv = EINVAL;

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;

		nioc32.n_len = (size32_t)nioc.n_len;
		nioc32.n_buf = (uintptr32_t)nioc.n_buf;
		nioc32.n_arg = nioc.n_arg;

		if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t),
		    mode) != 0)
			return (EFAULT);
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode)
		    != 0)
			return (EFAULT);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif

	return (rv);
}

/*
 * DDI UFM Callbacks
 */
static int
nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
    ddi_ufm_image_t *img)
{
	nvme_t *nvme = arg;

	if (imgno != 0)
		return (EINVAL);

	ddi_ufm_image_set_desc(img, "Firmware");
	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);

	return (0);
}
5341 */ 5342 static int 5343 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5344 uint_t slotno, ddi_ufm_slot_t *slot) 5345 { 5346 nvme_t *nvme = arg; 5347 void *log = NULL; 5348 size_t bufsize; 5349 ddi_ufm_attr_t attr = 0; 5350 char fw_ver[NVME_FWVER_SZ + 1]; 5351 int ret; 5352 5353 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 5354 return (EINVAL); 5355 5356 mutex_enter(&nvme->n_fwslot_mutex); 5357 if (nvme->n_fwslot == NULL) { 5358 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, 5359 NVME_LOGPAGE_FWSLOT, 0); 5360 if (ret != DDI_SUCCESS || 5361 bufsize != sizeof (nvme_fwslot_log_t)) { 5362 if (log != NULL) 5363 kmem_free(log, bufsize); 5364 mutex_exit(&nvme->n_fwslot_mutex); 5365 return (EIO); 5366 } 5367 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 5368 } 5369 5370 /* 5371 * NVMe numbers firmware slots starting at 1 5372 */ 5373 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 5374 attr |= DDI_UFM_ATTR_ACTIVE; 5375 5376 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 5377 attr |= DDI_UFM_ATTR_WRITEABLE; 5378 5379 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 5380 attr |= DDI_UFM_ATTR_EMPTY; 5381 } else { 5382 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 5383 NVME_FWVER_SZ); 5384 fw_ver[NVME_FWVER_SZ] = '\0'; 5385 ddi_ufm_slot_set_version(slot, fw_ver); 5386 } 5387 mutex_exit(&nvme->n_fwslot_mutex); 5388 5389 ddi_ufm_slot_set_attrs(slot, attr); 5390 5391 return (0); 5392 } 5393 5394 static int 5395 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 5396 { 5397 *caps = DDI_UFM_CAP_REPORT; 5398 return (0); 5399 } 5400