1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2018 Nexenta Systems, Inc. 14 * Copyright 2016 Tegile Systems, Inc. All rights reserved. 15 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved. 16 * Copyright 2020 Joyent, Inc. 17 * Copyright 2019 Western Digital Corporation. 18 * Copyright 2020 Racktop Systems. 19 * Copyright 2022 Oxide Computer Company. 20 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 21 */ 22 23 /* 24 * blkdev driver for NVMe compliant storage devices 25 * 26 * This driver targets and is designed to support all NVMe 1.x devices. 27 * Features are added to the driver as we encounter devices that require them 28 * and our needs, so some commands or log pages may not take advantage of newer 29 * features that devices support at this time. When you encounter such a case, 30 * it is generally fine to add that support to the driver as long as you take 31 * care to ensure that the requisite device version is met before using it. 32 * 33 * The driver has only been tested on x86 systems and will not work on big- 34 * endian systems without changes to the code accessing registers and data 35 * structures used by the hardware. 36 * 37 * 38 * Interrupt Usage: 39 * 40 * The driver will use a single interrupt while configuring the device as the 41 * specification requires, but contrary to the specification it will try to use 42 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it 43 * will switch to multiple-message MSI(-X) if supported. The driver wants to 44 * have one interrupt vector per CPU, but it will work correctly if less are 45 * available. Interrupts can be shared by queues, the interrupt handler will 46 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only 47 * the admin queue will share an interrupt with one I/O queue. The interrupt 48 * handler will retrieve completed commands from all queues sharing an interrupt 49 * vector and will post them to a taskq for completion processing. 50 * 51 * 52 * Command Processing: 53 * 54 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up 55 * to 65536 I/O commands. The driver will configure one I/O queue pair per 56 * available interrupt vector, with the queue length usually much smaller than 57 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer 58 * interrupt vectors will be used. 59 * 60 * Additionally the hardware provides a single special admin queue pair that can 61 * hold up to 4096 admin commands. 62 * 63 * From the hardware perspective both queues of a queue pair are independent, 64 * but they share some driver state: the command array (holding pointers to 65 * commands currently being processed by the hardware) and the active command 66 * counter. Access to a submission queue and the shared state is protected by 67 * nq_mutex; completion queue is protected by ncq_mutex. 68 * 69 * When a command is submitted to a queue pair the active command counter is 70 * incremented and a pointer to the command is stored in the command array. 
The 71 * array index is used as command identifier (CID) in the submission queue 72 * entry. Some commands may take a very long time to complete, and if the queue 73 * wraps around in that time a submission may find the next array slot to still 74 * be used by a long-running command. In this case the array is sequentially 75 * searched for the next free slot. The length of the command array is the same 76 * as the configured queue length. Queue overrun is prevented by the semaphore, 77 * so a command submission may block if the queue is full. 78 * 79 * 80 * Polled I/O Support: 81 * 82 * For kernel core dump support the driver can do polled I/O. As interrupts are 83 * turned off while dumping the driver will just submit a command in the regular 84 * way, and then repeatedly attempt a command retrieval until it gets the 85 * command back. 86 * 87 * 88 * Namespace Support: 89 * 90 * NVMe devices can have multiple namespaces, each being a independent data 91 * store. The driver supports multiple namespaces and creates a blkdev interface 92 * for each namespace found. Namespaces can have various attributes to support 93 * protection information. This driver does not support any of this and ignores 94 * namespaces that have these attributes. 95 * 96 * As of NVMe 1.1 namespaces can have an 64bit Extended Unique Identifier 97 * (EUI64). This driver uses the EUI64 if present to generate the devid and 98 * passes it to blkdev to use it in the device node names. As this is currently 99 * untested namespaces with EUI64 are ignored by default. 100 * 101 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a 102 * single controller. This is an artificial limit imposed by the driver to be 103 * able to address a reasonable number of controllers and namespaces using a 104 * 32bit minor node number. 105 * 106 * 107 * Minor nodes: 108 * 109 * For each NVMe device the driver exposes one minor node for the controller and 110 * one minor node for each namespace. The only operations supported by those 111 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the 112 * interface for the nvmeadm(1M) utility. 113 * 114 * 115 * Blkdev Interface: 116 * 117 * This driver uses blkdev to do all the heavy lifting involved with presenting 118 * a disk device to the system. As a result, the processing of I/O requests is 119 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA 120 * setup, and splitting of transfers into manageable chunks. 121 * 122 * I/O requests coming in from blkdev are turned into NVM commands and posted to 123 * an I/O queue. The queue is selected by taking the CPU id modulo the number of 124 * queues. There is currently no timeout handling of I/O commands. 125 * 126 * Blkdev also supports querying device/media information and generating a 127 * devid. The driver reports the best block size as determined by the namespace 128 * format back to blkdev as physical block size to support partition and block 129 * alignment. The devid is either based on the namespace EUI64, if present, or 130 * composed using the device vendor ID, model number, serial number, and the 131 * namespace ID. 132 * 133 * 134 * Error Handling: 135 * 136 * Error handling is currently limited to detecting fatal hardware errors, 137 * either by asynchronous events, or synchronously through command status or 138 * admin command timeouts. In case of severe errors the device is fenced off, 139 * all further requests will return EIO. FMA is then called to fault the device. 
140 * 141 * The hardware has a limit for outstanding asynchronous event requests. Before 142 * this limit is known the driver assumes it is at least 1 and posts a single 143 * asynchronous request. Later when the limit is known more asynchronous event 144 * requests are posted to allow quicker reception of error information. When an 145 * asynchronous event is posted by the hardware the driver will parse the error 146 * status fields and log information or fault the device, depending on the 147 * severity of the asynchronous event. The asynchronous event request is then 148 * reused and posted to the admin queue again. 149 * 150 * On command completion the command status is checked for errors. In case of 151 * errors indicating a driver bug the driver panics. Almost all other error 152 * status values just cause EIO to be returned. 153 * 154 * Command timeouts are currently detected for all admin commands except 155 * asynchronous event requests. If a command times out and the hardware appears 156 * to be healthy the driver attempts to abort the command. The original command 157 * timeout is also applied to the abort command. If the abort times out too the 158 * driver assumes the device to be dead, fences it off, and calls FMA to retire 159 * it. In all other cases the aborted command should return immediately with a 160 * status indicating it was aborted, and the driver will wait indefinitely for 161 * that to happen. No timeout handling of normal I/O commands is presently done. 162 * 163 * Any command that times out due to the controller dropping dead will be put on 164 * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA 165 * memory being reused by the system and later be written to by a "dead" NVMe 166 * controller. 167 * 168 * 169 * Locking: 170 * 171 * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held 172 * when accessing shared state and submission queue registers, ncq_mutex 173 * is held when accessing completion queue state and registers. 174 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while 175 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both 176 * mutexes themselves. 177 * 178 * Each command also has its own nc_mutex, which is associated with the 179 * condition variable nc_cv. It is only used on admin commands which are run 180 * synchronously. In that case it must be held across calls to 181 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by 182 * nvme_admin_cmd(). It must also be held whenever the completion state of the 183 * command is changed or while a admin command timeout is handled. 184 * 185 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first. 186 * More than one nc_mutex may only be held when aborting commands. In this case, 187 * the nc_mutex of the command to be aborted must be held across the call to 188 * nvme_abort_cmd() to prevent the command from completing while the abort is in 189 * progress. 190 * 191 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be 192 * acquired first. More than one nq_mutex is never held by a single thread. 193 * The ncq_mutex is only held by nvme_retrieve_cmd() and 194 * nvme_process_iocq(). nvme_process_iocq() is only called from the 195 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the 196 * mutex is non-contentious but is required for implementation completeness 197 * and safety. 
198 * 199 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt 200 * and exclusive-open flag nm_oexcl. 201 * 202 * 203 * Quiesce / Fast Reboot: 204 * 205 * The driver currently does not support fast reboot. A quiesce(9E) entry point 206 * is still provided which is used to send a shutdown notification to the 207 * device. 208 * 209 * 210 * NVMe Hotplug: 211 * 212 * The driver supports hot removal. The driver uses the NDI event framework 213 * to register a callback, nvme_remove_callback, to clean up when a disk is 214 * removed. In particular, the driver will unqueue outstanding I/O commands and 215 * set n_dead on the softstate to true so that other operations, such as ioctls 216 * and command submissions, fail as well. 217 * 218 * While the callback registration relies on the NDI event framework, the 219 * removal event itself is kicked off in the PCIe hotplug framework, when the 220 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a 221 * device was removed from the slot. 222 * 223 * The NVMe driver instance itself will remain until the final close of the 224 * device. 225 * 226 * 227 * DDI UFM Support 228 * 229 * The driver supports the DDI UFM framework for reporting information about 230 * the device's firmware image and slot configuration. This data can be 231 * queried by userland software via ioctls to the ufm driver. For more 232 * information, see ddi_ufm(9E). 233 * 234 * 235 * Driver Configuration: 236 * 237 * The following driver properties can be changed to control some aspects of the 238 * drivers operation: 239 * - strict-version: can be set to 0 to allow devices conforming to newer 240 * major versions to be used 241 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor 242 * specific command status as a fatal error leading device faulting 243 * - admin-queue-len: the maximum length of the admin queue (16-4096) 244 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536) 245 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536) 246 * - async-event-limit: the maximum number of asynchronous event requests to be 247 * posted by the driver 248 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write 249 * cache 250 * - min-phys-block-size: the minimum physical block size to report to blkdev, 251 * which is among other things the basis for ZFS vdev ashift 252 * - max-submission-queues: the maximum number of I/O submission queues. 253 * - max-completion-queues: the maximum number of I/O completion queues, 254 * can be less than max-submission-queues, in which case the completion 255 * queues are shared. 
256 * 257 * 258 * TODO: 259 * - figure out sane default for I/O queue depth reported to blkdev 260 * - FMA handling of media errors 261 * - support for devices supporting very large I/O requests using chained PRPs 262 * - support for configuring hardware parameters like interrupt coalescing 263 * - support for media formatting and hard partitioning into namespaces 264 * - support for big-endian systems 265 * - support for fast reboot 266 * - support for NVMe Subsystem Reset (1.1) 267 * - support for Scatter/Gather lists (1.1) 268 * - support for Reservations (1.1) 269 * - support for power management 270 */ 271 272 #include <sys/byteorder.h> 273 #ifdef _BIG_ENDIAN 274 #error nvme driver needs porting for big-endian platforms 275 #endif 276 277 #include <sys/modctl.h> 278 #include <sys/conf.h> 279 #include <sys/devops.h> 280 #include <sys/ddi.h> 281 #include <sys/ddi_ufm.h> 282 #include <sys/sunddi.h> 283 #include <sys/sunndi.h> 284 #include <sys/bitmap.h> 285 #include <sys/sysmacros.h> 286 #include <sys/param.h> 287 #include <sys/varargs.h> 288 #include <sys/cpuvar.h> 289 #include <sys/disp.h> 290 #include <sys/blkdev.h> 291 #include <sys/atomic.h> 292 #include <sys/archsystm.h> 293 #include <sys/sata/sata_hba.h> 294 #include <sys/stat.h> 295 #include <sys/policy.h> 296 #include <sys/list.h> 297 #include <sys/dkio.h> 298 299 #include <sys/nvme.h> 300 301 #ifdef __x86 302 #include <sys/x86_archext.h> 303 #endif 304 305 #include "nvme_reg.h" 306 #include "nvme_var.h" 307 308 /* 309 * Assertions to make sure that we've properly captured various aspects of the 310 * packed structures and haven't broken them during updates. 311 */ 312 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000); 313 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256); 314 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512); 315 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520); 316 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768); 317 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792); 318 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048); 319 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072); 320 321 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000); 322 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32); 323 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92); 324 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104); 325 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128); 326 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384); 327 328 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000); 329 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32); 330 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64); 331 332 333 /* NVMe spec version supported */ 334 static const int nvme_version_major = 1; 335 336 /* tunable for admin command timeout in seconds, default is 1s */ 337 int nvme_admin_cmd_timeout = 1; 338 339 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */ 340 int nvme_format_cmd_timeout = 600; 341 342 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */ 343 int nvme_commit_save_cmd_timeout = 15; 344 345 /* 346 * tunable for the size of arbitrary vendor specific admin commands, 347 * default is 16MiB. 348 */ 349 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24; 350 351 /* 352 * tunable for the max timeout of arbitary vendor specific admin commands, 353 * default is 60s. 
354 */ 355 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60; 356 357 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); 358 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); 359 static int nvme_quiesce(dev_info_t *); 360 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *); 361 static int nvme_setup_interrupts(nvme_t *, int, int); 362 static void nvme_release_interrupts(nvme_t *); 363 static uint_t nvme_intr(caddr_t, caddr_t); 364 365 static void nvme_shutdown(nvme_t *, int, boolean_t); 366 static boolean_t nvme_reset(nvme_t *, boolean_t); 367 static int nvme_init(nvme_t *); 368 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int); 369 static void nvme_free_cmd(nvme_cmd_t *); 370 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t, 371 bd_xfer_t *); 372 static void nvme_admin_cmd(nvme_cmd_t *, int); 373 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *); 374 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *); 375 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *); 376 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int); 377 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *); 378 static void nvme_wait_cmd(nvme_cmd_t *, uint_t); 379 static void nvme_wakeup_cmd(void *); 380 static void nvme_async_event_task(void *); 381 382 static int nvme_check_unknown_cmd_status(nvme_cmd_t *); 383 static int nvme_check_vendor_cmd_status(nvme_cmd_t *); 384 static int nvme_check_integrity_cmd_status(nvme_cmd_t *); 385 static int nvme_check_specific_cmd_status(nvme_cmd_t *); 386 static int nvme_check_generic_cmd_status(nvme_cmd_t *); 387 static inline int nvme_check_cmd_status(nvme_cmd_t *); 388 389 static int nvme_abort_cmd(nvme_cmd_t *, uint_t); 390 static void nvme_async_event(nvme_t *); 391 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t, 392 uint8_t, boolean_t, uint8_t); 393 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t, 394 ...); 395 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **); 396 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t, 397 uint32_t *); 398 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *, 399 void **, size_t *); 400 static int nvme_write_cache_set(nvme_t *, boolean_t); 401 static int nvme_set_nqueues(nvme_t *); 402 403 static void nvme_free_dma(nvme_dma_t *); 404 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *, 405 nvme_dma_t **); 406 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t, 407 nvme_dma_t **); 408 static void nvme_free_qpair(nvme_qpair_t *); 409 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t); 410 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t); 411 412 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t); 413 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t); 414 static inline uint64_t nvme_get64(nvme_t *, uintptr_t); 415 static inline uint32_t nvme_get32(nvme_t *, uintptr_t); 416 417 static boolean_t nvme_check_regs_hdl(nvme_t *); 418 static boolean_t nvme_check_dma_hdl(nvme_dma_t *); 419 420 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t); 421 422 static void nvme_bd_xfer_done(void *); 423 static void nvme_bd_driveinfo(void *, bd_drive_t *); 424 static int nvme_bd_mediainfo(void *, bd_media_t *); 425 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t); 426 static int nvme_bd_read(void *, bd_xfer_t *); 427 
static int nvme_bd_write(void *, bd_xfer_t *); 428 static int nvme_bd_sync(void *, bd_xfer_t *); 429 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *); 430 static int nvme_bd_free_space(void *, bd_xfer_t *); 431 432 static int nvme_prp_dma_constructor(void *, void *, int); 433 static void nvme_prp_dma_destructor(void *, void *); 434 435 static void nvme_prepare_devid(nvme_t *, uint32_t); 436 437 /* DDI UFM callbacks */ 438 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t, 439 ddi_ufm_image_t *); 440 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t, 441 ddi_ufm_slot_t *); 442 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *); 443 444 static int nvme_open(dev_t *, int, int, cred_t *); 445 static int nvme_close(dev_t, int, int, cred_t *); 446 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 447 448 static void nvme_changed_ns(nvme_t *, int); 449 450 static ddi_ufm_ops_t nvme_ufm_ops = { 451 NULL, 452 nvme_ufm_fill_image, 453 nvme_ufm_fill_slot, 454 nvme_ufm_getcaps 455 }; 456 457 #define NVME_MINOR_INST_SHIFT 9 458 #define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid)) 459 #define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT) 460 #define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1)) 461 #define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2) 462 #define NVME_IS_VENDOR_SPECIFIC_CMD(x) (((x) >= 0xC0) && ((x) <= 0xFF)) 463 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MIN 0xC0 464 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MAX 0xFF 465 #define NVME_IS_VENDOR_SPECIFIC_LOGPAGE(x) \ 466 (((x) >= NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) && \ 467 ((x) <= NVME_VENDOR_SPECIFIC_LOGPAGE_MAX)) 468 469 /* 470 * NVMe versions 1.3 and later actually support log pages up to UINT32_MAX 471 * DWords in size. However, revision 1.3 also modified the layout of the Get Log 472 * Page command significantly relative to version 1.2, including changing 473 * reserved bits, adding new bitfields, and requiring the use of command DWord 474 * 11 to fully specify the size of the log page (the lower and upper 16 bits of 475 * the number of DWords in the page are split between DWord 10 and DWord 11, 476 * respectively). 477 * 478 * All of these impose significantly different layout requirements on the 479 * `nvme_getlogpage_t` type. This could be solved with two different types, or a 480 * complicated/nested union with the two versions as the overlying members. Both 481 * of these are reasonable, if a bit convoluted. However, these is no current 482 * need for such large pages, or a way to test them, as most log pages actually 483 * fit within the current size limit. So for simplicity, we retain the size cap 484 * from version 1.2. 485 * 486 * Note that the number of DWords is zero-based, so we add 1. It is subtracted 487 * to form a zero-based value in `nvme_get_logpage`. 488 */ 489 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE \ 490 (((1 << 12) + 1) * sizeof (uint32_t)) 491 492 static void *nvme_state; 493 static kmem_cache_t *nvme_cmd_cache; 494 495 /* 496 * DMA attributes for queue DMA memory 497 * 498 * Queue DMA memory must be page aligned. The maximum length of a queue is 499 * 65536 entries, and an entry can be 64 bytes long. 
500 */ 501 static ddi_dma_attr_t nvme_queue_dma_attr = { 502 .dma_attr_version = DMA_ATTR_V0, 503 .dma_attr_addr_lo = 0, 504 .dma_attr_addr_hi = 0xffffffffffffffffULL, 505 .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1, 506 .dma_attr_align = 0x1000, 507 .dma_attr_burstsizes = 0x7ff, 508 .dma_attr_minxfer = 0x1000, 509 .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), 510 .dma_attr_seg = 0xffffffffffffffffULL, 511 .dma_attr_sgllen = 1, 512 .dma_attr_granular = 1, 513 .dma_attr_flags = 0, 514 }; 515 516 /* 517 * DMA attributes for transfers using Physical Region Page (PRP) entries 518 * 519 * A PRP entry describes one page of DMA memory using the page size specified 520 * in the controller configuration's memory page size register (CC.MPS). It uses 521 * a 64bit base address aligned to this page size. There is no limitation on 522 * chaining PRPs together for arbitrarily large DMA transfers. 523 */ 524 static ddi_dma_attr_t nvme_prp_dma_attr = { 525 .dma_attr_version = DMA_ATTR_V0, 526 .dma_attr_addr_lo = 0, 527 .dma_attr_addr_hi = 0xffffffffffffffffULL, 528 .dma_attr_count_max = 0xfff, 529 .dma_attr_align = 0x1000, 530 .dma_attr_burstsizes = 0x7ff, 531 .dma_attr_minxfer = 0x1000, 532 .dma_attr_maxxfer = 0x1000, 533 .dma_attr_seg = 0xfff, 534 .dma_attr_sgllen = -1, 535 .dma_attr_granular = 1, 536 .dma_attr_flags = 0, 537 }; 538 539 /* 540 * DMA attributes for transfers using scatter/gather lists 541 * 542 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a 543 * 32bit length field. SGL Segment and SGL Last Segment entries require the 544 * length to be a multiple of 16 bytes. 545 */ 546 static ddi_dma_attr_t nvme_sgl_dma_attr = { 547 .dma_attr_version = DMA_ATTR_V0, 548 .dma_attr_addr_lo = 0, 549 .dma_attr_addr_hi = 0xffffffffffffffffULL, 550 .dma_attr_count_max = 0xffffffffUL, 551 .dma_attr_align = 1, 552 .dma_attr_burstsizes = 0x7ff, 553 .dma_attr_minxfer = 0x10, 554 .dma_attr_maxxfer = 0xfffffffffULL, 555 .dma_attr_seg = 0xffffffffffffffffULL, 556 .dma_attr_sgllen = -1, 557 .dma_attr_granular = 0x10, 558 .dma_attr_flags = 0 559 }; 560 561 static ddi_device_acc_attr_t nvme_reg_acc_attr = { 562 .devacc_attr_version = DDI_DEVICE_ATTR_V0, 563 .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, 564 .devacc_attr_dataorder = DDI_STRICTORDER_ACC 565 }; 566 567 static struct cb_ops nvme_cb_ops = { 568 .cb_open = nvme_open, 569 .cb_close = nvme_close, 570 .cb_strategy = nodev, 571 .cb_print = nodev, 572 .cb_dump = nodev, 573 .cb_read = nodev, 574 .cb_write = nodev, 575 .cb_ioctl = nvme_ioctl, 576 .cb_devmap = nodev, 577 .cb_mmap = nodev, 578 .cb_segmap = nodev, 579 .cb_chpoll = nochpoll, 580 .cb_prop_op = ddi_prop_op, 581 .cb_str = 0, 582 .cb_flag = D_NEW | D_MP, 583 .cb_rev = CB_REV, 584 .cb_aread = nodev, 585 .cb_awrite = nodev 586 }; 587 588 static struct dev_ops nvme_dev_ops = { 589 .devo_rev = DEVO_REV, 590 .devo_refcnt = 0, 591 .devo_getinfo = ddi_no_info, 592 .devo_identify = nulldev, 593 .devo_probe = nulldev, 594 .devo_attach = nvme_attach, 595 .devo_detach = nvme_detach, 596 .devo_reset = nodev, 597 .devo_cb_ops = &nvme_cb_ops, 598 .devo_bus_ops = NULL, 599 .devo_power = NULL, 600 .devo_quiesce = nvme_quiesce, 601 }; 602 603 static struct modldrv nvme_modldrv = { 604 .drv_modops = &mod_driverops, 605 .drv_linkinfo = "NVMe v1.1b", 606 .drv_dev_ops = &nvme_dev_ops 607 }; 608 609 static struct modlinkage nvme_modlinkage = { 610 .ml_rev = MODREV_1, 611 .ml_linkage = { &nvme_modldrv, NULL } 612 }; 613 614 static bd_ops_t nvme_bd_ops = { 615 
.o_version = BD_OPS_CURRENT_VERSION, 616 .o_drive_info = nvme_bd_driveinfo, 617 .o_media_info = nvme_bd_mediainfo, 618 .o_devid_init = nvme_bd_devid, 619 .o_sync_cache = nvme_bd_sync, 620 .o_read = nvme_bd_read, 621 .o_write = nvme_bd_write, 622 .o_free_space = nvme_bd_free_space, 623 }; 624 625 /* 626 * This list will hold commands that have timed out and couldn't be aborted. 627 * As we don't know what the hardware may still do with the DMA memory we can't 628 * free them, so we'll keep them forever on this list where we can easily look 629 * at them with mdb. 630 */ 631 static struct list nvme_lost_cmds; 632 static kmutex_t nvme_lc_mutex; 633 634 int 635 _init(void) 636 { 637 int error; 638 639 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1); 640 if (error != DDI_SUCCESS) 641 return (error); 642 643 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache", 644 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 645 646 mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL); 647 list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t), 648 offsetof(nvme_cmd_t, nc_list)); 649 650 bd_mod_init(&nvme_dev_ops); 651 652 error = mod_install(&nvme_modlinkage); 653 if (error != DDI_SUCCESS) { 654 ddi_soft_state_fini(&nvme_state); 655 mutex_destroy(&nvme_lc_mutex); 656 list_destroy(&nvme_lost_cmds); 657 bd_mod_fini(&nvme_dev_ops); 658 } 659 660 return (error); 661 } 662 663 int 664 _fini(void) 665 { 666 int error; 667 668 if (!list_is_empty(&nvme_lost_cmds)) 669 return (DDI_FAILURE); 670 671 error = mod_remove(&nvme_modlinkage); 672 if (error == DDI_SUCCESS) { 673 ddi_soft_state_fini(&nvme_state); 674 kmem_cache_destroy(nvme_cmd_cache); 675 mutex_destroy(&nvme_lc_mutex); 676 list_destroy(&nvme_lost_cmds); 677 bd_mod_fini(&nvme_dev_ops); 678 } 679 680 return (error); 681 } 682 683 int 684 _info(struct modinfo *modinfop) 685 { 686 return (mod_info(&nvme_modlinkage, modinfop)); 687 } 688 689 static inline void 690 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val) 691 { 692 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 693 694 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 695 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val); 696 } 697 698 static inline void 699 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val) 700 { 701 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 702 703 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 704 ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val); 705 } 706 707 static inline uint64_t 708 nvme_get64(nvme_t *nvme, uintptr_t reg) 709 { 710 uint64_t val; 711 712 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 713 714 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 715 val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); 716 717 return (val); 718 } 719 720 static inline uint32_t 721 nvme_get32(nvme_t *nvme, uintptr_t reg) 722 { 723 uint32_t val; 724 725 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 726 727 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 728 val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); 729 730 return (val); 731 } 732 733 static boolean_t 734 nvme_check_regs_hdl(nvme_t *nvme) 735 { 736 ddi_fm_error_t error; 737 738 ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); 739 740 if (error.fme_status != DDI_FM_OK) 741 return (B_TRUE); 742 743 return (B_FALSE); 744 } 745 746 static boolean_t 747 nvme_check_dma_hdl(nvme_dma_t *dma) 748 { 749 ddi_fm_error_t error; 750 751 if (dma == NULL) 752 return (B_FALSE); 753 754 ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); 755 756 if (error.fme_status != DDI_FM_OK) 757 
return (B_TRUE); 758 759 return (B_FALSE); 760 } 761 762 static void 763 nvme_free_dma_common(nvme_dma_t *dma) 764 { 765 if (dma->nd_dmah != NULL) 766 (void) ddi_dma_unbind_handle(dma->nd_dmah); 767 if (dma->nd_acch != NULL) 768 ddi_dma_mem_free(&dma->nd_acch); 769 if (dma->nd_dmah != NULL) 770 ddi_dma_free_handle(&dma->nd_dmah); 771 } 772 773 static void 774 nvme_free_dma(nvme_dma_t *dma) 775 { 776 nvme_free_dma_common(dma); 777 kmem_free(dma, sizeof (*dma)); 778 } 779 780 /* ARGSUSED */ 781 static void 782 nvme_prp_dma_destructor(void *buf, void *private) 783 { 784 nvme_dma_t *dma = (nvme_dma_t *)buf; 785 786 nvme_free_dma_common(dma); 787 } 788 789 static int 790 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma, 791 size_t len, uint_t flags, ddi_dma_attr_t *dma_attr) 792 { 793 if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, 794 &dma->nd_dmah) != DDI_SUCCESS) { 795 /* 796 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and 797 * the only other possible error is DDI_DMA_BADATTR which 798 * indicates a driver bug which should cause a panic. 799 */ 800 dev_err(nvme->n_dip, CE_PANIC, 801 "!failed to get DMA handle, check DMA attributes"); 802 return (DDI_FAILURE); 803 } 804 805 /* 806 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified 807 * or the flags are conflicting, which isn't the case here. 808 */ 809 (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, 810 DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, 811 &dma->nd_len, &dma->nd_acch); 812 813 if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, 814 dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, 815 &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { 816 dev_err(nvme->n_dip, CE_WARN, 817 "!failed to bind DMA memory"); 818 atomic_inc_32(&nvme->n_dma_bind_err); 819 nvme_free_dma_common(dma); 820 return (DDI_FAILURE); 821 } 822 823 return (DDI_SUCCESS); 824 } 825 826 static int 827 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, 828 ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) 829 { 830 nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); 831 832 if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) != 833 DDI_SUCCESS) { 834 *ret = NULL; 835 kmem_free(dma, sizeof (nvme_dma_t)); 836 return (DDI_FAILURE); 837 } 838 839 bzero(dma->nd_memp, dma->nd_len); 840 841 *ret = dma; 842 return (DDI_SUCCESS); 843 } 844 845 /* ARGSUSED */ 846 static int 847 nvme_prp_dma_constructor(void *buf, void *private, int flags) 848 { 849 nvme_dma_t *dma = (nvme_dma_t *)buf; 850 nvme_t *nvme = (nvme_t *)private; 851 852 dma->nd_dmah = NULL; 853 dma->nd_acch = NULL; 854 855 if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize, 856 DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) { 857 return (-1); 858 } 859 860 ASSERT(dma->nd_ncookie == 1); 861 862 dma->nd_cached = B_TRUE; 863 864 return (0); 865 } 866 867 static int 868 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, 869 uint_t flags, nvme_dma_t **dma) 870 { 871 uint32_t len = nentry * qe_len; 872 ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; 873 874 len = roundup(len, nvme->n_pagesize); 875 876 if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) 877 != DDI_SUCCESS) { 878 dev_err(nvme->n_dip, CE_WARN, 879 "!failed to get DMA memory for queue"); 880 goto fail; 881 } 882 883 if ((*dma)->nd_ncookie != 1) { 884 dev_err(nvme->n_dip, CE_WARN, 885 "!got too many cookies for queue DMA"); 886 goto fail; 887 } 888 889 return (DDI_SUCCESS); 890 891 fail: 892 if 
(*dma) { 893 nvme_free_dma(*dma); 894 *dma = NULL; 895 } 896 897 return (DDI_FAILURE); 898 } 899 900 static void 901 nvme_free_cq(nvme_cq_t *cq) 902 { 903 mutex_destroy(&cq->ncq_mutex); 904 905 if (cq->ncq_cmd_taskq != NULL) 906 taskq_destroy(cq->ncq_cmd_taskq); 907 908 if (cq->ncq_dma != NULL) 909 nvme_free_dma(cq->ncq_dma); 910 911 kmem_free(cq, sizeof (*cq)); 912 } 913 914 static void 915 nvme_free_qpair(nvme_qpair_t *qp) 916 { 917 int i; 918 919 mutex_destroy(&qp->nq_mutex); 920 sema_destroy(&qp->nq_sema); 921 922 if (qp->nq_sqdma != NULL) 923 nvme_free_dma(qp->nq_sqdma); 924 925 if (qp->nq_active_cmds > 0) 926 for (i = 0; i != qp->nq_nentry; i++) 927 if (qp->nq_cmd[i] != NULL) 928 nvme_free_cmd(qp->nq_cmd[i]); 929 930 if (qp->nq_cmd != NULL) 931 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); 932 933 kmem_free(qp, sizeof (nvme_qpair_t)); 934 } 935 936 /* 937 * Destroy the pre-allocated cq array, but only free individual completion 938 * queues from the given starting index. 939 */ 940 static void 941 nvme_destroy_cq_array(nvme_t *nvme, uint_t start) 942 { 943 uint_t i; 944 945 for (i = start; i < nvme->n_cq_count; i++) 946 if (nvme->n_cq[i] != NULL) 947 nvme_free_cq(nvme->n_cq[i]); 948 949 kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count); 950 } 951 952 static int 953 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, 954 uint_t nthr) 955 { 956 nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); 957 char name[64]; /* large enough for the taskq name */ 958 959 mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, 960 DDI_INTR_PRI(nvme->n_intr_pri)); 961 962 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), 963 DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS) 964 goto fail; 965 966 cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp; 967 cq->ncq_nentry = nentry; 968 cq->ncq_id = idx; 969 cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); 970 971 /* 972 * Each completion queue has its own command taskq. 973 */ 974 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", 975 ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); 976 977 cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, 978 TASKQ_PREPOPULATE); 979 980 if (cq->ncq_cmd_taskq == NULL) { 981 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " 982 "taskq for cq %u", idx); 983 goto fail; 984 } 985 986 *cqp = cq; 987 return (DDI_SUCCESS); 988 989 fail: 990 nvme_free_cq(cq); 991 *cqp = NULL; 992 993 return (DDI_FAILURE); 994 } 995 996 /* 997 * Create the n_cq array big enough to hold "ncq" completion queues. 998 * If the array already exists it will be re-sized (but only larger). 999 * The admin queue is included in this array, which boosts the 1000 * max number of entries to UINT16_MAX + 1. 
1001 */ 1002 static int 1003 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) 1004 { 1005 nvme_cq_t **cq; 1006 uint_t i, cq_count; 1007 1008 ASSERT3U(ncq, >, nvme->n_cq_count); 1009 1010 cq = nvme->n_cq; 1011 cq_count = nvme->n_cq_count; 1012 1013 nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP); 1014 nvme->n_cq_count = ncq; 1015 1016 for (i = 0; i < cq_count; i++) 1017 nvme->n_cq[i] = cq[i]; 1018 1019 for (; i < nvme->n_cq_count; i++) 1020 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != 1021 DDI_SUCCESS) 1022 goto fail; 1023 1024 if (cq != NULL) 1025 kmem_free(cq, sizeof (*cq) * cq_count); 1026 1027 return (DDI_SUCCESS); 1028 1029 fail: 1030 nvme_destroy_cq_array(nvme, cq_count); 1031 /* 1032 * Restore the original array 1033 */ 1034 nvme->n_cq_count = cq_count; 1035 nvme->n_cq = cq; 1036 1037 return (DDI_FAILURE); 1038 } 1039 1040 static int 1041 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, 1042 uint_t idx) 1043 { 1044 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); 1045 uint_t cq_idx; 1046 1047 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, 1048 DDI_INTR_PRI(nvme->n_intr_pri)); 1049 1050 /* 1051 * The NVMe spec defines that a full queue has one empty (unused) slot; 1052 * initialize the semaphore accordingly. 1053 */ 1054 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL); 1055 1056 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), 1057 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) 1058 goto fail; 1059 1060 /* 1061 * idx == 0 is adminq, those above 0 are shared io completion queues. 1062 */ 1063 cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1); 1064 qp->nq_cq = nvme->n_cq[cq_idx]; 1065 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; 1066 qp->nq_nentry = nentry; 1067 1068 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); 1069 1070 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); 1071 qp->nq_next_cmd = 0; 1072 1073 *nqp = qp; 1074 return (DDI_SUCCESS); 1075 1076 fail: 1077 nvme_free_qpair(qp); 1078 *nqp = NULL; 1079 1080 return (DDI_FAILURE); 1081 } 1082 1083 static nvme_cmd_t * 1084 nvme_alloc_cmd(nvme_t *nvme, int kmflag) 1085 { 1086 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); 1087 1088 if (cmd == NULL) 1089 return (cmd); 1090 1091 bzero(cmd, sizeof (nvme_cmd_t)); 1092 1093 cmd->nc_nvme = nvme; 1094 1095 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, 1096 DDI_INTR_PRI(nvme->n_intr_pri)); 1097 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); 1098 1099 return (cmd); 1100 } 1101 1102 static void 1103 nvme_free_cmd(nvme_cmd_t *cmd) 1104 { 1105 /* Don't free commands on the lost commands list. 
*/ 1106 if (list_link_active(&cmd->nc_list)) 1107 return; 1108 1109 if (cmd->nc_dma) { 1110 nvme_free_dma(cmd->nc_dma); 1111 cmd->nc_dma = NULL; 1112 } 1113 1114 if (cmd->nc_prp) { 1115 kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp); 1116 cmd->nc_prp = NULL; 1117 } 1118 1119 cv_destroy(&cmd->nc_cv); 1120 mutex_destroy(&cmd->nc_mutex); 1121 1122 kmem_cache_free(nvme_cmd_cache, cmd); 1123 } 1124 1125 static void 1126 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1127 { 1128 sema_p(&qp->nq_sema); 1129 nvme_submit_cmd_common(qp, cmd); 1130 } 1131 1132 static int 1133 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1134 { 1135 if (cmd->nc_nvme->n_dead) { 1136 return (EIO); 1137 } 1138 1139 if (sema_tryp(&qp->nq_sema) == 0) 1140 return (EAGAIN); 1141 1142 nvme_submit_cmd_common(qp, cmd); 1143 return (0); 1144 } 1145 1146 static void 1147 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1148 { 1149 nvme_reg_sqtdbl_t tail = { 0 }; 1150 1151 mutex_enter(&qp->nq_mutex); 1152 cmd->nc_completed = B_FALSE; 1153 1154 /* 1155 * Now that we hold the queue pair lock, we must check whether or not 1156 * the controller has been listed as dead (e.g. was removed due to 1157 * hotplug). This is necessary as otherwise we could race with 1158 * nvme_remove_callback(). Because this has not been enqueued, we don't 1159 * call nvme_unqueue_cmd(), which is why we must manually decrement the 1160 * semaphore. 1161 */ 1162 if (cmd->nc_nvme->n_dead) { 1163 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback, 1164 cmd, TQ_NOSLEEP, &cmd->nc_tqent); 1165 sema_v(&qp->nq_sema); 1166 mutex_exit(&qp->nq_mutex); 1167 return; 1168 } 1169 1170 /* 1171 * Try to insert the cmd into the active cmd array at the nq_next_cmd 1172 * slot. If the slot is already occupied advance to the next slot and 1173 * try again. This can happen for long running commands like async event 1174 * requests. 1175 */ 1176 while (qp->nq_cmd[qp->nq_next_cmd] != NULL) 1177 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1178 qp->nq_cmd[qp->nq_next_cmd] = cmd; 1179 1180 qp->nq_active_cmds++; 1181 1182 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; 1183 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); 1184 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, 1185 sizeof (nvme_sqe_t) * qp->nq_sqtail, 1186 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); 1187 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1188 1189 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; 1190 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); 1191 1192 mutex_exit(&qp->nq_mutex); 1193 } 1194 1195 static nvme_cmd_t * 1196 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid) 1197 { 1198 nvme_cmd_t *cmd; 1199 1200 ASSERT(mutex_owned(&qp->nq_mutex)); 1201 ASSERT3S(cid, <, qp->nq_nentry); 1202 1203 cmd = qp->nq_cmd[cid]; 1204 qp->nq_cmd[cid] = NULL; 1205 ASSERT3U(qp->nq_active_cmds, >, 0); 1206 qp->nq_active_cmds--; 1207 sema_v(&qp->nq_sema); 1208 1209 ASSERT3P(cmd, !=, NULL); 1210 ASSERT3P(cmd->nc_nvme, ==, nvme); 1211 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid); 1212 1213 return (cmd); 1214 } 1215 1216 /* 1217 * Get the command tied to the next completed cqe and bump along completion 1218 * queue head counter. 1219 */ 1220 static nvme_cmd_t * 1221 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq) 1222 { 1223 nvme_qpair_t *qp; 1224 nvme_cqe_t *cqe; 1225 nvme_cmd_t *cmd; 1226 1227 ASSERT(mutex_owned(&cq->ncq_mutex)); 1228 1229 cqe = &cq->ncq_cq[cq->ncq_head]; 1230 1231 /* Check phase tag of CQE. 
Hardware inverts it for new entries. */ 1232 if (cqe->cqe_sf.sf_p == cq->ncq_phase) 1233 return (NULL); 1234 1235 qp = nvme->n_ioq[cqe->cqe_sqid]; 1236 1237 mutex_enter(&qp->nq_mutex); 1238 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid); 1239 mutex_exit(&qp->nq_mutex); 1240 1241 ASSERT(cmd->nc_sqid == cqe->cqe_sqid); 1242 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); 1243 1244 qp->nq_sqhead = cqe->cqe_sqhd; 1245 1246 cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry; 1247 1248 /* Toggle phase on wrap-around. */ 1249 if (cq->ncq_head == 0) 1250 cq->ncq_phase = cq->ncq_phase ? 0 : 1; 1251 1252 return (cmd); 1253 } 1254 1255 /* 1256 * Process all completed commands on the io completion queue. 1257 */ 1258 static uint_t 1259 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) 1260 { 1261 nvme_reg_cqhdbl_t head = { 0 }; 1262 nvme_cmd_t *cmd; 1263 uint_t completed = 0; 1264 1265 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1266 DDI_SUCCESS) 1267 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1268 __func__); 1269 1270 mutex_enter(&cq->ncq_mutex); 1271 1272 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1273 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, 1274 TQ_NOSLEEP, &cmd->nc_tqent); 1275 1276 completed++; 1277 } 1278 1279 if (completed > 0) { 1280 /* 1281 * Update the completion queue head doorbell. 1282 */ 1283 head.b.cqhdbl_cqh = cq->ncq_head; 1284 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1285 } 1286 1287 mutex_exit(&cq->ncq_mutex); 1288 1289 return (completed); 1290 } 1291 1292 static nvme_cmd_t * 1293 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) 1294 { 1295 nvme_cq_t *cq = qp->nq_cq; 1296 nvme_reg_cqhdbl_t head = { 0 }; 1297 nvme_cmd_t *cmd; 1298 1299 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1300 DDI_SUCCESS) 1301 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1302 __func__); 1303 1304 mutex_enter(&cq->ncq_mutex); 1305 1306 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1307 head.b.cqhdbl_cqh = cq->ncq_head; 1308 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1309 } 1310 1311 mutex_exit(&cq->ncq_mutex); 1312 1313 return (cmd); 1314 } 1315 1316 static int 1317 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) 1318 { 1319 nvme_cqe_t *cqe = &cmd->nc_cqe; 1320 1321 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1322 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1323 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1324 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1325 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1326 1327 if (cmd->nc_xfer != NULL) 1328 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1329 1330 if (cmd->nc_nvme->n_strict_version) { 1331 cmd->nc_nvme->n_dead = B_TRUE; 1332 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1333 } 1334 1335 return (EIO); 1336 } 1337 1338 static int 1339 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd) 1340 { 1341 nvme_cqe_t *cqe = &cmd->nc_cqe; 1342 1343 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1344 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1345 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1346 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1347 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1348 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) { 1349 cmd->nc_nvme->n_dead = B_TRUE; 1350 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1351 } 1352 1353 return (EIO); 1354 } 1355 1356 static int 1357 nvme_check_integrity_cmd_status(nvme_cmd_t 
*cmd) 1358 { 1359 nvme_cqe_t *cqe = &cmd->nc_cqe; 1360 1361 switch (cqe->cqe_sf.sf_sc) { 1362 case NVME_CQE_SC_INT_NVM_WRITE: 1363 /* write fail */ 1364 /* TODO: post ereport */ 1365 if (cmd->nc_xfer != NULL) 1366 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1367 return (EIO); 1368 1369 case NVME_CQE_SC_INT_NVM_READ: 1370 /* read fail */ 1371 /* TODO: post ereport */ 1372 if (cmd->nc_xfer != NULL) 1373 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1374 return (EIO); 1375 1376 default: 1377 return (nvme_check_unknown_cmd_status(cmd)); 1378 } 1379 } 1380 1381 static int 1382 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 1383 { 1384 nvme_cqe_t *cqe = &cmd->nc_cqe; 1385 1386 switch (cqe->cqe_sf.sf_sc) { 1387 case NVME_CQE_SC_GEN_SUCCESS: 1388 return (0); 1389 1390 /* 1391 * Errors indicating a bug in the driver should cause a panic. 1392 */ 1393 case NVME_CQE_SC_GEN_INV_OPC: 1394 /* Invalid Command Opcode */ 1395 if (!cmd->nc_dontpanic) 1396 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1397 "programming error: invalid opcode in cmd %p", 1398 (void *)cmd); 1399 return (EINVAL); 1400 1401 case NVME_CQE_SC_GEN_INV_FLD: 1402 /* Invalid Field in Command */ 1403 if (!cmd->nc_dontpanic) 1404 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1405 "programming error: invalid field in cmd %p", 1406 (void *)cmd); 1407 return (EIO); 1408 1409 case NVME_CQE_SC_GEN_ID_CNFL: 1410 /* Command ID Conflict */ 1411 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1412 "cmd ID conflict in cmd %p", (void *)cmd); 1413 return (0); 1414 1415 case NVME_CQE_SC_GEN_INV_NS: 1416 /* Invalid Namespace or Format */ 1417 if (!cmd->nc_dontpanic) 1418 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1419 "programming error: invalid NS/format in cmd %p", 1420 (void *)cmd); 1421 return (EINVAL); 1422 1423 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 1424 /* LBA Out Of Range */ 1425 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1426 "LBA out of range in cmd %p", (void *)cmd); 1427 return (0); 1428 1429 /* 1430 * Non-fatal errors, handle gracefully. 1431 */ 1432 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 1433 /* Data Transfer Error (DMA) */ 1434 /* TODO: post ereport */ 1435 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); 1436 if (cmd->nc_xfer != NULL) 1437 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1438 return (EIO); 1439 1440 case NVME_CQE_SC_GEN_INTERNAL_ERR: 1441 /* 1442 * Internal Error. The spec (v1.0, section 4.5.1.2) says 1443 * detailed error information is returned as async event, 1444 * so we pretty much ignore the error here and handle it 1445 * in the async event handler. 1446 */ 1447 atomic_inc_32(&cmd->nc_nvme->n_internal_err); 1448 if (cmd->nc_xfer != NULL) 1449 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1450 return (EIO); 1451 1452 case NVME_CQE_SC_GEN_ABORT_REQUEST: 1453 /* 1454 * Command Abort Requested. This normally happens only when a 1455 * command times out. 1456 */ 1457 /* TODO: post ereport or change blkdev to handle this? 
*/ 1458 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); 1459 return (ECANCELED); 1460 1461 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 1462 /* Command Aborted due to Power Loss Notification */ 1463 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1464 cmd->nc_nvme->n_dead = B_TRUE; 1465 return (EIO); 1466 1467 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 1468 /* Command Aborted due to SQ Deletion */ 1469 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); 1470 return (EIO); 1471 1472 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 1473 /* Capacity Exceeded */ 1474 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); 1475 if (cmd->nc_xfer != NULL) 1476 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1477 return (EIO); 1478 1479 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 1480 /* Namespace Not Ready */ 1481 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); 1482 if (cmd->nc_xfer != NULL) 1483 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1484 return (EIO); 1485 1486 default: 1487 return (nvme_check_unknown_cmd_status(cmd)); 1488 } 1489 } 1490 1491 static int 1492 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 1493 { 1494 nvme_cqe_t *cqe = &cmd->nc_cqe; 1495 1496 switch (cqe->cqe_sf.sf_sc) { 1497 case NVME_CQE_SC_SPC_INV_CQ: 1498 /* Completion Queue Invalid */ 1499 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 1500 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); 1501 return (EINVAL); 1502 1503 case NVME_CQE_SC_SPC_INV_QID: 1504 /* Invalid Queue Identifier */ 1505 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1506 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 1507 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 1508 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1509 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); 1510 return (EINVAL); 1511 1512 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 1513 /* Max Queue Size Exceeded */ 1514 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1515 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1516 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); 1517 return (EINVAL); 1518 1519 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 1520 /* Abort Command Limit Exceeded */ 1521 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 1522 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1523 "abort command limit exceeded in cmd %p", (void *)cmd); 1524 return (0); 1525 1526 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 1527 /* Async Event Request Limit Exceeded */ 1528 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 1529 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1530 "async event request limit exceeded in cmd %p", 1531 (void *)cmd); 1532 return (0); 1533 1534 case NVME_CQE_SC_SPC_INV_INT_VECT: 1535 /* Invalid Interrupt Vector */ 1536 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1537 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); 1538 return (EINVAL); 1539 1540 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 1541 /* Invalid Log Page */ 1542 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 1543 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); 1544 return (EINVAL); 1545 1546 case NVME_CQE_SC_SPC_INV_FORMAT: 1547 /* Invalid Format */ 1548 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); 1549 atomic_inc_32(&cmd->nc_nvme->n_inv_format); 1550 if (cmd->nc_xfer != NULL) 1551 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1552 return (EINVAL); 1553 1554 case NVME_CQE_SC_SPC_INV_Q_DEL: 1555 /* Invalid Queue Deletion */ 1556 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1557 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); 1558 return (EINVAL); 1559 1560 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 1561 /* Conflicting Attributes */ 1562 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 1563 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1564 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1565 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); 1566 if (cmd->nc_xfer != NULL) 1567 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1568 return (EINVAL); 1569 1570 case NVME_CQE_SC_SPC_NVM_INV_PROT: 1571 /* Invalid Protection Information */ 1572 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 1573 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1574 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1575 atomic_inc_32(&cmd->nc_nvme->n_inv_prot); 1576 if (cmd->nc_xfer != NULL) 1577 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1578 return (EINVAL); 1579 1580 case NVME_CQE_SC_SPC_NVM_READONLY: 1581 /* Write to Read Only Range */ 1582 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1583 atomic_inc_32(&cmd->nc_nvme->n_readonly); 1584 if (cmd->nc_xfer != NULL) 1585 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1586 return (EROFS); 1587 1588 case NVME_CQE_SC_SPC_INV_FW_SLOT: 1589 /* Invalid Firmware Slot */ 1590 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1591 return (EINVAL); 1592 1593 case NVME_CQE_SC_SPC_INV_FW_IMG: 1594 /* Invalid Firmware Image */ 1595 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1596 return (EINVAL); 1597 1598 case NVME_CQE_SC_SPC_FW_RESET: 1599 /* Conventional Reset Required */ 1600 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1601 return (0); 1602 1603 case NVME_CQE_SC_SPC_FW_NSSR: 1604 /* NVMe Subsystem Reset Required */ 1605 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1606 return (0); 1607 1608 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 1609 /* Activation Requires Reset */ 1610 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1611 return (0); 1612 1613 case NVME_CQE_SC_SPC_FW_MTFA: 1614 /* Activation Requires Maximum Time Violation */ 1615 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1616 return (EAGAIN); 1617 1618 case NVME_CQE_SC_SPC_FW_PROHIBITED: 1619 /* Activation Prohibited */ 1620 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1621 return (EINVAL); 1622 1623 case NVME_CQE_SC_SPC_FW_OVERLAP: 1624 /* Overlapping Firmware Ranges */ 1625 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD); 1626 return (EINVAL); 1627 1628 default: 1629 return (nvme_check_unknown_cmd_status(cmd)); 1630 } 1631 } 1632 1633 static inline int 1634 nvme_check_cmd_status(nvme_cmd_t *cmd) 1635 { 1636 nvme_cqe_t *cqe = &cmd->nc_cqe; 1637 1638 /* 1639 * Take a shortcut if the controller is dead, or if 1640 * command status indicates no error. 
1641 */ 1642 if (cmd->nc_nvme->n_dead) 1643 return (EIO); 1644 1645 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1646 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 1647 return (0); 1648 1649 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 1650 return (nvme_check_generic_cmd_status(cmd)); 1651 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 1652 return (nvme_check_specific_cmd_status(cmd)); 1653 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 1654 return (nvme_check_integrity_cmd_status(cmd)); 1655 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 1656 return (nvme_check_vendor_cmd_status(cmd)); 1657 1658 return (nvme_check_unknown_cmd_status(cmd)); 1659 } 1660 1661 static int 1662 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec) 1663 { 1664 nvme_t *nvme = abort_cmd->nc_nvme; 1665 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1666 nvme_abort_cmd_t ac = { 0 }; 1667 int ret = 0; 1668 1669 sema_p(&nvme->n_abort_sema); 1670 1671 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid; 1672 ac.b.ac_sqid = abort_cmd->nc_sqid; 1673 1674 cmd->nc_sqid = 0; 1675 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 1676 cmd->nc_callback = nvme_wakeup_cmd; 1677 cmd->nc_sqe.sqe_cdw10 = ac.r; 1678 1679 /* 1680 * Send the ABORT to the hardware. The ABORT command will return _after_ 1681 * the aborted command has completed (aborted or otherwise), but since 1682 * we still hold the aborted command's mutex its callback hasn't been 1683 * processed yet. 1684 */ 1685 nvme_admin_cmd(cmd, sec); 1686 sema_v(&nvme->n_abort_sema); 1687 1688 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1689 dev_err(nvme->n_dip, CE_WARN, 1690 "!ABORT failed with sct = %x, sc = %x", 1691 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1692 atomic_inc_32(&nvme->n_abort_failed); 1693 } else { 1694 dev_err(nvme->n_dip, CE_WARN, 1695 "!ABORT of command %d/%d %ssuccessful", 1696 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid, 1697 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : ""); 1698 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0) 1699 atomic_inc_32(&nvme->n_cmd_aborted); 1700 } 1701 1702 nvme_free_cmd(cmd); 1703 return (ret); 1704 } 1705 1706 /* 1707 * nvme_wait_cmd -- wait for command completion or timeout 1708 * 1709 * In case of a serious error or a timeout of the abort command the hardware 1710 * will be declared dead and FMA will be notified. 1711 */ 1712 static void 1713 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec) 1714 { 1715 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC); 1716 nvme_t *nvme = cmd->nc_nvme; 1717 nvme_reg_csts_t csts; 1718 nvme_qpair_t *qp; 1719 1720 ASSERT(mutex_owned(&cmd->nc_mutex)); 1721 1722 while (!cmd->nc_completed) { 1723 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) 1724 break; 1725 } 1726 1727 if (cmd->nc_completed) 1728 return; 1729 1730 /* 1731 * The command timed out. 1732 * 1733 * Check controller for fatal status, any errors associated with the 1734 * register or DMA handle, or for a double timeout (abort command timed 1735 * out). If necessary log a warning and call FMA. 
1736 */ 1737 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1738 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 1739 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 1740 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 1741 atomic_inc_32(&nvme->n_cmd_timeout); 1742 1743 if (csts.b.csts_cfs || 1744 nvme_check_regs_hdl(nvme) || 1745 nvme_check_dma_hdl(cmd->nc_dma) || 1746 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 1747 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1748 nvme->n_dead = B_TRUE; 1749 } else if (nvme_abort_cmd(cmd, sec) == 0) { 1750 /* 1751 * If the abort succeeded the command should complete 1752 * immediately with an appropriate status. 1753 */ 1754 while (!cmd->nc_completed) 1755 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 1756 1757 return; 1758 } 1759 1760 qp = nvme->n_ioq[cmd->nc_sqid]; 1761 1762 mutex_enter(&qp->nq_mutex); 1763 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 1764 mutex_exit(&qp->nq_mutex); 1765 1766 /* 1767 * As we don't know what the presumed dead hardware might still do with 1768 * the DMA memory, we'll put the command on the lost commands list if it 1769 * has any DMA memory. 1770 */ 1771 if (cmd->nc_dma != NULL) { 1772 mutex_enter(&nvme_lc_mutex); 1773 list_insert_head(&nvme_lost_cmds, cmd); 1774 mutex_exit(&nvme_lc_mutex); 1775 } 1776 } 1777 1778 static void 1779 nvme_wakeup_cmd(void *arg) 1780 { 1781 nvme_cmd_t *cmd = arg; 1782 1783 mutex_enter(&cmd->nc_mutex); 1784 cmd->nc_completed = B_TRUE; 1785 cv_signal(&cmd->nc_cv); 1786 mutex_exit(&cmd->nc_mutex); 1787 } 1788 1789 static void 1790 nvme_async_event_task(void *arg) 1791 { 1792 nvme_cmd_t *cmd = arg; 1793 nvme_t *nvme = cmd->nc_nvme; 1794 nvme_error_log_entry_t *error_log = NULL; 1795 nvme_health_log_t *health_log = NULL; 1796 nvme_nschange_list_t *nslist = NULL; 1797 size_t logsize = 0; 1798 nvme_async_event_t event; 1799 1800 /* 1801 * Check for errors associated with the async request itself. The only 1802 * command-specific error is "async event limit exceeded", which 1803 * indicates a programming error in the driver and causes a panic in 1804 * nvme_check_cmd_status(). 1805 * 1806 * Other possible errors are various scenarios where the async request 1807 * was aborted, or internal errors in the device. Internal errors are 1808 * reported to FMA, the command aborts need no special handling here. 1809 * 1810 * And finally, at least qemu nvme does not support async events, 1811 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 1812 * will avoid posting async events. 1813 */ 1814 1815 if (nvme_check_cmd_status(cmd) != 0) { 1816 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1817 "!async event request returned failure, sct = %x, " 1818 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 1819 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 1820 cmd->nc_cqe.cqe_sf.sf_m); 1821 1822 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1823 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 1824 cmd->nc_nvme->n_dead = B_TRUE; 1825 ddi_fm_service_impact(cmd->nc_nvme->n_dip, 1826 DDI_SERVICE_LOST); 1827 } 1828 1829 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1830 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 1831 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 1832 nvme->n_async_event_supported = B_FALSE; 1833 } 1834 1835 nvme_free_cmd(cmd); 1836 return; 1837 } 1838 1839 event.r = cmd->nc_cqe.cqe_dw0; 1840 1841 /* Clear CQE and re-submit the async request. 
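	 * Re-arming the request here keeps an asynchronous event command
	 * outstanding at all times, so the controller always has a slot
	 * available for reporting the next event.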
*/ 1842 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 1843 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1844 1845 switch (event.b.ae_type) { 1846 case NVME_ASYNC_TYPE_ERROR: 1847 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 1848 (void) nvme_get_logpage(nvme, B_FALSE, 1849 (void **)&error_log, &logsize, event.b.ae_logpage); 1850 } else { 1851 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1852 "async event reply: %d", event.b.ae_logpage); 1853 atomic_inc_32(&nvme->n_wrong_logpage); 1854 } 1855 1856 switch (event.b.ae_info) { 1857 case NVME_ASYNC_ERROR_INV_SQ: 1858 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1859 "invalid submission queue"); 1860 return; 1861 1862 case NVME_ASYNC_ERROR_INV_DBL: 1863 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1864 "invalid doorbell write value"); 1865 return; 1866 1867 case NVME_ASYNC_ERROR_DIAGFAIL: 1868 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 1869 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1870 nvme->n_dead = B_TRUE; 1871 atomic_inc_32(&nvme->n_diagfail_event); 1872 break; 1873 1874 case NVME_ASYNC_ERROR_PERSISTENT: 1875 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 1876 "device error"); 1877 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1878 nvme->n_dead = B_TRUE; 1879 atomic_inc_32(&nvme->n_persistent_event); 1880 break; 1881 1882 case NVME_ASYNC_ERROR_TRANSIENT: 1883 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 1884 "device error"); 1885 /* TODO: send ereport */ 1886 atomic_inc_32(&nvme->n_transient_event); 1887 break; 1888 1889 case NVME_ASYNC_ERROR_FW_LOAD: 1890 dev_err(nvme->n_dip, CE_WARN, 1891 "!firmware image load error"); 1892 atomic_inc_32(&nvme->n_fw_load_event); 1893 break; 1894 } 1895 break; 1896 1897 case NVME_ASYNC_TYPE_HEALTH: 1898 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 1899 (void) nvme_get_logpage(nvme, B_FALSE, 1900 (void **)&health_log, &logsize, event.b.ae_logpage, 1901 -1); 1902 } else { 1903 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1904 "async event reply: %d", event.b.ae_logpage); 1905 atomic_inc_32(&nvme->n_wrong_logpage); 1906 } 1907 1908 switch (event.b.ae_info) { 1909 case NVME_ASYNC_HEALTH_RELIABILITY: 1910 dev_err(nvme->n_dip, CE_WARN, 1911 "!device reliability compromised"); 1912 /* TODO: send ereport */ 1913 atomic_inc_32(&nvme->n_reliability_event); 1914 break; 1915 1916 case NVME_ASYNC_HEALTH_TEMPERATURE: 1917 dev_err(nvme->n_dip, CE_WARN, 1918 "!temperature above threshold"); 1919 /* TODO: send ereport */ 1920 atomic_inc_32(&nvme->n_temperature_event); 1921 break; 1922 1923 case NVME_ASYNC_HEALTH_SPARE: 1924 dev_err(nvme->n_dip, CE_WARN, 1925 "!spare space below threshold"); 1926 /* TODO: send ereport */ 1927 atomic_inc_32(&nvme->n_spare_event); 1928 break; 1929 } 1930 break; 1931 1932 case NVME_ASYNC_TYPE_NOTICE: 1933 switch (event.b.ae_info) { 1934 case NVME_ASYNC_NOTICE_NS_CHANGE: 1935 dev_err(nvme->n_dip, CE_NOTE, 1936 "namespace attribute change event, " 1937 "logpage = %x", event.b.ae_logpage); 1938 atomic_inc_32(&nvme->n_notice_event); 1939 1940 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) 1941 break; 1942 1943 if (nvme_get_logpage(nvme, B_FALSE, (void **)&nslist, 1944 &logsize, event.b.ae_logpage, -1) != 0) { 1945 break; 1946 } 1947 1948 if (nslist->nscl_ns[0] == UINT32_MAX) { 1949 dev_err(nvme->n_dip, CE_CONT, 1950 "more than %u namespaces have changed.\n", 1951 NVME_NSCHANGE_LIST_SIZE); 1952 break; 1953 } 1954 1955 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) { 1956 uint32_t nsid = nslist->nscl_ns[i]; 1957 1958 
if (nsid == 0) /* end of list */ 1959 break; 1960 nvme_changed_ns(nvme, nsid); 1961 } 1962 1963 break; 1964 1965 case NVME_ASYNC_NOTICE_FW_ACTIVATE: 1966 dev_err(nvme->n_dip, CE_NOTE, 1967 "firmware activation starting, " 1968 "logpage = %x", event.b.ae_logpage); 1969 atomic_inc_32(&nvme->n_notice_event); 1970 break; 1971 1972 case NVME_ASYNC_NOTICE_TELEMETRY: 1973 dev_err(nvme->n_dip, CE_NOTE, 1974 "telemetry log changed, " 1975 "logpage = %x", event.b.ae_logpage); 1976 atomic_inc_32(&nvme->n_notice_event); 1977 break; 1978 1979 case NVME_ASYNC_NOTICE_NS_ASYMM: 1980 dev_err(nvme->n_dip, CE_NOTE, 1981 "asymmetric namespace access change, " 1982 "logpage = %x", event.b.ae_logpage); 1983 atomic_inc_32(&nvme->n_notice_event); 1984 break; 1985 1986 case NVME_ASYNC_NOTICE_LATENCYLOG: 1987 dev_err(nvme->n_dip, CE_NOTE, 1988 "predictable latency event aggregate log change, " 1989 "logpage = %x", event.b.ae_logpage); 1990 atomic_inc_32(&nvme->n_notice_event); 1991 break; 1992 1993 case NVME_ASYNC_NOTICE_LBASTATUS: 1994 dev_err(nvme->n_dip, CE_NOTE, 1995 "LBA status information alert, " 1996 "logpage = %x", event.b.ae_logpage); 1997 atomic_inc_32(&nvme->n_notice_event); 1998 break; 1999 2000 case NVME_ASYNC_NOTICE_ENDURANCELOG: 2001 dev_err(nvme->n_dip, CE_NOTE, 2002 "endurance group event aggregate log page change, " 2003 "logpage = %x", event.b.ae_logpage); 2004 atomic_inc_32(&nvme->n_notice_event); 2005 break; 2006 2007 default: 2008 dev_err(nvme->n_dip, CE_WARN, 2009 "!unknown notice async event received, " 2010 "info = %x, logpage = %x", event.b.ae_info, 2011 event.b.ae_logpage); 2012 atomic_inc_32(&nvme->n_unknown_event); 2013 break; 2014 } 2015 break; 2016 2017 case NVME_ASYNC_TYPE_VENDOR: 2018 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 2019 "received, info = %x, logpage = %x", event.b.ae_info, 2020 event.b.ae_logpage); 2021 atomic_inc_32(&nvme->n_vendor_event); 2022 break; 2023 2024 default: 2025 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 2026 "type = %x, info = %x, logpage = %x", event.b.ae_type, 2027 event.b.ae_info, event.b.ae_logpage); 2028 atomic_inc_32(&nvme->n_unknown_event); 2029 break; 2030 } 2031 2032 if (error_log != NULL) 2033 kmem_free(error_log, logsize); 2034 2035 if (health_log != NULL) 2036 kmem_free(health_log, logsize); 2037 2038 if (nslist != NULL) 2039 kmem_free(nslist, logsize); 2040 } 2041 2042 static void 2043 nvme_admin_cmd(nvme_cmd_t *cmd, int sec) 2044 { 2045 mutex_enter(&cmd->nc_mutex); 2046 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd); 2047 nvme_wait_cmd(cmd, sec); 2048 mutex_exit(&cmd->nc_mutex); 2049 } 2050 2051 static void 2052 nvme_async_event(nvme_t *nvme) 2053 { 2054 nvme_cmd_t *cmd; 2055 2056 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2057 cmd->nc_sqid = 0; 2058 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 2059 cmd->nc_callback = nvme_async_event_task; 2060 cmd->nc_dontpanic = B_TRUE; 2061 2062 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 2063 } 2064 2065 static int 2066 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf, 2067 boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses) 2068 { 2069 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2070 nvme_format_nvm_t format_nvm = { 0 }; 2071 int ret; 2072 2073 format_nvm.b.fm_lbaf = lbaf & 0xf; 2074 format_nvm.b.fm_ms = ms ? 1 : 0; 2075 format_nvm.b.fm_pi = pi & 0x7; 2076 format_nvm.b.fm_pil = pil ? 
1 : 0; 2077 format_nvm.b.fm_ses = ses & 0x7; 2078 2079 cmd->nc_sqid = 0; 2080 cmd->nc_callback = nvme_wakeup_cmd; 2081 cmd->nc_sqe.sqe_nsid = nsid; 2082 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 2083 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 2084 2085 /* 2086 * Some devices like Samsung SM951 don't allow formatting of all 2087 * namespaces in one command. Handle that gracefully. 2088 */ 2089 if (nsid == (uint32_t)-1) 2090 cmd->nc_dontpanic = B_TRUE; 2091 /* 2092 * If this format request was initiated by the user, then don't allow a 2093 * programmer error to panic the system. 2094 */ 2095 if (user) 2096 cmd->nc_dontpanic = B_TRUE; 2097 2098 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 2099 2100 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2101 dev_err(nvme->n_dip, CE_WARN, 2102 "!FORMAT failed with sct = %x, sc = %x", 2103 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2104 } 2105 2106 nvme_free_cmd(cmd); 2107 return (ret); 2108 } 2109 2110 /* 2111 * The `bufsize` parameter is usually an output parameter, set by this routine 2112 * when filling in the supported types of logpages from the device. However, for 2113 * vendor-specific pages, it is an input parameter, and must be set 2114 * appropriately by callers. 2115 */ 2116 static int 2117 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 2118 uint8_t logpage, ...) 2119 { 2120 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2121 nvme_getlogpage_t getlogpage = { 0 }; 2122 va_list ap; 2123 int ret; 2124 2125 va_start(ap, logpage); 2126 2127 cmd->nc_sqid = 0; 2128 cmd->nc_callback = nvme_wakeup_cmd; 2129 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 2130 2131 if (user) 2132 cmd->nc_dontpanic = B_TRUE; 2133 2134 getlogpage.b.lp_lid = logpage; 2135 2136 switch (logpage) { 2137 case NVME_LOGPAGE_ERROR: 2138 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2139 *bufsize = MIN(NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE, 2140 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); 2141 break; 2142 2143 case NVME_LOGPAGE_HEALTH: 2144 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2145 *bufsize = sizeof (nvme_health_log_t); 2146 break; 2147 2148 case NVME_LOGPAGE_FWSLOT: 2149 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2150 *bufsize = sizeof (nvme_fwslot_log_t); 2151 break; 2152 2153 case NVME_LOGPAGE_NSCHANGE: 2154 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2155 *bufsize = sizeof (nvme_nschange_list_t); 2156 break; 2157 2158 default: 2159 /* 2160 * This intentionally only checks against the minimum valid 2161 * log page ID. `logpage` is a uint8_t, and `0xFF` is a valid 2162 * page ID, so this one-sided check avoids a compiler error 2163 * about a check that's always true. 
2164 */ 2165 if (logpage < NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) { 2166 dev_err(nvme->n_dip, CE_WARN, 2167 "!unknown log page requested: %d", logpage); 2168 atomic_inc_32(&nvme->n_unknown_logpage); 2169 ret = EINVAL; 2170 goto fail; 2171 } 2172 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2173 } 2174 2175 va_end(ap); 2176 2177 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; 2178 2179 cmd->nc_sqe.sqe_cdw10 = getlogpage.r; 2180 2181 if (nvme_zalloc_dma(nvme, *bufsize, 2182 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2183 dev_err(nvme->n_dip, CE_WARN, 2184 "!nvme_zalloc_dma failed for GET LOG PAGE"); 2185 ret = ENOMEM; 2186 goto fail; 2187 } 2188 2189 if ((ret = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 2190 goto fail; 2191 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2192 2193 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2194 dev_err(nvme->n_dip, CE_WARN, 2195 "!GET LOG PAGE failed with sct = %x, sc = %x", 2196 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2197 goto fail; 2198 } 2199 2200 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2201 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2202 2203 fail: 2204 nvme_free_cmd(cmd); 2205 2206 return (ret); 2207 } 2208 2209 static int 2210 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf) 2211 { 2212 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2213 int ret; 2214 2215 if (buf == NULL) 2216 return (EINVAL); 2217 2218 cmd->nc_sqid = 0; 2219 cmd->nc_callback = nvme_wakeup_cmd; 2220 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 2221 cmd->nc_sqe.sqe_nsid = nsid; 2222 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; 2223 2224 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 2225 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2226 dev_err(nvme->n_dip, CE_WARN, 2227 "!nvme_zalloc_dma failed for IDENTIFY"); 2228 ret = ENOMEM; 2229 goto fail; 2230 } 2231 2232 if (cmd->nc_dma->nd_ncookie > 2) { 2233 dev_err(nvme->n_dip, CE_WARN, 2234 "!too many DMA cookies for IDENTIFY"); 2235 atomic_inc_32(&nvme->n_too_many_cookies); 2236 ret = ENOMEM; 2237 goto fail; 2238 } 2239 2240 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2241 if (cmd->nc_dma->nd_ncookie > 1) { 2242 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2243 &cmd->nc_dma->nd_cookie); 2244 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2245 cmd->nc_dma->nd_cookie.dmac_laddress; 2246 } 2247 2248 if (user) 2249 cmd->nc_dontpanic = B_TRUE; 2250 2251 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2252 2253 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2254 dev_err(nvme->n_dip, CE_WARN, 2255 "!IDENTIFY failed with sct = %x, sc = %x", 2256 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2257 goto fail; 2258 } 2259 2260 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 2261 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 2262 2263 fail: 2264 nvme_free_cmd(cmd); 2265 2266 return (ret); 2267 } 2268 2269 static int 2270 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2271 uint32_t val, uint32_t *res) 2272 { 2273 _NOTE(ARGUNUSED(nsid)); 2274 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2275 int ret = EINVAL; 2276 2277 ASSERT(res != NULL); 2278 2279 cmd->nc_sqid = 0; 2280 cmd->nc_callback = nvme_wakeup_cmd; 2281 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 2282 cmd->nc_sqe.sqe_cdw10 = feature; 2283 cmd->nc_sqe.sqe_cdw11 = val; 2284 2285 if (user) 2286 cmd->nc_dontpanic = B_TRUE; 2287 2288 switch (feature) { 2289 case NVME_FEAT_WRITE_CACHE: 2290 if 
(!nvme->n_write_cache_present) 2291 goto fail; 2292 break; 2293 2294 case NVME_FEAT_NQUEUES: 2295 break; 2296 2297 default: 2298 goto fail; 2299 } 2300 2301 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2302 2303 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2304 dev_err(nvme->n_dip, CE_WARN, 2305 "!SET FEATURES %d failed with sct = %x, sc = %x", 2306 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2307 cmd->nc_cqe.cqe_sf.sf_sc); 2308 goto fail; 2309 } 2310 2311 *res = cmd->nc_cqe.cqe_dw0; 2312 2313 fail: 2314 nvme_free_cmd(cmd); 2315 return (ret); 2316 } 2317 2318 static int 2319 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2320 uint32_t *res, void **buf, size_t *bufsize) 2321 { 2322 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2323 int ret = EINVAL; 2324 2325 ASSERT(res != NULL); 2326 2327 if (bufsize != NULL) 2328 *bufsize = 0; 2329 2330 cmd->nc_sqid = 0; 2331 cmd->nc_callback = nvme_wakeup_cmd; 2332 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; 2333 cmd->nc_sqe.sqe_cdw10 = feature; 2334 cmd->nc_sqe.sqe_cdw11 = *res; 2335 2336 /* 2337 * For some of the optional features there doesn't seem to be a method 2338 * of detecting whether it is supported other than using it. This will 2339 * cause "Invalid Field in Command" error, which is normally considered 2340 * a programming error. Set the nc_dontpanic flag to override the panic 2341 * in nvme_check_generic_cmd_status(). 2342 */ 2343 switch (feature) { 2344 case NVME_FEAT_ARBITRATION: 2345 case NVME_FEAT_POWER_MGMT: 2346 case NVME_FEAT_TEMPERATURE: 2347 case NVME_FEAT_ERROR: 2348 case NVME_FEAT_NQUEUES: 2349 case NVME_FEAT_INTR_COAL: 2350 case NVME_FEAT_INTR_VECT: 2351 case NVME_FEAT_WRITE_ATOM: 2352 case NVME_FEAT_ASYNC_EVENT: 2353 break; 2354 2355 case NVME_FEAT_WRITE_CACHE: 2356 if (!nvme->n_write_cache_present) 2357 goto fail; 2358 break; 2359 2360 case NVME_FEAT_LBA_RANGE: 2361 if (!nvme->n_lba_range_supported) 2362 goto fail; 2363 2364 cmd->nc_dontpanic = B_TRUE; 2365 cmd->nc_sqe.sqe_nsid = nsid; 2366 ASSERT(bufsize != NULL); 2367 *bufsize = NVME_LBA_RANGE_BUFSIZE; 2368 break; 2369 2370 case NVME_FEAT_AUTO_PST: 2371 if (!nvme->n_auto_pst_supported) 2372 goto fail; 2373 2374 ASSERT(bufsize != NULL); 2375 *bufsize = NVME_AUTO_PST_BUFSIZE; 2376 break; 2377 2378 case NVME_FEAT_PROGRESS: 2379 if (!nvme->n_progress_supported) 2380 goto fail; 2381 2382 cmd->nc_dontpanic = B_TRUE; 2383 break; 2384 2385 default: 2386 goto fail; 2387 } 2388 2389 if (user) 2390 cmd->nc_dontpanic = B_TRUE; 2391 2392 if (bufsize != NULL && *bufsize != 0) { 2393 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 2394 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2395 dev_err(nvme->n_dip, CE_WARN, 2396 "!nvme_zalloc_dma failed for GET FEATURES"); 2397 ret = ENOMEM; 2398 goto fail; 2399 } 2400 2401 if (cmd->nc_dma->nd_ncookie > 2) { 2402 dev_err(nvme->n_dip, CE_WARN, 2403 "!too many DMA cookies for GET FEATURES"); 2404 atomic_inc_32(&nvme->n_too_many_cookies); 2405 ret = ENOMEM; 2406 goto fail; 2407 } 2408 2409 cmd->nc_sqe.sqe_dptr.d_prp[0] = 2410 cmd->nc_dma->nd_cookie.dmac_laddress; 2411 if (cmd->nc_dma->nd_ncookie > 1) { 2412 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2413 &cmd->nc_dma->nd_cookie); 2414 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2415 cmd->nc_dma->nd_cookie.dmac_laddress; 2416 } 2417 } 2418 2419 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2420 2421 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2422 boolean_t known = B_TRUE; 2423 2424 /* Check if this is unsupported optional feature */ 2425 if (cmd->nc_cqe.cqe_sf.sf_sct == 
NVME_CQE_SCT_GENERIC &&
		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
			switch (feature) {
			case NVME_FEAT_LBA_RANGE:
				nvme->n_lba_range_supported = B_FALSE;
				break;
			case NVME_FEAT_PROGRESS:
				nvme->n_progress_supported = B_FALSE;
				break;
			default:
				known = B_FALSE;
				break;
			}
		} else {
			known = B_FALSE;
		}

		/* Report the error otherwise */
		if (!known) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!GET FEATURES %d failed with sct = %x, sc = %x",
			    feature, cmd->nc_cqe.cqe_sf.sf_sct,
			    cmd->nc_cqe.cqe_sf.sf_sc);
		}

		goto fail;
	}

	if (bufsize != NULL && *bufsize != 0) {
		ASSERT(buf != NULL);
		*buf = kmem_alloc(*bufsize, KM_SLEEP);
		bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
	}

	*res = cmd->nc_cqe.cqe_dw0;

fail:
	nvme_free_cmd(cmd);
	return (ret);
}

static int
nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
{
	nvme_write_cache_t nwc = { 0 };

	if (enable)
		nwc.b.wc_wce = 1;

	return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
	    nwc.r, &nwc.r));
}

static int
nvme_set_nqueues(nvme_t *nvme)
{
	nvme_nqueues_t nq = { 0 };
	int ret;

	/*
	 * The default is to allocate one completion queue per vector.
	 */
	if (nvme->n_completion_queues == -1)
		nvme->n_completion_queues = nvme->n_intr_cnt;

	/*
	 * There is no point in having more completion queues than
	 * interrupt vectors.
	 */
	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
	    nvme->n_intr_cnt);

	/*
	 * The default is to use one submission queue per completion queue.
	 */
	if (nvme->n_submission_queues == -1)
		nvme->n_submission_queues = nvme->n_completion_queues;

	/*
	 * There is no point in having more completion queues than
	 * submission queues.
	 */
	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
	    nvme->n_submission_queues);

	ASSERT(nvme->n_submission_queues > 0);
	ASSERT(nvme->n_completion_queues > 0);

	nq.b.nq_nsq = nvme->n_submission_queues - 1;
	nq.b.nq_ncq = nvme->n_completion_queues - 1;

	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
	    &nq.r);

	if (ret == 0) {
		/*
		 * Never use more than the requested number of queues.
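		 *
		 * The feature uses zero-based counts: asking for, say, 8
		 * submission and 8 completion queues writes nq_nsq = nq_ncq
		 * = 7, and the controller replies in dw0 with the number of
		 * queues it actually allocated (also zero-based). The MIN()
		 * below clamps our bookkeeping to that grant, so a controller
		 * that only allocated 4 queue pairs is never asked to create
		 * more than 4.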
2522 */ 2523 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 2524 nq.b.nq_nsq + 1); 2525 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2526 nq.b.nq_ncq + 1); 2527 } 2528 2529 return (ret); 2530 } 2531 2532 static int 2533 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 2534 { 2535 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2536 nvme_create_queue_dw10_t dw10 = { 0 }; 2537 nvme_create_cq_dw11_t c_dw11 = { 0 }; 2538 int ret; 2539 2540 dw10.b.q_qid = cq->ncq_id; 2541 dw10.b.q_qsize = cq->ncq_nentry - 1; 2542 2543 c_dw11.b.cq_pc = 1; 2544 c_dw11.b.cq_ien = 1; 2545 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 2546 2547 cmd->nc_sqid = 0; 2548 cmd->nc_callback = nvme_wakeup_cmd; 2549 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 2550 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2551 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 2552 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 2553 2554 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2555 2556 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2557 dev_err(nvme->n_dip, CE_WARN, 2558 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2559 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2560 } 2561 2562 nvme_free_cmd(cmd); 2563 2564 return (ret); 2565 } 2566 2567 static int 2568 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 2569 { 2570 nvme_cq_t *cq = qp->nq_cq; 2571 nvme_cmd_t *cmd; 2572 nvme_create_queue_dw10_t dw10 = { 0 }; 2573 nvme_create_sq_dw11_t s_dw11 = { 0 }; 2574 int ret; 2575 2576 /* 2577 * It is possible to have more qpairs than completion queues, 2578 * and when the idx > ncq_id, that completion queue is shared 2579 * and has already been created. 2580 */ 2581 if (idx <= cq->ncq_id && 2582 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 2583 return (DDI_FAILURE); 2584 2585 dw10.b.q_qid = idx; 2586 dw10.b.q_qsize = qp->nq_nentry - 1; 2587 2588 s_dw11.b.sq_pc = 1; 2589 s_dw11.b.sq_cqid = cq->ncq_id; 2590 2591 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2592 cmd->nc_sqid = 0; 2593 cmd->nc_callback = nvme_wakeup_cmd; 2594 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2595 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2596 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2597 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2598 2599 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2600 2601 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2602 dev_err(nvme->n_dip, CE_WARN, 2603 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2604 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2605 } 2606 2607 nvme_free_cmd(cmd); 2608 2609 return (ret); 2610 } 2611 2612 static boolean_t 2613 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2614 { 2615 nvme_reg_csts_t csts; 2616 int i; 2617 2618 nvme_put32(nvme, NVME_REG_CC, 0); 2619 2620 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2621 if (csts.b.csts_rdy == 1) { 2622 nvme_put32(nvme, NVME_REG_CC, 0); 2623 for (i = 0; i != nvme->n_timeout * 10; i++) { 2624 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2625 if (csts.b.csts_rdy == 0) 2626 break; 2627 2628 if (quiesce) 2629 drv_usecwait(50000); 2630 else 2631 delay(drv_usectohz(50000)); 2632 } 2633 } 2634 2635 nvme_put32(nvme, NVME_REG_AQA, 0); 2636 nvme_put32(nvme, NVME_REG_ASQ, 0); 2637 nvme_put32(nvme, NVME_REG_ACQ, 0); 2638 2639 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2640 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 2641 } 2642 2643 static void 2644 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2645 { 2646 nvme_reg_cc_t cc; 2647 nvme_reg_csts_t csts; 2648 int i; 2649 2650 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2651 2652 cc.r = nvme_get32(nvme, NVME_REG_CC); 2653 cc.b.cc_shn = mode & 0x3; 2654 nvme_put32(nvme, NVME_REG_CC, cc.r); 2655 2656 for (i = 0; i != 10; i++) { 2657 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2658 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2659 break; 2660 2661 if (quiesce) 2662 drv_usecwait(100000); 2663 else 2664 delay(drv_usectohz(100000)); 2665 } 2666 } 2667 2668 2669 static void 2670 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2671 { 2672 /* 2673 * Section 7.7 of the spec describes how to get a unique ID for 2674 * the controller: the vendor ID, the model name and the serial 2675 * number shall be unique when combined. 2676 * 2677 * If a namespace has no EUI64 we use the above and add the hex 2678 * namespace ID to get a unique ID for the namespace. 2679 */ 2680 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2681 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2682 2683 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2684 bcopy(nvme->n_idctl->id_serial, serial, 2685 sizeof (nvme->n_idctl->id_serial)); 2686 2687 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2688 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2689 2690 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2691 nvme->n_idctl->id_vid, model, serial, nsid); 2692 } 2693 2694 static void 2695 nvme_changed_ns(nvme_t *nvme, int nsid) 2696 { 2697 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2698 nvme_identify_nsid_t *idns, *oidns; 2699 2700 dev_err(nvme->n_dip, CE_NOTE, "!namespace %u (%s) has changed.", 2701 nsid, ns->ns_name); 2702 2703 if (ns->ns_ignore) 2704 return; 2705 2706 /* 2707 * The namespace has changed in some way. At present, we only update 2708 * the device capacity and trigger blkdev to check the device state. 2709 */ 2710 2711 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2712 dev_err(nvme->n_dip, CE_WARN, 2713 "!failed to identify namespace %d", nsid); 2714 return; 2715 } 2716 2717 oidns = ns->ns_idns; 2718 ns->ns_idns = idns; 2719 kmem_free(oidns, sizeof (nvme_identify_nsid_t)); 2720 2721 ns->ns_block_count = idns->id_nsize; 2722 ns->ns_block_size = 2723 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2724 ns->ns_best_block_size = ns->ns_block_size; 2725 2726 bd_state_change(ns->ns_bd_hdl); 2727 } 2728 2729 static int 2730 nvme_init_ns(nvme_t *nvme, int nsid) 2731 { 2732 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2733 nvme_identify_nsid_t *idns; 2734 boolean_t was_ignored; 2735 int last_rp; 2736 2737 ns->ns_nvme = nvme; 2738 2739 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2740 dev_err(nvme->n_dip, CE_WARN, 2741 "!failed to identify namespace %d", nsid); 2742 return (DDI_FAILURE); 2743 } 2744 2745 ns->ns_idns = idns; 2746 ns->ns_id = nsid; 2747 ns->ns_block_count = idns->id_nsize; 2748 ns->ns_block_size = 2749 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2750 ns->ns_best_block_size = ns->ns_block_size; 2751 2752 /* 2753 * Get the EUI64 if present. Use it for devid and device node names. 
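	 *
	 * A namespace with an EUI64 gets a 16-digit hex node name derived
	 * from it. A namespace without one falls back to its decimal
	 * namespace ID and to a devid built by nvme_prepare_devid() from
	 * the vendor ID, model, serial number and namespace ID (of the
	 * form "VID-model-serial-nsid").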
2754 */ 2755 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2756 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2757 2758 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2759 if (*(uint64_t *)ns->ns_eui64 != 0) { 2760 uint8_t *eui64 = ns->ns_eui64; 2761 2762 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2763 "%02x%02x%02x%02x%02x%02x%02x%02x", 2764 eui64[0], eui64[1], eui64[2], eui64[3], 2765 eui64[4], eui64[5], eui64[6], eui64[7]); 2766 } else { 2767 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2768 ns->ns_id); 2769 2770 nvme_prepare_devid(nvme, ns->ns_id); 2771 } 2772 2773 /* 2774 * Find the LBA format with no metadata and the best relative 2775 * performance. A value of 3 means "degraded", 0 is best. 2776 */ 2777 last_rp = 3; 2778 for (int j = 0; j <= idns->id_nlbaf; j++) { 2779 if (idns->id_lbaf[j].lbaf_lbads == 0) 2780 break; 2781 if (idns->id_lbaf[j].lbaf_ms != 0) 2782 continue; 2783 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2784 continue; 2785 last_rp = idns->id_lbaf[j].lbaf_rp; 2786 ns->ns_best_block_size = 2787 1 << idns->id_lbaf[j].lbaf_lbads; 2788 } 2789 2790 if (ns->ns_best_block_size < nvme->n_min_block_size) 2791 ns->ns_best_block_size = nvme->n_min_block_size; 2792 2793 was_ignored = ns->ns_ignore; 2794 2795 /* 2796 * We currently don't support namespaces that use either: 2797 * - protection information 2798 * - illegal block size (< 512) 2799 */ 2800 if (idns->id_dps.dp_pinfo) { 2801 dev_err(nvme->n_dip, CE_WARN, 2802 "!ignoring namespace %d, unsupported feature: " 2803 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 2804 ns->ns_ignore = B_TRUE; 2805 } else if (ns->ns_block_size < 512) { 2806 dev_err(nvme->n_dip, CE_WARN, 2807 "!ignoring namespace %d, unsupported block size %"PRIu64, 2808 nsid, (uint64_t)ns->ns_block_size); 2809 ns->ns_ignore = B_TRUE; 2810 } else { 2811 ns->ns_ignore = B_FALSE; 2812 } 2813 2814 /* 2815 * Keep a count of namespaces which are attachable. 2816 * See comments in nvme_bd_driveinfo() to understand its effect. 2817 */ 2818 if (was_ignored) { 2819 /* 2820 * Previously ignored, but now not. Count it. 2821 */ 2822 if (!ns->ns_ignore) 2823 nvme->n_namespaces_attachable++; 2824 } else { 2825 /* 2826 * Wasn't ignored previously, but now needs to be. 2827 * Discount it. 
2828 */ 2829 if (ns->ns_ignore) 2830 nvme->n_namespaces_attachable--; 2831 } 2832 2833 return (DDI_SUCCESS); 2834 } 2835 2836 static int 2837 nvme_init(nvme_t *nvme) 2838 { 2839 nvme_reg_cc_t cc = { 0 }; 2840 nvme_reg_aqa_t aqa = { 0 }; 2841 nvme_reg_asq_t asq = { 0 }; 2842 nvme_reg_acq_t acq = { 0 }; 2843 nvme_reg_cap_t cap; 2844 nvme_reg_vs_t vs; 2845 nvme_reg_csts_t csts; 2846 int i = 0; 2847 uint16_t nqueues; 2848 uint_t tq_threads; 2849 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2850 char *vendor, *product; 2851 2852 /* Check controller version */ 2853 vs.r = nvme_get32(nvme, NVME_REG_VS); 2854 nvme->n_version.v_major = vs.b.vs_mjr; 2855 nvme->n_version.v_minor = vs.b.vs_mnr; 2856 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 2857 nvme->n_version.v_major, nvme->n_version.v_minor); 2858 2859 if (nvme->n_version.v_major > nvme_version_major) { 2860 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 2861 nvme_version_major); 2862 if (nvme->n_strict_version) 2863 goto fail; 2864 } 2865 2866 /* retrieve controller configuration */ 2867 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2868 2869 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2870 dev_err(nvme->n_dip, CE_WARN, 2871 "!NVM command set not supported by hardware"); 2872 goto fail; 2873 } 2874 2875 nvme->n_nssr_supported = cap.b.cap_nssrs; 2876 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2877 nvme->n_timeout = cap.b.cap_to; 2878 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2879 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2880 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2881 2882 /* 2883 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2884 * the base page size of 4k (1<<12), so add 12 here to get the real 2885 * page size value. 2886 */ 2887 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2888 cap.b.cap_mpsmax + 12); 2889 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2890 2891 /* 2892 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 2893 */ 2894 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2895 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2896 2897 /* 2898 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2899 * Maxxfer may be increased after we identified the controller limits. 2900 */ 2901 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2902 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2903 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2904 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2905 2906 /* 2907 * Reset controller if it's still in ready state. 2908 */ 2909 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2910 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2911 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2912 nvme->n_dead = B_TRUE; 2913 goto fail; 2914 } 2915 2916 /* 2917 * Create the cq array with one completion queue to be assigned 2918 * to the admin queue pair and a limited number of taskqs (4). 2919 */ 2920 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 2921 DDI_SUCCESS) { 2922 dev_err(nvme->n_dip, CE_WARN, 2923 "!failed to pre-allocate admin completion queue"); 2924 goto fail; 2925 } 2926 /* 2927 * Create the admin queue pair. 
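	 *
	 * Once the qpair is allocated, its length and the physical addresses
	 * of the admin submission and completion rings are programmed into
	 * the AQA, ASQ and ACQ registers below, before the controller is
	 * enabled through CC.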
2928 */ 2929 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2930 != DDI_SUCCESS) { 2931 dev_err(nvme->n_dip, CE_WARN, 2932 "!unable to allocate admin qpair"); 2933 goto fail; 2934 } 2935 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2936 nvme->n_ioq[0] = nvme->n_adminq; 2937 2938 nvme->n_progress |= NVME_ADMIN_QUEUE; 2939 2940 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2941 "admin-queue-len", nvme->n_admin_queue_len); 2942 2943 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 2944 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 2945 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 2946 2947 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 2948 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 2949 2950 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 2951 nvme_put64(nvme, NVME_REG_ASQ, asq); 2952 nvme_put64(nvme, NVME_REG_ACQ, acq); 2953 2954 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 2955 cc.b.cc_css = 0; /* use NVM command set */ 2956 cc.b.cc_mps = nvme->n_pageshift - 12; 2957 cc.b.cc_shn = 0; /* no shutdown in progress */ 2958 cc.b.cc_en = 1; /* enable controller */ 2959 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 2960 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 2961 2962 nvme_put32(nvme, NVME_REG_CC, cc.r); 2963 2964 /* 2965 * Wait for the controller to become ready. 2966 */ 2967 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2968 if (csts.b.csts_rdy == 0) { 2969 for (i = 0; i != nvme->n_timeout * 10; i++) { 2970 delay(drv_usectohz(50000)); 2971 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2972 2973 if (csts.b.csts_cfs == 1) { 2974 dev_err(nvme->n_dip, CE_WARN, 2975 "!controller fatal status at init"); 2976 ddi_fm_service_impact(nvme->n_dip, 2977 DDI_SERVICE_LOST); 2978 nvme->n_dead = B_TRUE; 2979 goto fail; 2980 } 2981 2982 if (csts.b.csts_rdy == 1) 2983 break; 2984 } 2985 } 2986 2987 if (csts.b.csts_rdy == 0) { 2988 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2989 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2990 nvme->n_dead = B_TRUE; 2991 goto fail; 2992 } 2993 2994 /* 2995 * Assume an abort command limit of 1. We'll destroy and re-init 2996 * that later when we know the true abort command limit. 2997 */ 2998 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2999 3000 /* 3001 * Set up initial interrupt for admin queue. 3002 */ 3003 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 3004 != DDI_SUCCESS) && 3005 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 3006 != DDI_SUCCESS) && 3007 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 3008 != DDI_SUCCESS)) { 3009 dev_err(nvme->n_dip, CE_WARN, 3010 "!failed to setup initial interrupt"); 3011 goto fail; 3012 } 3013 3014 /* 3015 * Post an asynchronous event command to catch errors. 3016 * We assume the asynchronous events are supported as required by 3017 * specification (Figure 40 in section 5 of NVMe 1.2). 3018 * However, since at least qemu does not follow the specification, 3019 * we need a mechanism to protect ourselves. 
3020 */ 3021 nvme->n_async_event_supported = B_TRUE; 3022 nvme_async_event(nvme); 3023 3024 /* 3025 * Identify Controller 3026 */ 3027 if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) { 3028 dev_err(nvme->n_dip, CE_WARN, 3029 "!failed to identify controller"); 3030 goto fail; 3031 } 3032 3033 /* 3034 * Get Vendor & Product ID 3035 */ 3036 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 3037 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 3038 sata_split_model(model, &vendor, &product); 3039 3040 if (vendor == NULL) 3041 nvme->n_vendor = strdup("NVMe"); 3042 else 3043 nvme->n_vendor = strdup(vendor); 3044 3045 nvme->n_product = strdup(product); 3046 3047 /* 3048 * Get controller limits. 3049 */ 3050 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 3051 MIN(nvme->n_admin_queue_len / 10, 3052 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 3053 3054 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3055 "async-event-limit", nvme->n_async_event_limit); 3056 3057 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 3058 3059 /* 3060 * Reinitialize the semaphore with the true abort command limit 3061 * supported by the hardware. It's not necessary to disable interrupts 3062 * as only command aborts use the semaphore, and no commands are 3063 * executed or aborted while we're here. 3064 */ 3065 sema_destroy(&nvme->n_abort_sema); 3066 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 3067 SEMA_DRIVER, NULL); 3068 3069 nvme->n_progress |= NVME_CTRL_LIMITS; 3070 3071 if (nvme->n_idctl->id_mdts == 0) 3072 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 3073 else 3074 nvme->n_max_data_transfer_size = 3075 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 3076 3077 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 3078 3079 /* 3080 * Limit n_max_data_transfer_size to what we can handle in one PRP. 3081 * Chained PRPs are currently unsupported. 3082 * 3083 * This is a no-op on hardware which doesn't support a transfer size 3084 * big enough to require chained PRPs. 3085 */ 3086 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 3087 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 3088 3089 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 3090 3091 /* 3092 * Make sure the minimum/maximum queue entry sizes are not 3093 * larger/smaller than the default. 3094 */ 3095 3096 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 3097 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 3098 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 3099 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 3100 goto fail; 3101 3102 /* 3103 * Check for the presence of a Volatile Write Cache. If present, 3104 * enable or disable based on the value of the property 3105 * volatile-write-cache-enable (default is enabled). 3106 */ 3107 nvme->n_write_cache_present = 3108 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 3109 3110 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3111 "volatile-write-cache-present", 3112 nvme->n_write_cache_present ? 1 : 0); 3113 3114 if (!nvme->n_write_cache_present) { 3115 nvme->n_write_cache_enabled = B_FALSE; 3116 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 3117 != 0) { 3118 dev_err(nvme->n_dip, CE_WARN, 3119 "!failed to %sable volatile write cache", 3120 nvme->n_write_cache_enabled ? 
"en" : "dis"); 3121 /* 3122 * Assume the cache is (still) enabled. 3123 */ 3124 nvme->n_write_cache_enabled = B_TRUE; 3125 } 3126 3127 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3128 "volatile-write-cache-enable", 3129 nvme->n_write_cache_enabled ? 1 : 0); 3130 3131 /* 3132 * Assume LBA Range Type feature is supported. If it isn't this 3133 * will be set to B_FALSE by nvme_get_features(). 3134 */ 3135 nvme->n_lba_range_supported = B_TRUE; 3136 3137 /* 3138 * Check support for Autonomous Power State Transition. 3139 */ 3140 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 3141 nvme->n_auto_pst_supported = 3142 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 3143 3144 /* 3145 * Assume Software Progress Marker feature is supported. If it isn't 3146 * this will be set to B_FALSE by nvme_get_features(). 3147 */ 3148 nvme->n_progress_supported = B_TRUE; 3149 3150 /* 3151 * Identify Namespaces 3152 */ 3153 nvme->n_namespace_count = nvme->n_idctl->id_nn; 3154 3155 if (nvme->n_namespace_count == 0) { 3156 dev_err(nvme->n_dip, CE_WARN, 3157 "!controllers without namespaces are not supported"); 3158 goto fail; 3159 } 3160 3161 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 3162 dev_err(nvme->n_dip, CE_WARN, 3163 "!too many namespaces: %d, limiting to %d\n", 3164 nvme->n_namespace_count, NVME_MINOR_MAX); 3165 nvme->n_namespace_count = NVME_MINOR_MAX; 3166 } 3167 3168 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 3169 nvme->n_namespace_count, KM_SLEEP); 3170 3171 for (i = 0; i != nvme->n_namespace_count; i++) { 3172 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, 3173 NULL); 3174 nvme->n_ns[i].ns_ignore = B_TRUE; 3175 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) 3176 goto fail; 3177 } 3178 3179 /* 3180 * Try to set up MSI/MSI-X interrupts. 3181 */ 3182 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 3183 != 0) { 3184 nvme_release_interrupts(nvme); 3185 3186 nqueues = MIN(UINT16_MAX, ncpus); 3187 3188 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 3189 nqueues) != DDI_SUCCESS) && 3190 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 3191 nqueues) != DDI_SUCCESS)) { 3192 dev_err(nvme->n_dip, CE_WARN, 3193 "!failed to setup MSI/MSI-X interrupts"); 3194 goto fail; 3195 } 3196 } 3197 3198 /* 3199 * Create I/O queue pairs. 3200 */ 3201 3202 if (nvme_set_nqueues(nvme) != 0) { 3203 dev_err(nvme->n_dip, CE_WARN, 3204 "!failed to set number of I/O queues to %d", 3205 nvme->n_intr_cnt); 3206 goto fail; 3207 } 3208 3209 /* 3210 * Reallocate I/O queue array 3211 */ 3212 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 3213 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 3214 (nvme->n_submission_queues + 1), KM_SLEEP); 3215 nvme->n_ioq[0] = nvme->n_adminq; 3216 3217 /* 3218 * There should always be at least as many submission queues 3219 * as completion queues. 3220 */ 3221 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 3222 3223 nvme->n_ioq_count = nvme->n_submission_queues; 3224 3225 nvme->n_io_squeue_len = 3226 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 3227 3228 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 3229 nvme->n_io_squeue_len); 3230 3231 /* 3232 * Pre-allocate completion queues. 3233 * When there are the same number of submission and completion 3234 * queues there is no value in having a larger completion 3235 * queue length. 
	 */
	if (nvme->n_submission_queues == nvme->n_completion_queues)
		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
		    nvme->n_io_squeue_len);

	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
	    nvme->n_max_queue_entries);

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
	    nvme->n_io_cqueue_len);

	/*
	 * Assign an equal number of taskq threads to each completion
	 * queue, capping the total number of threads at the number
	 * of CPUs.
	 */
	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;

	/*
	 * In case the calculation above is zero, we need at least one
	 * thread per completion queue.
	 */
	tq_threads = MAX(1, tq_threads);

	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to pre-allocate completion queues");
		goto fail;
	}

	/*
	 * If we use fewer completion queues than interrupt vectors, return
	 * some of the interrupt vectors back to the system.
	 */
	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
		nvme_release_interrupts(nvme);

		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to reduce number of interrupts");
			goto fail;
		}
	}

	/*
	 * Alloc & register I/O queue pairs
	 */

	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to allocate I/O qpair %d", i);
			goto fail;
		}

		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to create I/O qpair %d", i);
			goto fail;
		}
	}

	/*
	 * Post more asynchronous event commands to reduce event reporting
	 * latency as suggested by the spec.
	 */
	if (nvme->n_async_event_supported) {
		for (i = 1; i != nvme->n_async_event_limit; i++)
			nvme_async_event(nvme);
	}

	return (DDI_SUCCESS);

fail:
	(void) nvme_reset(nvme, B_FALSE);
	return (DDI_FAILURE);
}

static uint_t
nvme_intr(caddr_t arg1, caddr_t arg2)
{
	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	nvme_t *nvme = (nvme_t *)arg1;
	int inum = (int)(uintptr_t)arg2;
	int ccnt = 0;
	int qnum;

	if (inum >= nvme->n_intr_cnt)
		return (DDI_INTR_UNCLAIMED);

	if (nvme->n_dead)
		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);

	/*
	 * The interrupt vector a queue uses is calculated as queue_idx %
	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
	 * in steps of n_intr_cnt to process all queues using this vector.
	 */
	for (qnum = inum;
	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
	    qnum += nvme->n_intr_cnt) {
		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
	}

	return (ccnt > 0 ?
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 3345 } 3346 3347 static void 3348 nvme_release_interrupts(nvme_t *nvme) 3349 { 3350 int i; 3351 3352 for (i = 0; i < nvme->n_intr_cnt; i++) { 3353 if (nvme->n_inth[i] == NULL) 3354 break; 3355 3356 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3357 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 3358 else 3359 (void) ddi_intr_disable(nvme->n_inth[i]); 3360 3361 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 3362 (void) ddi_intr_free(nvme->n_inth[i]); 3363 } 3364 3365 kmem_free(nvme->n_inth, nvme->n_inth_sz); 3366 nvme->n_inth = NULL; 3367 nvme->n_inth_sz = 0; 3368 3369 nvme->n_progress &= ~NVME_INTERRUPTS; 3370 } 3371 3372 static int 3373 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 3374 { 3375 int nintrs, navail, count; 3376 int ret; 3377 int i; 3378 3379 if (nvme->n_intr_types == 0) { 3380 ret = ddi_intr_get_supported_types(nvme->n_dip, 3381 &nvme->n_intr_types); 3382 if (ret != DDI_SUCCESS) { 3383 dev_err(nvme->n_dip, CE_WARN, 3384 "!%s: ddi_intr_get_supported types failed", 3385 __func__); 3386 return (ret); 3387 } 3388 #ifdef __x86 3389 if (get_hwenv() == HW_VMWARE) 3390 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 3391 #endif 3392 } 3393 3394 if ((nvme->n_intr_types & intr_type) == 0) 3395 return (DDI_FAILURE); 3396 3397 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 3398 if (ret != DDI_SUCCESS) { 3399 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 3400 __func__); 3401 return (ret); 3402 } 3403 3404 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 3405 if (ret != DDI_SUCCESS) { 3406 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 3407 __func__); 3408 return (ret); 3409 } 3410 3411 /* We want at most one interrupt per queue pair. 
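	 * navail is clamped to the requested queue pair count just below;
	 * ddi_intr_alloc() may still return fewer vectors than that, in
	 * which case n_intr_cnt reflects the actual count and queues share
	 * vectors as described in nvme_intr().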
*/ 3412 if (navail > nqpairs) 3413 navail = nqpairs; 3414 3415 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 3416 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 3417 3418 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 3419 &count, 0); 3420 if (ret != DDI_SUCCESS) { 3421 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 3422 __func__); 3423 goto fail; 3424 } 3425 3426 nvme->n_intr_cnt = count; 3427 3428 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 3429 if (ret != DDI_SUCCESS) { 3430 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 3431 __func__); 3432 goto fail; 3433 } 3434 3435 for (i = 0; i < count; i++) { 3436 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 3437 (void *)nvme, (void *)(uintptr_t)i); 3438 if (ret != DDI_SUCCESS) { 3439 dev_err(nvme->n_dip, CE_WARN, 3440 "!%s: ddi_intr_add_handler failed", __func__); 3441 goto fail; 3442 } 3443 } 3444 3445 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 3446 3447 for (i = 0; i < count; i++) { 3448 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3449 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 3450 else 3451 ret = ddi_intr_enable(nvme->n_inth[i]); 3452 3453 if (ret != DDI_SUCCESS) { 3454 dev_err(nvme->n_dip, CE_WARN, 3455 "!%s: enabling interrupt %d failed", __func__, i); 3456 goto fail; 3457 } 3458 } 3459 3460 nvme->n_intr_type = intr_type; 3461 3462 nvme->n_progress |= NVME_INTERRUPTS; 3463 3464 return (DDI_SUCCESS); 3465 3466 fail: 3467 nvme_release_interrupts(nvme); 3468 3469 return (ret); 3470 } 3471 3472 static int 3473 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 3474 { 3475 _NOTE(ARGUNUSED(arg)); 3476 3477 pci_ereport_post(dip, fm_error, NULL); 3478 return (fm_error->fme_status); 3479 } 3480 3481 static void 3482 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, 3483 void *b) 3484 { 3485 nvme_t *nvme = a; 3486 3487 nvme->n_dead = B_TRUE; 3488 3489 /* 3490 * Fail all outstanding commands, including those in the admin queue 3491 * (queue 0). 3492 */ 3493 for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { 3494 nvme_qpair_t *qp = nvme->n_ioq[i]; 3495 3496 mutex_enter(&qp->nq_mutex); 3497 for (size_t j = 0; j < qp->nq_nentry; j++) { 3498 nvme_cmd_t *cmd = qp->nq_cmd[j]; 3499 nvme_cmd_t *u_cmd; 3500 3501 if (cmd == NULL) { 3502 continue; 3503 } 3504 3505 /* 3506 * Since we have the queue lock held the entire time we 3507 * iterate over it, it's not possible for the queue to 3508 * change underneath us. Thus, we don't need to check 3509 * that the return value of nvme_unqueue_cmd matches the 3510 * requested cmd to unqueue. 3511 */ 3512 u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 3513 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, 3514 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 3515 3516 ASSERT3P(u_cmd, ==, cmd); 3517 } 3518 mutex_exit(&qp->nq_mutex); 3519 } 3520 } 3521 3522 static int 3523 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3524 { 3525 nvme_t *nvme; 3526 int instance; 3527 int nregs; 3528 off_t regsize; 3529 int i; 3530 char name[32]; 3531 bd_ops_t ops = nvme_bd_ops; 3532 3533 if (cmd != DDI_ATTACH) 3534 return (DDI_FAILURE); 3535 3536 instance = ddi_get_instance(dip); 3537 3538 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 3539 return (DDI_FAILURE); 3540 3541 nvme = ddi_get_soft_state(nvme_state, instance); 3542 ddi_set_driver_private(dip, nvme); 3543 nvme->n_dip = dip; 3544 3545 /* Set up event handlers for hot removal. 
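	 * On surprise removal nvme_remove_callback() marks the controller
	 * dead and fails every command still outstanding on the admin and
	 * I/O queues, so blkdev and any waiting threads are unblocked.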
*/ 3546 if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, 3547 &nvme->n_rm_cookie) != DDI_SUCCESS) { 3548 goto fail; 3549 } 3550 if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, 3551 nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != 3552 DDI_SUCCESS) { 3553 goto fail; 3554 } 3555 3556 mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); 3557 3558 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3559 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 3560 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 3561 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 3562 B_TRUE : B_FALSE; 3563 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3564 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 3565 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3566 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 3567 /* 3568 * Double up the default for completion queues in case of 3569 * queue sharing. 3570 */ 3571 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3572 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 3573 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3574 DDI_PROP_DONTPASS, "async-event-limit", 3575 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 3576 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3577 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 3578 B_TRUE : B_FALSE; 3579 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3580 DDI_PROP_DONTPASS, "min-phys-block-size", 3581 NVME_DEFAULT_MIN_BLOCK_SIZE); 3582 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3583 DDI_PROP_DONTPASS, "max-submission-queues", -1); 3584 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3585 DDI_PROP_DONTPASS, "max-completion-queues", -1); 3586 3587 if (!ISP2(nvme->n_min_block_size) || 3588 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 3589 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 3590 "using default %d", ISP2(nvme->n_min_block_size) ? 3591 "too low" : "not a power of 2", 3592 NVME_DEFAULT_MIN_BLOCK_SIZE); 3593 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3594 } 3595 3596 if (nvme->n_submission_queues != -1 && 3597 (nvme->n_submission_queues < 1 || 3598 nvme->n_submission_queues > UINT16_MAX)) { 3599 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 3600 "valid. Must be [1..%d]", nvme->n_submission_queues, 3601 UINT16_MAX); 3602 nvme->n_submission_queues = -1; 3603 } 3604 3605 if (nvme->n_completion_queues != -1 && 3606 (nvme->n_completion_queues < 1 || 3607 nvme->n_completion_queues > UINT16_MAX)) { 3608 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 3609 "valid. 
Must be [1..%d]", nvme->n_completion_queues, 3610 UINT16_MAX); 3611 nvme->n_completion_queues = -1; 3612 } 3613 3614 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 3615 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 3616 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 3617 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 3618 3619 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 3620 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 3621 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 3622 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 3623 3624 if (nvme->n_async_event_limit < 1) 3625 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 3626 3627 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 3628 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 3629 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 3630 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 3631 3632 /* 3633 * Set up FMA support. 3634 */ 3635 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 3636 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 3637 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 3638 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 3639 3640 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 3641 3642 if (nvme->n_fm_cap) { 3643 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 3644 nvme->n_reg_acc_attr.devacc_attr_access = 3645 DDI_FLAGERR_ACC; 3646 3647 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 3648 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3649 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3650 } 3651 3652 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3653 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3654 pci_ereport_setup(dip); 3655 3656 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3657 ddi_fm_handler_register(dip, nvme_fm_errcb, 3658 (void *)nvme); 3659 } 3660 3661 nvme->n_progress |= NVME_FMA_INIT; 3662 3663 /* 3664 * The spec defines several register sets. Only the controller 3665 * registers (set 1) are currently used. 3666 */ 3667 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 3668 nregs < 2 || 3669 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 3670 goto fail; 3671 3672 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 3673 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 3674 dev_err(dip, CE_WARN, "!failed to map regset 1"); 3675 goto fail; 3676 } 3677 3678 nvme->n_progress |= NVME_REGS_MAPPED; 3679 3680 /* 3681 * Create PRP DMA cache 3682 */ 3683 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 3684 ddi_driver_name(dip), ddi_get_instance(dip)); 3685 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 3686 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 3687 NULL, (void *)nvme, NULL, 0); 3688 3689 if (nvme_init(nvme) != DDI_SUCCESS) 3690 goto fail; 3691 3692 if (!nvme->n_idctl->id_oncs.on_dset_mgmt) 3693 ops.o_free_space = NULL; 3694 3695 /* 3696 * Initialize the driver with the UFM subsystem 3697 */ 3698 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 3699 &nvme->n_ufmh, nvme) != 0) { 3700 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 3701 goto fail; 3702 } 3703 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 3704 ddi_ufm_update(nvme->n_ufmh); 3705 nvme->n_progress |= NVME_UFM_INIT; 3706 3707 /* 3708 * Attach the blkdev driver for each namespace. 
3709 */ 3710 for (i = 0; i != nvme->n_namespace_count; i++) { 3711 if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, 3712 S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), 3713 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 3714 dev_err(dip, CE_WARN, 3715 "!failed to create minor node for namespace %d", i); 3716 goto fail; 3717 } 3718 3719 if (nvme->n_ns[i].ns_ignore) 3720 continue; 3721 3722 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], 3723 &ops, &nvme->n_prp_dma_attr, KM_SLEEP); 3724 3725 if (nvme->n_ns[i].ns_bd_hdl == NULL) { 3726 dev_err(dip, CE_WARN, 3727 "!failed to get blkdev handle for namespace %d", i); 3728 goto fail; 3729 } 3730 3731 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) 3732 != DDI_SUCCESS) { 3733 dev_err(dip, CE_WARN, 3734 "!failed to attach blkdev handle for namespace %d", 3735 i); 3736 goto fail; 3737 } 3738 } 3739 3740 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 3741 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) 3742 != DDI_SUCCESS) { 3743 dev_err(dip, CE_WARN, "nvme_attach: " 3744 "cannot create devctl minor node"); 3745 goto fail; 3746 } 3747 3748 return (DDI_SUCCESS); 3749 3750 fail: 3751 /* attach successful anyway so that FMA can retire the device */ 3752 if (nvme->n_dead) 3753 return (DDI_SUCCESS); 3754 3755 (void) nvme_detach(dip, DDI_DETACH); 3756 3757 return (DDI_FAILURE); 3758 } 3759 3760 static int 3761 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3762 { 3763 int instance, i; 3764 nvme_t *nvme; 3765 3766 if (cmd != DDI_DETACH) 3767 return (DDI_FAILURE); 3768 3769 instance = ddi_get_instance(dip); 3770 3771 nvme = ddi_get_soft_state(nvme_state, instance); 3772 3773 if (nvme == NULL) 3774 return (DDI_FAILURE); 3775 3776 ddi_remove_minor_node(dip, "devctl"); 3777 mutex_destroy(&nvme->n_minor.nm_mutex); 3778 3779 if (nvme->n_ns) { 3780 for (i = 0; i != nvme->n_namespace_count; i++) { 3781 ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); 3782 mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); 3783 3784 if (nvme->n_ns[i].ns_bd_hdl) { 3785 (void) bd_detach_handle( 3786 nvme->n_ns[i].ns_bd_hdl); 3787 bd_free_handle(nvme->n_ns[i].ns_bd_hdl); 3788 } 3789 3790 if (nvme->n_ns[i].ns_idns) 3791 kmem_free(nvme->n_ns[i].ns_idns, 3792 sizeof (nvme_identify_nsid_t)); 3793 if (nvme->n_ns[i].ns_devid) 3794 strfree(nvme->n_ns[i].ns_devid); 3795 } 3796 3797 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 3798 nvme->n_namespace_count); 3799 } 3800 if (nvme->n_progress & NVME_UFM_INIT) { 3801 ddi_ufm_fini(nvme->n_ufmh); 3802 mutex_destroy(&nvme->n_fwslot_mutex); 3803 } 3804 3805 if (nvme->n_progress & NVME_INTERRUPTS) 3806 nvme_release_interrupts(nvme); 3807 3808 for (i = 0; i < nvme->n_cq_count; i++) { 3809 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) 3810 taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); 3811 } 3812 3813 if (nvme->n_ioq_count > 0) { 3814 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3815 if (nvme->n_ioq[i] != NULL) { 3816 /* TODO: send destroy queue commands */ 3817 nvme_free_qpair(nvme->n_ioq[i]); 3818 } 3819 } 3820 3821 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 3822 (nvme->n_ioq_count + 1)); 3823 } 3824 3825 if (nvme->n_prp_cache != NULL) { 3826 kmem_cache_destroy(nvme->n_prp_cache); 3827 } 3828 3829 if (nvme->n_progress & NVME_REGS_MAPPED) { 3830 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 3831 (void) nvme_reset(nvme, B_FALSE); 3832 } 3833 3834 if (nvme->n_progress & NVME_CTRL_LIMITS) 3835 sema_destroy(&nvme->n_abort_sema); 3836 3837 if (nvme->n_progress & NVME_ADMIN_QUEUE) 3838 
nvme_free_qpair(nvme->n_adminq); 3839 3840 if (nvme->n_cq_count > 0) { 3841 nvme_destroy_cq_array(nvme, 0); 3842 nvme->n_cq = NULL; 3843 nvme->n_cq_count = 0; 3844 } 3845 3846 if (nvme->n_idctl) 3847 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); 3848 3849 if (nvme->n_progress & NVME_REGS_MAPPED) 3850 ddi_regs_map_free(&nvme->n_regh); 3851 3852 if (nvme->n_progress & NVME_FMA_INIT) { 3853 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3854 ddi_fm_handler_unregister(nvme->n_dip); 3855 3856 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3857 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3858 pci_ereport_teardown(nvme->n_dip); 3859 3860 ddi_fm_fini(nvme->n_dip); 3861 } 3862 3863 if (nvme->n_vendor != NULL) 3864 strfree(nvme->n_vendor); 3865 3866 if (nvme->n_product != NULL) 3867 strfree(nvme->n_product); 3868 3869 /* Clean up hot removal event handler. */ 3870 if (nvme->n_ev_rm_cb_id != NULL) { 3871 (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id); 3872 } 3873 nvme->n_ev_rm_cb_id = NULL; 3874 3875 ddi_soft_state_free(nvme_state, instance); 3876 3877 return (DDI_SUCCESS); 3878 } 3879 3880 static int 3881 nvme_quiesce(dev_info_t *dip) 3882 { 3883 int instance; 3884 nvme_t *nvme; 3885 3886 instance = ddi_get_instance(dip); 3887 3888 nvme = ddi_get_soft_state(nvme_state, instance); 3889 3890 if (nvme == NULL) 3891 return (DDI_FAILURE); 3892 3893 nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); 3894 3895 (void) nvme_reset(nvme, B_TRUE); 3896 3897 return (DDI_FAILURE); 3898 } 3899 3900 static int 3901 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma) 3902 { 3903 nvme_t *nvme = cmd->nc_nvme; 3904 uint_t nprp_per_page, nprp; 3905 uint64_t *prp; 3906 const ddi_dma_cookie_t *cookie; 3907 uint_t idx; 3908 uint_t ncookies = ddi_dma_ncookies(dma); 3909 3910 if (ncookies == 0) 3911 return (DDI_FAILURE); 3912 3913 if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL) 3914 return (DDI_FAILURE); 3915 cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress; 3916 3917 if (ncookies == 1) { 3918 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 3919 return (DDI_SUCCESS); 3920 } else if (ncookies == 2) { 3921 if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL) 3922 return (DDI_FAILURE); 3923 cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress; 3924 return (DDI_SUCCESS); 3925 } 3926 3927 /* 3928 * At this point, we're always operating on cookies at 3929 * index >= 1 and writing the addresses of those cookies 3930 * into a new page. The address of that page is stored 3931 * as the second PRP entry. 3932 */ 3933 nprp_per_page = nvme->n_pagesize / sizeof (uint64_t); 3934 ASSERT(nprp_per_page > 0); 3935 3936 /* 3937 * We currently don't support chained PRPs and set up our DMA 3938 * attributes to reflect that. If we still get an I/O request 3939 * that needs a chained PRP something is very wrong. Account 3940 * for the first cookie here, which we've placed in d_prp[0]. 3941 */ 3942 nprp = howmany(ncookies - 1, nprp_per_page); 3943 VERIFY(nprp == 1); 3944 3945 /* 3946 * Allocate a page of pointers, in which we'll write the 3947 * addresses of cookies 1 to `ncookies`. 
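 * For example, a transfer that maps to four cookies keeps cookie 0 in
 * d_prp[0], writes the addresses of cookies 1 through 3 to the start of
 * this page, and sets d_prp[1] to the DMA address of the page.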
3948 */ 3949 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 3950 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 3951 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress; 3952 3953 prp = (uint64_t *)cmd->nc_prp->nd_memp; 3954 for (idx = 1; idx < ncookies; idx++) { 3955 if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL) 3956 return (DDI_FAILURE); 3957 *prp++ = cookie->dmac_laddress; 3958 } 3959 3960 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 3961 DDI_DMA_SYNC_FORDEV); 3962 return (DDI_SUCCESS); 3963 } 3964 3965 /* 3966 * The maximum number of ranges supported in a single deallocate request is 3967 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and 3968 * unchanged through at least 1.4a). The definition of nvme_range_t is also 3969 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for 3970 * a deallocate request will fit into the smallest supported namespace page 3971 * (4k). 3972 */ 3973 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096); 3974 3975 static int 3976 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize, 3977 int allocflag) 3978 { 3979 const dkioc_free_list_t *dfl = xfer->x_dfl; 3980 const dkioc_free_list_ext_t *exts = dfl->dfl_exts; 3981 nvme_t *nvme = cmd->nc_nvme; 3982 nvme_range_t *ranges = NULL; 3983 uint_t i; 3984 3985 /* 3986 * The number of ranges in the request is zero-based (that is, 3987 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ..., 3988 * word10 == 255 -> 256 ranges). Therefore the allowed values are 3989 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request, 3990 * we either provided bad info in nvme_bd_driveinfo() or there is a bug 3991 * in blkdev. 3992 */ 3993 VERIFY3U(dfl->dfl_num_exts, >, 0); 3994 VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES); 3995 cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff; 3996 3997 cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE; 3998 3999 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag); 4000 if (cmd->nc_prp == NULL) 4001 return (DDI_FAILURE); 4002 4003 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 4004 ranges = (nvme_range_t *)cmd->nc_prp->nd_memp; 4005 4006 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress; 4007 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 4008 4009 for (i = 0; i < dfl->dfl_num_exts; i++) { 4010 uint64_t lba, len; 4011 4012 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize; 4013 len = exts[i].dfle_length / blocksize; 4014 4015 VERIFY3U(len, <=, UINT32_MAX); 4016 4017 /* No context attributes for a deallocate request */ 4018 ranges[i].nr_ctxattr = 0; 4019 ranges[i].nr_len = len; 4020 ranges[i].nr_lba = lba; 4021 } 4022 4023 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 4024 DDI_DMA_SYNC_FORDEV); 4025 4026 return (DDI_SUCCESS); 4027 } 4028 4029 static nvme_cmd_t * 4030 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 4031 { 4032 nvme_t *nvme = ns->ns_nvme; 4033 nvme_cmd_t *cmd; 4034 int allocflag; 4035 4036 /* 4037 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 4038 */ 4039 allocflag = (xfer->x_flags & BD_XFER_POLL) ?
KM_NOSLEEP : KM_SLEEP; 4040 cmd = nvme_alloc_cmd(nvme, allocflag); 4041 4042 if (cmd == NULL) 4043 return (NULL); 4044 4045 cmd->nc_sqe.sqe_opc = opc; 4046 cmd->nc_callback = nvme_bd_xfer_done; 4047 cmd->nc_xfer = xfer; 4048 4049 switch (opc) { 4050 case NVME_OPC_NVM_WRITE: 4051 case NVME_OPC_NVM_READ: 4052 VERIFY(xfer->x_nblks <= 0x10000); 4053 4054 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4055 4056 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 4057 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 4058 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 4059 4060 if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS) 4061 goto fail; 4062 break; 4063 4064 case NVME_OPC_NVM_FLUSH: 4065 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4066 break; 4067 4068 case NVME_OPC_NVM_DSET_MGMT: 4069 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4070 4071 if (nvme_fill_ranges(cmd, xfer, 4072 (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS) 4073 goto fail; 4074 break; 4075 4076 default: 4077 goto fail; 4078 } 4079 4080 return (cmd); 4081 4082 fail: 4083 nvme_free_cmd(cmd); 4084 return (NULL); 4085 } 4086 4087 static void 4088 nvme_bd_xfer_done(void *arg) 4089 { 4090 nvme_cmd_t *cmd = arg; 4091 bd_xfer_t *xfer = cmd->nc_xfer; 4092 int error = 0; 4093 4094 error = nvme_check_cmd_status(cmd); 4095 nvme_free_cmd(cmd); 4096 4097 bd_xfer_done(xfer, error); 4098 } 4099 4100 static void 4101 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 4102 { 4103 nvme_namespace_t *ns = arg; 4104 nvme_t *nvme = ns->ns_nvme; 4105 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); 4106 4107 /* 4108 * Set the blkdev qcount to the number of submission queues. 4109 * Blkdev will then create one waitq/runq pair for each submission 4110 * queue and spread I/O requests across the queues. 4111 */ 4112 drive->d_qcount = nvme->n_ioq_count; 4113 4114 /* 4115 * I/O activity to individual namespaces is distributed across 4116 * each of the d_qcount blkdev queues (which has been set to 4117 * the number of nvme submission queues). d_qsize is the number 4118 * of submitted and not completed I/Os within each queue that blkdev 4119 * will allow before it starts holding them in the waitq. 4120 * 4121 * Each namespace creates a child blkdev instance; for each one 4122 * we try to set d_qsize so that every namespace gets an equal 4123 * share of the submission queue. 4124 * 4125 * If n_namespaces_attachable changes after the nvme device has been 4126 * instantiated and another namespace is attached, that namespace may 4127 * calculate a different d_qsize, and the sum of all d_qsizes may then 4128 * exceed the submission queue length. Should that happen, and the I/O 4129 * rate cause blkdev to attempt to submit more I/Os than fit into the 4130 * submission queue, the excess I/Os will be held behind the 4131 * semaphore nq_sema. 4132 */ 4133 drive->d_qsize = nvme->n_io_squeue_len / ns_count; 4134 4135 /* 4136 * Don't let the queue size drop below the minimum, though. 4137 */ 4138 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); 4139 4140 /* 4141 * d_maxxfer is not set, which means the value is taken from the DMA 4142 * attributes specified to bd_alloc_handle.
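 * In this driver that is the PRP DMA attribute (n_prp_dma_attr) passed
 * to bd_alloc_handle() in nvme_attach().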
4143 */ 4144 4145 drive->d_removable = B_FALSE; 4146 drive->d_hotpluggable = B_FALSE; 4147 4148 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 4149 drive->d_target = ns->ns_id; 4150 drive->d_lun = 0; 4151 4152 drive->d_model = nvme->n_idctl->id_model; 4153 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 4154 drive->d_vendor = nvme->n_vendor; 4155 drive->d_vendor_len = strlen(nvme->n_vendor); 4156 drive->d_product = nvme->n_product; 4157 drive->d_product_len = strlen(nvme->n_product); 4158 drive->d_serial = nvme->n_idctl->id_serial; 4159 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 4160 drive->d_revision = nvme->n_idctl->id_fwrev; 4161 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 4162 4163 /* 4164 * If we support the dataset management command, the only restrictions 4165 * on a discard request are the maximum number of ranges (segments) 4166 * per single request. 4167 */ 4168 if (nvme->n_idctl->id_oncs.on_dset_mgmt) 4169 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES; 4170 } 4171 4172 static int 4173 nvme_bd_mediainfo(void *arg, bd_media_t *media) 4174 { 4175 nvme_namespace_t *ns = arg; 4176 nvme_t *nvme = ns->ns_nvme; 4177 4178 if (nvme->n_dead) { 4179 return (EIO); 4180 } 4181 4182 media->m_nblks = ns->ns_block_count; 4183 media->m_blksize = ns->ns_block_size; 4184 media->m_readonly = B_FALSE; 4185 media->m_solidstate = B_TRUE; 4186 4187 media->m_pblksize = ns->ns_best_block_size; 4188 4189 return (0); 4190 } 4191 4192 static int 4193 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 4194 { 4195 nvme_t *nvme = ns->ns_nvme; 4196 nvme_cmd_t *cmd; 4197 nvme_qpair_t *ioq; 4198 boolean_t poll; 4199 int ret; 4200 4201 if (nvme->n_dead) { 4202 return (EIO); 4203 } 4204 4205 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 4206 if (cmd == NULL) 4207 return (ENOMEM); 4208 4209 cmd->nc_sqid = xfer->x_qnum + 1; 4210 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4211 ioq = nvme->n_ioq[cmd->nc_sqid]; 4212 4213 /* 4214 * Get the polling flag before submitting the command. The command may 4215 * complete immediately after it was submitted, which means we must 4216 * treat both cmd and xfer as if they have been freed already. 4217 */ 4218 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 4219 4220 ret = nvme_submit_io_cmd(ioq, cmd); 4221 4222 if (ret != 0) 4223 return (ret); 4224 4225 if (!poll) 4226 return (0); 4227 4228 do { 4229 cmd = nvme_retrieve_cmd(nvme, ioq); 4230 if (cmd != NULL) 4231 cmd->nc_callback(cmd); 4232 else 4233 drv_usecwait(10); 4234 } while (ioq->nq_active_cmds != 0); 4235 4236 return (0); 4237 } 4238 4239 static int 4240 nvme_bd_read(void *arg, bd_xfer_t *xfer) 4241 { 4242 nvme_namespace_t *ns = arg; 4243 4244 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 4245 } 4246 4247 static int 4248 nvme_bd_write(void *arg, bd_xfer_t *xfer) 4249 { 4250 nvme_namespace_t *ns = arg; 4251 4252 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 4253 } 4254 4255 static int 4256 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 4257 { 4258 nvme_namespace_t *ns = arg; 4259 4260 if (ns->ns_nvme->n_dead) 4261 return (EIO); 4262 4263 /* 4264 * If the volatile write cache is not present or not enabled the FLUSH 4265 * command is a no-op, so we can take a shortcut here. 
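 * A missing write cache is reported back to blkdev as ENOTSUP, while a
 * cache that is present but disabled completes the request successfully
 * without issuing a command to the device.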
4266 */ 4267 if (!ns->ns_nvme->n_write_cache_present) { 4268 bd_xfer_done(xfer, ENOTSUP); 4269 return (0); 4270 } 4271 4272 if (!ns->ns_nvme->n_write_cache_enabled) { 4273 bd_xfer_done(xfer, 0); 4274 return (0); 4275 } 4276 4277 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 4278 } 4279 4280 static int 4281 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 4282 { 4283 nvme_namespace_t *ns = arg; 4284 nvme_t *nvme = ns->ns_nvme; 4285 4286 if (nvme->n_dead) { 4287 return (EIO); 4288 } 4289 4290 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 4291 if (*(uint64_t *)ns->ns_eui64 != 0) { 4292 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 4293 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 4294 } else { 4295 return (ddi_devid_init(devinfo, DEVID_ENCAP, 4296 strlen(ns->ns_devid), ns->ns_devid, devid)); 4297 } 4298 } 4299 4300 static int 4301 nvme_bd_free_space(void *arg, bd_xfer_t *xfer) 4302 { 4303 nvme_namespace_t *ns = arg; 4304 4305 if (xfer->x_dfl == NULL) 4306 return (EINVAL); 4307 4308 if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt) 4309 return (ENOTSUP); 4310 4311 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT)); 4312 } 4313 4314 static int 4315 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 4316 { 4317 #ifndef __lock_lint 4318 _NOTE(ARGUNUSED(cred_p)); 4319 #endif 4320 minor_t minor = getminor(*devp); 4321 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4322 int nsid = NVME_MINOR_NSID(minor); 4323 nvme_minor_state_t *nm; 4324 int rv = 0; 4325 4326 if (otyp != OTYP_CHR) 4327 return (EINVAL); 4328 4329 if (nvme == NULL) 4330 return (ENXIO); 4331 4332 if (nsid > nvme->n_namespace_count) 4333 return (ENXIO); 4334 4335 if (nvme->n_dead) 4336 return (EIO); 4337 4338 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 4339 4340 mutex_enter(&nm->nm_mutex); 4341 if (nm->nm_oexcl) { 4342 rv = EBUSY; 4343 goto out; 4344 } 4345 4346 if (flag & FEXCL) { 4347 if (nm->nm_ocnt != 0) { 4348 rv = EBUSY; 4349 goto out; 4350 } 4351 nm->nm_oexcl = B_TRUE; 4352 } 4353 4354 nm->nm_ocnt++; 4355 4356 out: 4357 mutex_exit(&nm->nm_mutex); 4358 return (rv); 4359 4360 } 4361 4362 static int 4363 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 4364 { 4365 #ifndef __lock_lint 4366 _NOTE(ARGUNUSED(cred_p)); 4367 _NOTE(ARGUNUSED(flag)); 4368 #endif 4369 minor_t minor = getminor(dev); 4370 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4371 int nsid = NVME_MINOR_NSID(minor); 4372 nvme_minor_state_t *nm; 4373 4374 if (otyp != OTYP_CHR) 4375 return (ENXIO); 4376 4377 if (nvme == NULL) 4378 return (ENXIO); 4379 4380 if (nsid > nvme->n_namespace_count) 4381 return (ENXIO); 4382 4383 nm = nsid == 0 ? 
&nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 4384 4385 mutex_enter(&nm->nm_mutex); 4386 if (nm->nm_oexcl) 4387 nm->nm_oexcl = B_FALSE; 4388 4389 ASSERT(nm->nm_ocnt > 0); 4390 nm->nm_ocnt--; 4391 mutex_exit(&nm->nm_mutex); 4392 4393 return (0); 4394 } 4395 4396 static int 4397 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4398 cred_t *cred_p) 4399 { 4400 _NOTE(ARGUNUSED(cred_p)); 4401 int rv = 0; 4402 void *idctl; 4403 4404 if ((mode & FREAD) == 0) 4405 return (EPERM); 4406 4407 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) 4408 return (EINVAL); 4409 4410 if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0) 4411 return (rv); 4412 4413 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) 4414 != 0) 4415 rv = EFAULT; 4416 4417 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 4418 4419 return (rv); 4420 } 4421 4422 /* 4423 * Execute commands on behalf of the various ioctls. 4424 */ 4425 static int 4426 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr, 4427 uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout) 4428 { 4429 nvme_cmd_t *cmd; 4430 nvme_qpair_t *ioq; 4431 int rv = 0; 4432 4433 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 4434 if (is_admin) { 4435 cmd->nc_sqid = 0; 4436 ioq = nvme->n_adminq; 4437 } else { 4438 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 4439 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4440 ioq = nvme->n_ioq[cmd->nc_sqid]; 4441 } 4442 4443 /* 4444 * This function is used to facilitate requests from 4445 * userspace, so don't panic if the command fails. This 4446 * is especially true for admin passthru commands, where 4447 * the actual command data structure is entirely defined 4448 * by userspace. 4449 */ 4450 cmd->nc_dontpanic = B_TRUE; 4451 4452 cmd->nc_callback = nvme_wakeup_cmd; 4453 cmd->nc_sqe = *sqe; 4454 4455 if ((rwk & (FREAD | FWRITE)) != 0) { 4456 if (data_addr == NULL) { 4457 rv = EINVAL; 4458 goto free_cmd; 4459 } 4460 4461 if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ, 4462 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 4463 dev_err(nvme->n_dip, CE_WARN, 4464 "!nvme_zalloc_dma failed for nvme_ioc_cmd()"); 4465 4466 rv = ENOMEM; 4467 goto free_cmd; 4468 } 4469 4470 if ((rv = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 4471 goto free_cmd; 4472 4473 if ((rwk & FWRITE) != 0) { 4474 if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp, 4475 data_len, rwk & FKIOCTL) != 0) { 4476 rv = EFAULT; 4477 goto free_cmd; 4478 } 4479 } 4480 } 4481 4482 if (is_admin) { 4483 nvme_admin_cmd(cmd, timeout); 4484 } else { 4485 mutex_enter(&cmd->nc_mutex); 4486 4487 rv = nvme_submit_io_cmd(ioq, cmd); 4488 4489 if (rv == EAGAIN) { 4490 mutex_exit(&cmd->nc_mutex); 4491 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 4492 "!nvme_ioc_cmd() failed, I/O Q full"); 4493 goto free_cmd; 4494 } 4495 4496 nvme_wait_cmd(cmd, timeout); 4497 4498 mutex_exit(&cmd->nc_mutex); 4499 } 4500 4501 if (cqe != NULL) 4502 *cqe = cmd->nc_cqe; 4503 4504 if ((rv = nvme_check_cmd_status(cmd)) != 0) { 4505 dev_err(nvme->n_dip, CE_WARN, 4506 "!nvme_ioc_cmd() failed with sct = %x, sc = %x", 4507 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 4508 4509 goto free_cmd; 4510 } 4511 4512 if ((rwk & FREAD) != 0) { 4513 if (ddi_copyout(cmd->nc_dma->nd_memp, 4514 data_addr, data_len, rwk & FKIOCTL) != 0) 4515 rv = EFAULT; 4516 } 4517 4518 free_cmd: 4519 nvme_free_cmd(cmd); 4520 4521 return (rv); 4522 } 4523 4524 static int 4525 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4526 int mode, cred_t 
*cred_p) 4527 { 4528 _NOTE(ARGUNUSED(nsid, cred_p)); 4529 int rv = 0; 4530 nvme_reg_cap_t cap = { 0 }; 4531 nvme_capabilities_t nc; 4532 4533 if ((mode & FREAD) == 0) 4534 return (EPERM); 4535 4536 if (nioc->n_len < sizeof (nc)) 4537 return (EINVAL); 4538 4539 cap.r = nvme_get64(nvme, NVME_REG_CAP); 4540 4541 /* 4542 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 4543 * specify the base page size of 4k (1<<12), so add 12 here to 4544 * get the real page size value. 4545 */ 4546 nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); 4547 nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); 4548 4549 if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) 4550 rv = EFAULT; 4551 4552 return (rv); 4553 } 4554 4555 static int 4556 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4557 int mode, cred_t *cred_p) 4558 { 4559 _NOTE(ARGUNUSED(cred_p)); 4560 void *log = NULL; 4561 size_t bufsize = 0; 4562 int rv = 0; 4563 4564 if ((mode & FREAD) == 0) 4565 return (EPERM); 4566 4567 switch (nioc->n_arg) { 4568 case NVME_LOGPAGE_ERROR: 4569 if (nsid != 0) 4570 return (EINVAL); 4571 break; 4572 case NVME_LOGPAGE_HEALTH: 4573 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) 4574 return (EINVAL); 4575 4576 if (nsid == 0) 4577 nsid = (uint32_t)-1; 4578 4579 break; 4580 case NVME_LOGPAGE_FWSLOT: 4581 if (nsid != 0) 4582 return (EINVAL); 4583 break; 4584 default: 4585 if (!NVME_IS_VENDOR_SPECIFIC_LOGPAGE(nioc->n_arg)) 4586 return (EINVAL); 4587 if (nioc->n_len > NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE) { 4588 dev_err(nvme->n_dip, CE_NOTE, "!Vendor-specific log " 4589 "page size exceeds device maximum supported size: " 4590 "%lu", NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE); 4591 return (EINVAL); 4592 } 4593 if (nioc->n_len == 0) 4594 return (EINVAL); 4595 bufsize = nioc->n_len; 4596 if (nsid == 0) 4597 nsid = (uint32_t)-1; 4598 } 4599 4600 if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid) 4601 != DDI_SUCCESS) 4602 return (EIO); 4603 4604 if (nioc->n_len < bufsize) { 4605 kmem_free(log, bufsize); 4606 return (EINVAL); 4607 } 4608 4609 if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) 4610 rv = EFAULT; 4611 4612 nioc->n_len = bufsize; 4613 kmem_free(log, bufsize); 4614 4615 return (rv); 4616 } 4617 4618 static int 4619 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4620 int mode, cred_t *cred_p) 4621 { 4622 _NOTE(ARGUNUSED(cred_p)); 4623 void *buf = NULL; 4624 size_t bufsize = 0; 4625 uint32_t res = 0; 4626 uint8_t feature; 4627 int rv = 0; 4628 4629 if ((mode & FREAD) == 0) 4630 return (EPERM); 4631 4632 if ((nioc->n_arg >> 32) > 0xff) 4633 return (EINVAL); 4634 4635 feature = (uint8_t)(nioc->n_arg >> 32); 4636 4637 switch (feature) { 4638 case NVME_FEAT_ARBITRATION: 4639 case NVME_FEAT_POWER_MGMT: 4640 case NVME_FEAT_ERROR: 4641 case NVME_FEAT_NQUEUES: 4642 case NVME_FEAT_INTR_COAL: 4643 case NVME_FEAT_WRITE_ATOM: 4644 case NVME_FEAT_ASYNC_EVENT: 4645 case NVME_FEAT_PROGRESS: 4646 if (nsid != 0) 4647 return (EINVAL); 4648 break; 4649 4650 case NVME_FEAT_TEMPERATURE: 4651 if (nsid != 0) 4652 return (EINVAL); 4653 res = nioc->n_arg & 0xffffffffUL; 4654 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) { 4655 nvme_temp_threshold_t tt; 4656 4657 tt.r = res; 4658 if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER && 4659 tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) { 4660 return (EINVAL); 4661 } 4662 4663 if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) { 4664 return (EINVAL); 4665 } 4666 } else if (res != 0) { 4667 return (EINVAL); 4668 } 4669 break; 4670 
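/*
 * The low 32 bits of the argument select the interrupt vector to query;
 * the value must be below the number of allocated interrupt vectors.
 */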
4671 case NVME_FEAT_INTR_VECT: 4672 if (nsid != 0) 4673 return (EINVAL); 4674 4675 res = nioc->n_arg & 0xffffffffUL; 4676 if (res >= nvme->n_intr_cnt) 4677 return (EINVAL); 4678 break; 4679 4680 case NVME_FEAT_LBA_RANGE: 4681 if (nvme->n_lba_range_supported == B_FALSE) 4682 return (EINVAL); 4683 4684 if (nsid == 0 || 4685 nsid > nvme->n_namespace_count) 4686 return (EINVAL); 4687 4688 break; 4689 4690 case NVME_FEAT_WRITE_CACHE: 4691 if (nsid != 0) 4692 return (EINVAL); 4693 4694 if (!nvme->n_write_cache_present) 4695 return (EINVAL); 4696 4697 break; 4698 4699 case NVME_FEAT_AUTO_PST: 4700 if (nsid != 0) 4701 return (EINVAL); 4702 4703 if (!nvme->n_auto_pst_supported) 4704 return (EINVAL); 4705 4706 break; 4707 4708 default: 4709 return (EINVAL); 4710 } 4711 4712 rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf, 4713 &bufsize); 4714 if (rv != 0) 4715 return (rv); 4716 4717 if (nioc->n_len < bufsize) { 4718 kmem_free(buf, bufsize); 4719 return (EINVAL); 4720 } 4721 4722 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) 4723 rv = EFAULT; 4724 4725 kmem_free(buf, bufsize); 4726 nioc->n_arg = res; 4727 nioc->n_len = bufsize; 4728 4729 return (rv); 4730 } 4731 4732 static int 4733 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4734 cred_t *cred_p) 4735 { 4736 _NOTE(ARGUNUSED(nsid, mode, cred_p)); 4737 4738 if ((mode & FREAD) == 0) 4739 return (EPERM); 4740 4741 nioc->n_arg = nvme->n_intr_cnt; 4742 return (0); 4743 } 4744 4745 static int 4746 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4747 cred_t *cred_p) 4748 { 4749 _NOTE(ARGUNUSED(nsid, cred_p)); 4750 int rv = 0; 4751 4752 if ((mode & FREAD) == 0) 4753 return (EPERM); 4754 4755 if (nioc->n_len < sizeof (nvme->n_version)) 4756 return (ENOMEM); 4757 4758 if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, 4759 sizeof (nvme->n_version), mode) != 0) 4760 rv = EFAULT; 4761 4762 return (rv); 4763 } 4764 4765 static int 4766 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4767 cred_t *cred_p) 4768 { 4769 _NOTE(ARGUNUSED(mode)); 4770 nvme_format_nvm_t frmt = { 0 }; 4771 int c_nsid = nsid != 0 ? nsid - 1 : 0; 4772 4773 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4774 return (EPERM); 4775 4776 frmt.r = nioc->n_arg & 0xffffffff; 4777 4778 /* 4779 * Check whether the FORMAT NVM command is supported. 4780 */ 4781 if (nvme->n_idctl->id_oacs.oa_format == 0) 4782 return (EINVAL); 4783 4784 /* 4785 * Don't allow format or secure erase of individual namespace if that 4786 * would cause a format or secure erase of all namespaces. 4787 */ 4788 if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) 4789 return (EINVAL); 4790 4791 if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && 4792 nvme->n_idctl->id_fna.fn_sec_erase != 0) 4793 return (EINVAL); 4794 4795 /* 4796 * Don't allow formatting with Protection Information. 4797 */ 4798 if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) 4799 return (EINVAL); 4800 4801 /* 4802 * Don't allow formatting using an illegal LBA format, or any LBA format 4803 * that uses metadata. 4804 */ 4805 if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || 4806 nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) 4807 return (EINVAL); 4808 4809 /* 4810 * Don't allow formatting using an illegal Secure Erase setting. 
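 * Cryptographic erase is only allowed if the controller advertises
 * support for it (fn_crypt_erase).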
4811 */ 4812 if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || 4813 (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && 4814 nvme->n_idctl->id_fna.fn_crypt_erase == 0)) 4815 return (EINVAL); 4816 4817 if (nsid == 0) 4818 nsid = (uint32_t)-1; 4819 4820 return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0, 4821 B_FALSE, frmt.b.fm_ses)); 4822 } 4823 4824 static int 4825 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4826 cred_t *cred_p) 4827 { 4828 _NOTE(ARGUNUSED(nioc, mode)); 4829 int rv = 0; 4830 4831 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4832 return (EPERM); 4833 4834 if (nsid == 0) 4835 return (EINVAL); 4836 4837 if (nvme->n_ns[nsid - 1].ns_ignore) 4838 return (0); 4839 4840 rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); 4841 if (rv != DDI_SUCCESS) 4842 rv = EBUSY; 4843 4844 return (rv); 4845 } 4846 4847 static int 4848 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4849 cred_t *cred_p) 4850 { 4851 _NOTE(ARGUNUSED(nioc, mode)); 4852 nvme_identify_nsid_t *idns; 4853 int rv = 0; 4854 4855 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4856 return (EPERM); 4857 4858 if (nsid == 0) 4859 return (EINVAL); 4860 4861 /* 4862 * Identify namespace again, free old identify data. 4863 */ 4864 idns = nvme->n_ns[nsid - 1].ns_idns; 4865 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 4866 return (EIO); 4867 4868 kmem_free(idns, sizeof (nvme_identify_nsid_t)); 4869 4870 if (nvme->n_ns[nsid - 1].ns_ignore) 4871 return (ENOTSUP); 4872 4873 if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL) 4874 nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle( 4875 &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr, 4876 KM_SLEEP); 4877 4878 rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); 4879 if (rv != DDI_SUCCESS) 4880 rv = EBUSY; 4881 4882 return (rv); 4883 } 4884 4885 static void 4886 nvme_ufm_update(nvme_t *nvme) 4887 { 4888 mutex_enter(&nvme->n_fwslot_mutex); 4889 ddi_ufm_update(nvme->n_ufmh); 4890 if (nvme->n_fwslot != NULL) { 4891 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t)); 4892 nvme->n_fwslot = NULL; 4893 } 4894 mutex_exit(&nvme->n_fwslot_mutex); 4895 } 4896 4897 static int 4898 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4899 int mode, cred_t *cred_p) 4900 { 4901 int rv = 0; 4902 size_t len, copylen; 4903 offset_t offset; 4904 uintptr_t buf; 4905 nvme_sqe_t sqe = { 4906 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD 4907 }; 4908 4909 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4910 return (EPERM); 4911 4912 if (nsid != 0) 4913 return (EINVAL); 4914 4915 /* 4916 * The offset (in n_len) is restricted to the number of DWORDs in 4917 * 32 bits. 4918 */ 4919 if (nioc->n_len > NVME_FW_OFFSETB_MAX) 4920 return (EINVAL); 4921 4922 /* Confirm that both offset and length are a multiple of DWORD bytes */ 4923 if ((nioc->n_len & NVME_DWORD_MASK) != 0 || 4924 (nioc->n_arg & NVME_DWORD_MASK) != 0) 4925 return (EINVAL); 4926 4927 len = nioc->n_len; 4928 offset = nioc->n_arg; 4929 buf = (uintptr_t)nioc->n_buf; 4930 while (len > 0 && rv == 0) { 4931 /* 4932 * nvme_ioc_cmd() does not use SGLs or PRP lists. 4933 * It is limited to 2 PRPs per NVM command, so limit 4934 * the size of the data to 2 pages. 
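 * The image is therefore copied to the device in chunks of at most two
 * pages, advancing the DWORD offset in cdw11 for each chunk.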
4935 */ 4936 copylen = MIN(2 * nvme->n_pagesize, len); 4937 4938 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1; 4939 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT); 4940 4941 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen, 4942 FWRITE, NULL, nvme_admin_cmd_timeout); 4943 4944 buf += copylen; 4945 offset += copylen; 4946 len -= copylen; 4947 } 4948 4949 /* 4950 * Let the DDI UFM subsystem know that the firmware information for 4951 * this device has changed. 4952 */ 4953 nvme_ufm_update(nvme); 4954 4955 return (rv); 4956 } 4957 4958 static int 4959 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4960 int mode, cred_t *cred_p) 4961 { 4962 nvme_firmware_commit_dw10_t fc_dw10 = { 0 }; 4963 uint32_t slot = nioc->n_arg & 0xffffffff; 4964 uint32_t action = nioc->n_arg >> 32; 4965 nvme_cqe_t cqe = { 0 }; 4966 nvme_sqe_t sqe = { 4967 .sqe_opc = NVME_OPC_FW_ACTIVATE 4968 }; 4969 int timeout; 4970 int rv; 4971 4972 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4973 return (EPERM); 4974 4975 if (nsid != 0) 4976 return (EINVAL); 4977 4978 /* Validate slot is in range. */ 4979 if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX) 4980 return (EINVAL); 4981 4982 switch (action) { 4983 case NVME_FWC_SAVE: 4984 case NVME_FWC_SAVE_ACTIVATE: 4985 timeout = nvme_commit_save_cmd_timeout; 4986 break; 4987 case NVME_FWC_ACTIVATE: 4988 case NVME_FWC_ACTIVATE_IMMED: 4989 timeout = nvme_admin_cmd_timeout; 4990 break; 4991 default: 4992 return (EINVAL); 4993 } 4994 4995 fc_dw10.b.fc_slot = slot; 4996 fc_dw10.b.fc_action = action; 4997 sqe.sqe_cdw10 = fc_dw10.r; 4998 4999 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout); 5000 5001 nioc->n_arg = ((uint64_t)cqe.cqe_sf.sf_sct << 16) | cqe.cqe_sf.sf_sc; 5002 5003 /* 5004 * Let the DDI UFM subsystem know that the firmware information for 5005 * this device has changed. 5006 */ 5007 nvme_ufm_update(nvme); 5008 5009 return (rv); 5010 } 5011 5012 /* 5013 * Helper to copy in a passthru command from userspace, handling 5014 * different data models. 5015 */ 5016 static int 5017 nvme_passthru_copy_cmd_in(const void *buf, nvme_passthru_cmd_t *cmd, int mode) 5018 { 5019 #ifdef _MULTI_DATAMODEL 5020 switch (ddi_model_convert_from(mode & FMODELS)) { 5021 case DDI_MODEL_ILP32: { 5022 nvme_passthru_cmd32_t cmd32; 5023 if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0) 5024 return (-1); 5025 cmd->npc_opcode = cmd32.npc_opcode; 5026 cmd->npc_timeout = cmd32.npc_timeout; 5027 cmd->npc_flags = cmd32.npc_flags; 5028 cmd->npc_cdw12 = cmd32.npc_cdw12; 5029 cmd->npc_cdw13 = cmd32.npc_cdw13; 5030 cmd->npc_cdw14 = cmd32.npc_cdw14; 5031 cmd->npc_cdw15 = cmd32.npc_cdw15; 5032 cmd->npc_buflen = cmd32.npc_buflen; 5033 cmd->npc_buf = cmd32.npc_buf; 5034 break; 5035 } 5036 case DDI_MODEL_NONE: 5037 #endif 5038 if (ddi_copyin(buf, (void*)cmd, sizeof (nvme_passthru_cmd_t), 5039 mode) != 0) 5040 return (-1); 5041 #ifdef _MULTI_DATAMODEL 5042 break; 5043 } 5044 #endif 5045 return (0); 5046 } 5047 5048 /* 5049 * Helper to copy out a passthru command result to userspace, handling 5050 * different data models. 
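 * For ILP32 consumers the buffer pointer and length are narrowed to
 * their 32-bit representations.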
5051 */ 5052 static int 5053 nvme_passthru_copy_cmd_out(const nvme_passthru_cmd_t *cmd, void *buf, int mode) 5054 { 5055 #ifdef _MULTI_DATAMODEL 5056 switch (ddi_model_convert_from(mode & FMODELS)) { 5057 case DDI_MODEL_ILP32: { 5058 nvme_passthru_cmd32_t cmd32; 5059 bzero(&cmd32, sizeof (cmd32)); 5060 cmd32.npc_opcode = cmd->npc_opcode; 5061 cmd32.npc_status = cmd->npc_status; 5062 cmd32.npc_err = cmd->npc_err; 5063 cmd32.npc_timeout = cmd->npc_timeout; 5064 cmd32.npc_flags = cmd->npc_flags; 5065 cmd32.npc_cdw0 = cmd->npc_cdw0; 5066 cmd32.npc_cdw12 = cmd->npc_cdw12; 5067 cmd32.npc_cdw13 = cmd->npc_cdw13; 5068 cmd32.npc_cdw14 = cmd->npc_cdw14; 5069 cmd32.npc_cdw15 = cmd->npc_cdw15; 5070 cmd32.npc_buflen = (size32_t)cmd->npc_buflen; 5071 cmd32.npc_buf = (uintptr32_t)cmd->npc_buf; 5072 if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0) 5073 return (-1); 5074 break; 5075 } 5076 case DDI_MODEL_NONE: 5077 #endif 5078 if (ddi_copyout(cmd, buf, sizeof (nvme_passthru_cmd_t), 5079 mode) != 0) 5080 return (-1); 5081 #ifdef _MULTI_DATAMODEL 5082 break; 5083 } 5084 #endif 5085 return (0); 5086 } 5087 5088 /* 5089 * Run an arbitrary vendor-specific admin command on the device. 5090 */ 5091 static int 5092 nvme_ioctl_passthru(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 5093 cred_t *cred_p) 5094 { 5095 int rv = 0; 5096 uint_t timeout = 0; 5097 int rwk = 0; 5098 nvme_passthru_cmd_t cmd; 5099 size_t expected_passthru_size = 0; 5100 nvme_sqe_t sqe; 5101 nvme_cqe_t cqe; 5102 5103 bzero(&cmd, sizeof (cmd)); 5104 bzero(&sqe, sizeof (sqe)); 5105 bzero(&cqe, sizeof (cqe)); 5106 5107 /* 5108 * Basic checks: permissions, data model, argument size. 5109 */ 5110 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 5111 return (EPERM); 5112 5113 /* 5114 * Compute the expected size of the argument buffer 5115 */ 5116 #ifdef _MULTI_DATAMODEL 5117 switch (ddi_model_convert_from(mode & FMODELS)) { 5118 case DDI_MODEL_ILP32: 5119 expected_passthru_size = sizeof (nvme_passthru_cmd32_t); 5120 break; 5121 case DDI_MODEL_NONE: 5122 #endif 5123 expected_passthru_size = sizeof (nvme_passthru_cmd_t); 5124 #ifdef _MULTI_DATAMODEL 5125 break; 5126 } 5127 #endif 5128 5129 if (nioc->n_len != expected_passthru_size) { 5130 cmd.npc_err = NVME_PASSTHRU_ERR_CMD_SIZE; 5131 rv = EINVAL; 5132 goto out; 5133 } 5134 5135 /* 5136 * Ensure the device supports the standard vendor specific 5137 * admin command format. 5138 */ 5139 if (!nvme->n_idctl->id_nvscc.nv_spec) { 5140 cmd.npc_err = NVME_PASSTHRU_ERR_NOT_SUPPORTED; 5141 rv = ENOTSUP; 5142 goto out; 5143 } 5144 5145 if (nvme_passthru_copy_cmd_in((const void*)nioc->n_buf, &cmd, mode)) 5146 return (EFAULT); 5147 5148 if (!NVME_IS_VENDOR_SPECIFIC_CMD(cmd.npc_opcode)) { 5149 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_OPCODE; 5150 rv = EINVAL; 5151 goto out; 5152 } 5153 5154 /* 5155 * This restriction is not mandated by the spec, so future work 5156 * could relax this if it's necessary to support commands that both 5157 * read and write. 
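 * For now a command that sets both NVME_PASSTHRU_READ and
 * NVME_PASSTHRU_WRITE is rejected with NVME_PASSTHRU_ERR_READ_AND_WRITE.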
5158 */ 5159 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0 && 5160 (cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) { 5161 cmd.npc_err = NVME_PASSTHRU_ERR_READ_AND_WRITE; 5162 rv = EINVAL; 5163 goto out; 5164 } 5165 if (cmd.npc_timeout > nvme_vendor_specific_admin_cmd_max_timeout) { 5166 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_TIMEOUT; 5167 rv = EINVAL; 5168 goto out; 5169 } 5170 timeout = cmd.npc_timeout; 5171 5172 /* 5173 * Passed-thru command buffer verification: 5174 * - Size is multiple of DWords 5175 * - Non-null iff the length is non-zero 5176 * - Null if neither reading nor writing data. 5177 * - Non-null if reading or writing. 5178 * - Maximum buffer size. 5179 */ 5180 if ((cmd.npc_buflen % sizeof (uint32_t)) != 0) { 5181 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5182 rv = EINVAL; 5183 goto out; 5184 } 5185 if (((void*)cmd.npc_buf != NULL && cmd.npc_buflen == 0) || 5186 ((void*)cmd.npc_buf == NULL && cmd.npc_buflen != 0)) { 5187 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5188 rv = EINVAL; 5189 goto out; 5190 } 5191 if (cmd.npc_flags == 0 && (void*)cmd.npc_buf != NULL) { 5192 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5193 rv = EINVAL; 5194 goto out; 5195 } 5196 if ((cmd.npc_flags != 0) && ((void*)cmd.npc_buf == NULL)) { 5197 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5198 rv = EINVAL; 5199 goto out; 5200 } 5201 if (cmd.npc_buflen > nvme_vendor_specific_admin_cmd_size) { 5202 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5203 rv = EINVAL; 5204 goto out; 5205 } 5206 if ((cmd.npc_buflen >> NVME_DWORD_SHIFT) > UINT32_MAX) { 5207 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5208 rv = EINVAL; 5209 goto out; 5210 } 5211 5212 sqe.sqe_opc = cmd.npc_opcode; 5213 sqe.sqe_nsid = nsid; 5214 sqe.sqe_cdw10 = (uint32_t)(cmd.npc_buflen >> NVME_DWORD_SHIFT); 5215 sqe.sqe_cdw12 = cmd.npc_cdw12; 5216 sqe.sqe_cdw13 = cmd.npc_cdw13; 5217 sqe.sqe_cdw14 = cmd.npc_cdw14; 5218 sqe.sqe_cdw15 = cmd.npc_cdw15; 5219 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0) 5220 rwk = FREAD; 5221 else if ((cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) 5222 rwk = FWRITE; 5223 5224 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void*)cmd.npc_buf, 5225 cmd.npc_buflen, rwk, &cqe, timeout); 5226 cmd.npc_status = cqe.cqe_sf.sf_sc; 5227 cmd.npc_cdw0 = cqe.cqe_dw0; 5228 5229 out: 5230 if (nvme_passthru_copy_cmd_out(&cmd, (void*)nioc->n_buf, mode)) 5231 rv = EFAULT; 5232 return (rv); 5233 } 5234 5235 static int 5236 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 5237 int *rval_p) 5238 { 5239 #ifndef __lock_lint 5240 _NOTE(ARGUNUSED(rval_p)); 5241 #endif 5242 minor_t minor = getminor(dev); 5243 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 5244 int nsid = NVME_MINOR_NSID(minor); 5245 int rv = 0; 5246 nvme_ioctl_t nioc; 5247 5248 int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { 5249 NULL, 5250 nvme_ioctl_identify, 5251 nvme_ioctl_identify, 5252 nvme_ioctl_capabilities, 5253 nvme_ioctl_get_logpage, 5254 nvme_ioctl_get_features, 5255 nvme_ioctl_intr_cnt, 5256 nvme_ioctl_version, 5257 nvme_ioctl_format, 5258 nvme_ioctl_detach, 5259 nvme_ioctl_attach, 5260 nvme_ioctl_firmware_download, 5261 nvme_ioctl_firmware_commit, 5262 nvme_ioctl_passthru 5263 }; 5264 5265 if (nvme == NULL) 5266 return (ENXIO); 5267 5268 if (nsid > nvme->n_namespace_count) 5269 return (ENXIO); 5270 5271 if (IS_DEVCTL(cmd)) 5272 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 5273 5274 #ifdef _MULTI_DATAMODEL 5275 switch (ddi_model_convert_from(mode & FMODELS)) { 5276 case 
DDI_MODEL_ILP32: { 5277 nvme_ioctl32_t nioc32; 5278 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), 5279 mode) != 0) 5280 return (EFAULT); 5281 nioc.n_len = nioc32.n_len; 5282 nioc.n_buf = nioc32.n_buf; 5283 nioc.n_arg = nioc32.n_arg; 5284 break; 5285 } 5286 case DDI_MODEL_NONE: 5287 #endif 5288 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) 5289 != 0) 5290 return (EFAULT); 5291 #ifdef _MULTI_DATAMODEL 5292 break; 5293 } 5294 #endif 5295 5296 if (nvme->n_dead && cmd != NVME_IOC_DETACH) 5297 return (EIO); 5298 5299 5300 if (cmd == NVME_IOC_IDENTIFY_CTRL) { 5301 /* 5302 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and 5303 * attachment point nodes. 5304 */ 5305 nsid = 0; 5306 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { 5307 /* 5308 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it 5309 * will always return identify data for namespace 1. 5310 */ 5311 nsid = 1; 5312 } 5313 5314 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) 5315 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, 5316 cred_p); 5317 else 5318 rv = EINVAL; 5319 5320 #ifdef _MULTI_DATAMODEL 5321 switch (ddi_model_convert_from(mode & FMODELS)) { 5322 case DDI_MODEL_ILP32: { 5323 nvme_ioctl32_t nioc32; 5324 5325 nioc32.n_len = (size32_t)nioc.n_len; 5326 nioc32.n_buf = (uintptr32_t)nioc.n_buf; 5327 nioc32.n_arg = nioc.n_arg; 5328 5329 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), 5330 mode) != 0) 5331 return (EFAULT); 5332 break; 5333 } 5334 case DDI_MODEL_NONE: 5335 #endif 5336 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) 5337 != 0) 5338 return (EFAULT); 5339 #ifdef _MULTI_DATAMODEL 5340 break; 5341 } 5342 #endif 5343 5344 return (rv); 5345 } 5346 5347 /* 5348 * DDI UFM Callbacks 5349 */ 5350 static int 5351 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5352 ddi_ufm_image_t *img) 5353 { 5354 nvme_t *nvme = arg; 5355 5356 if (imgno != 0) 5357 return (EINVAL); 5358 5359 ddi_ufm_image_set_desc(img, "Firmware"); 5360 ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot); 5361 5362 return (0); 5363 } 5364 5365 /* 5366 * Fill out firmware slot information for the requested slot. The firmware 5367 * slot information is gathered by requesting the Firmware Slot Information log 5368 * page. The format of the page is described in section 5.10.1.3. 5369 * 5370 * We lazily cache the log page on the first call and then invalidate the cache 5371 * data after a successful firmware download or firmware commit command. 5372 * The cached data is protected by a mutex as the state can change 5373 * asynchronous to this callback. 
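 * Invalidation is done by nvme_ufm_update(), which frees the cached
 * n_fwslot data while holding n_fwslot_mutex.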
5374 */ 5375 static int 5376 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5377 uint_t slotno, ddi_ufm_slot_t *slot) 5378 { 5379 nvme_t *nvme = arg; 5380 void *log = NULL; 5381 size_t bufsize; 5382 ddi_ufm_attr_t attr = 0; 5383 char fw_ver[NVME_FWVER_SZ + 1]; 5384 int ret; 5385 5386 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 5387 return (EINVAL); 5388 5389 mutex_enter(&nvme->n_fwslot_mutex); 5390 if (nvme->n_fwslot == NULL) { 5391 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, 5392 NVME_LOGPAGE_FWSLOT, 0); 5393 if (ret != DDI_SUCCESS || 5394 bufsize != sizeof (nvme_fwslot_log_t)) { 5395 if (log != NULL) 5396 kmem_free(log, bufsize); 5397 mutex_exit(&nvme->n_fwslot_mutex); 5398 return (EIO); 5399 } 5400 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 5401 } 5402 5403 /* 5404 * NVMe numbers firmware slots starting at 1 5405 */ 5406 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 5407 attr |= DDI_UFM_ATTR_ACTIVE; 5408 5409 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 5410 attr |= DDI_UFM_ATTR_WRITEABLE; 5411 5412 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 5413 attr |= DDI_UFM_ATTR_EMPTY; 5414 } else { 5415 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 5416 NVME_FWVER_SZ); 5417 fw_ver[NVME_FWVER_SZ] = '\0'; 5418 ddi_ufm_slot_set_version(slot, fw_ver); 5419 } 5420 mutex_exit(&nvme->n_fwslot_mutex); 5421 5422 ddi_ufm_slot_set_attrs(slot, attr); 5423 5424 return (0); 5425 } 5426 5427 static int 5428 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 5429 { 5430 *caps = DDI_UFM_CAP_REPORT; 5431 return (0); 5432 } 5433