1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved. 14 * Copyright 2020 Joyent, Inc. 15 * Copyright 2020 Racktop Systems. 16 * Copyright 2022 Oxide Computer Company. 17 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. 18 * Copyright 2022 Tintri by DDN, Inc. All rights reserved. 19 */ 20 21 /* 22 * blkdev driver for NVMe compliant storage devices 23 * 24 * This driver targets and is designed to support all NVMe 1.x devices. 25 * Features are added to the driver as we encounter devices that require them 26 * and our needs, so some commands or log pages may not take advantage of newer 27 * features that devices support at this time. When you encounter such a case, 28 * it is generally fine to add that support to the driver as long as you take 29 * care to ensure that the requisite device version is met before using it. 30 * 31 * The driver has only been tested on x86 systems and will not work on big- 32 * endian systems without changes to the code accessing registers and data 33 * structures used by the hardware. 34 * 35 * 36 * Interrupt Usage: 37 * 38 * The driver will use a single interrupt while configuring the device as the 39 * specification requires, but contrary to the specification it will try to use 40 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it 41 * will switch to multiple-message MSI(-X) if supported. The driver wants to 42 * have one interrupt vector per CPU, but it will work correctly if less are 43 * available. Interrupts can be shared by queues, the interrupt handler will 44 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only 45 * the admin queue will share an interrupt with one I/O queue. The interrupt 46 * handler will retrieve completed commands from all queues sharing an interrupt 47 * vector and will post them to a taskq for completion processing. 48 * 49 * 50 * Command Processing: 51 * 52 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up 53 * to 65536 I/O commands. The driver will configure one I/O queue pair per 54 * available interrupt vector, with the queue length usually much smaller than 55 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer 56 * interrupt vectors will be used. 57 * 58 * Additionally the hardware provides a single special admin queue pair that can 59 * hold up to 4096 admin commands. 60 * 61 * From the hardware perspective both queues of a queue pair are independent, 62 * but they share some driver state: the command array (holding pointers to 63 * commands currently being processed by the hardware) and the active command 64 * counter. Access to a submission queue and the shared state is protected by 65 * nq_mutex; completion queue is protected by ncq_mutex. 66 * 67 * When a command is submitted to a queue pair the active command counter is 68 * incremented and a pointer to the command is stored in the command array. The 69 * array index is used as command identifier (CID) in the submission queue 70 * entry. 
Some commands may take a very long time to complete, and if the queue 71 * wraps around in that time a submission may find the next array slot to still 72 * be used by a long-running command. In this case the array is sequentially 73 * searched for the next free slot. The length of the command array is the same 74 * as the configured queue length. Queue overrun is prevented by the semaphore, 75 * so a command submission may block if the queue is full. 76 * 77 * 78 * Polled I/O Support: 79 * 80 * For kernel core dump support the driver can do polled I/O. As interrupts are 81 * turned off while dumping the driver will just submit a command in the regular 82 * way, and then repeatedly attempt a command retrieval until it gets the 83 * command back. 84 * 85 * 86 * Namespace Support: 87 * 88 * NVMe devices can have multiple namespaces, each being a independent data 89 * store. The driver supports multiple namespaces and creates a blkdev interface 90 * for each namespace found. Namespaces can have various attributes to support 91 * protection information. This driver does not support any of this and ignores 92 * namespaces that have these attributes. 93 * 94 * As of NVMe 1.1 namespaces can have an 64bit Extended Unique Identifier 95 * (EUI64). This driver uses the EUI64 if present to generate the devid and 96 * passes it to blkdev to use it in the device node names. As this is currently 97 * untested namespaces with EUI64 are ignored by default. 98 * 99 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a 100 * single controller. This is an artificial limit imposed by the driver to be 101 * able to address a reasonable number of controllers and namespaces using a 102 * 32bit minor node number. 103 * 104 * 105 * Minor nodes: 106 * 107 * For each NVMe device the driver exposes one minor node for the controller and 108 * one minor node for each namespace. The only operations supported by those 109 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the 110 * interface for the nvmeadm(1M) utility. 111 * 112 * 113 * Blkdev Interface: 114 * 115 * This driver uses blkdev to do all the heavy lifting involved with presenting 116 * a disk device to the system. As a result, the processing of I/O requests is 117 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA 118 * setup, and splitting of transfers into manageable chunks. 119 * 120 * I/O requests coming in from blkdev are turned into NVM commands and posted to 121 * an I/O queue. The queue is selected by taking the CPU id modulo the number of 122 * queues. There is currently no timeout handling of I/O commands. 123 * 124 * Blkdev also supports querying device/media information and generating a 125 * devid. The driver reports the best block size as determined by the namespace 126 * format back to blkdev as physical block size to support partition and block 127 * alignment. The devid is either based on the namespace EUI64, if present, or 128 * composed using the device vendor ID, model number, serial number, and the 129 * namespace ID. 130 * 131 * 132 * Error Handling: 133 * 134 * Error handling is currently limited to detecting fatal hardware errors, 135 * either by asynchronous events, or synchronously through command status or 136 * admin command timeouts. In case of severe errors the device is fenced off, 137 * all further requests will return EIO. FMA is then called to fault the device. 138 * 139 * The hardware has a limit for outstanding asynchronous event requests. 
Before 140 * this limit is known the driver assumes it is at least 1 and posts a single 141 * asynchronous request. Later when the limit is known more asynchronous event 142 * requests are posted to allow quicker reception of error information. When an 143 * asynchronous event is posted by the hardware the driver will parse the error 144 * status fields and log information or fault the device, depending on the 145 * severity of the asynchronous event. The asynchronous event request is then 146 * reused and posted to the admin queue again. 147 * 148 * On command completion the command status is checked for errors. In case of 149 * errors indicating a driver bug the driver panics. Almost all other error 150 * status values just cause EIO to be returned. 151 * 152 * Command timeouts are currently detected for all admin commands except 153 * asynchronous event requests. If a command times out and the hardware appears 154 * to be healthy the driver attempts to abort the command. The original command 155 * timeout is also applied to the abort command. If the abort times out too the 156 * driver assumes the device to be dead, fences it off, and calls FMA to retire 157 * it. In all other cases the aborted command should return immediately with a 158 * status indicating it was aborted, and the driver will wait indefinitely for 159 * that to happen. No timeout handling of normal I/O commands is presently done. 160 * 161 * Any command that times out due to the controller dropping dead will be put on 162 * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA 163 * memory being reused by the system and later be written to by a "dead" NVMe 164 * controller. 165 * 166 * 167 * Locking: 168 * 169 * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held 170 * when accessing shared state and submission queue registers, ncq_mutex 171 * is held when accessing completion queue state and registers. 172 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while 173 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both 174 * mutexes themselves. 175 * 176 * Each command also has its own nc_mutex, which is associated with the 177 * condition variable nc_cv. It is only used on admin commands which are run 178 * synchronously. In that case it must be held across calls to 179 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by 180 * nvme_admin_cmd(). It must also be held whenever the completion state of the 181 * command is changed or while a admin command timeout is handled. 182 * 183 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first. 184 * More than one nc_mutex may only be held when aborting commands. In this case, 185 * the nc_mutex of the command to be aborted must be held across the call to 186 * nvme_abort_cmd() to prevent the command from completing while the abort is in 187 * progress. 188 * 189 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be 190 * acquired first. More than one nq_mutex is never held by a single thread. 191 * The ncq_mutex is only held by nvme_retrieve_cmd() and 192 * nvme_process_iocq(). nvme_process_iocq() is only called from the 193 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the 194 * mutex is non-contentious but is required for implementation completeness 195 * and safety. 196 * 197 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt 198 * and exclusive-open flag nm_oexcl. 
199 * 200 * 201 * Quiesce / Fast Reboot: 202 * 203 * The driver currently does not support fast reboot. A quiesce(9E) entry point 204 * is still provided which is used to send a shutdown notification to the 205 * device. 206 * 207 * 208 * NVMe Hotplug: 209 * 210 * The driver supports hot removal. The driver uses the NDI event framework 211 * to register a callback, nvme_remove_callback, to clean up when a disk is 212 * removed. In particular, the driver will unqueue outstanding I/O commands and 213 * set n_dead on the softstate to true so that other operations, such as ioctls 214 * and command submissions, fail as well. 215 * 216 * While the callback registration relies on the NDI event framework, the 217 * removal event itself is kicked off in the PCIe hotplug framework, when the 218 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a 219 * device was removed from the slot. 220 * 221 * The NVMe driver instance itself will remain until the final close of the 222 * device. 223 * 224 * 225 * DDI UFM Support 226 * 227 * The driver supports the DDI UFM framework for reporting information about 228 * the device's firmware image and slot configuration. This data can be 229 * queried by userland software via ioctls to the ufm driver. For more 230 * information, see ddi_ufm(9E). 231 * 232 * 233 * Driver Configuration: 234 * 235 * The following driver properties can be changed to control some aspects of the 236 * drivers operation: 237 * - strict-version: can be set to 0 to allow devices conforming to newer 238 * major versions to be used 239 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor 240 * specific command status as a fatal error leading device faulting 241 * - admin-queue-len: the maximum length of the admin queue (16-4096) 242 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536) 243 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536) 244 * - async-event-limit: the maximum number of asynchronous event requests to be 245 * posted by the driver 246 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write 247 * cache 248 * - min-phys-block-size: the minimum physical block size to report to blkdev, 249 * which is among other things the basis for ZFS vdev ashift 250 * - max-submission-queues: the maximum number of I/O submission queues. 251 * - max-completion-queues: the maximum number of I/O completion queues, 252 * can be less than max-submission-queues, in which case the completion 253 * queues are shared. 
254 * 255 * 256 * TODO: 257 * - figure out sane default for I/O queue depth reported to blkdev 258 * - FMA handling of media errors 259 * - support for devices supporting very large I/O requests using chained PRPs 260 * - support for configuring hardware parameters like interrupt coalescing 261 * - support for media formatting and hard partitioning into namespaces 262 * - support for big-endian systems 263 * - support for fast reboot 264 * - support for NVMe Subsystem Reset (1.1) 265 * - support for Scatter/Gather lists (1.1) 266 * - support for Reservations (1.1) 267 * - support for power management 268 */ 269 270 #include <sys/byteorder.h> 271 #ifdef _BIG_ENDIAN 272 #error nvme driver needs porting for big-endian platforms 273 #endif 274 275 #include <sys/modctl.h> 276 #include <sys/conf.h> 277 #include <sys/devops.h> 278 #include <sys/ddi.h> 279 #include <sys/ddi_ufm.h> 280 #include <sys/sunddi.h> 281 #include <sys/sunndi.h> 282 #include <sys/bitmap.h> 283 #include <sys/sysmacros.h> 284 #include <sys/param.h> 285 #include <sys/varargs.h> 286 #include <sys/cpuvar.h> 287 #include <sys/disp.h> 288 #include <sys/blkdev.h> 289 #include <sys/atomic.h> 290 #include <sys/archsystm.h> 291 #include <sys/sata/sata_hba.h> 292 #include <sys/stat.h> 293 #include <sys/policy.h> 294 #include <sys/list.h> 295 #include <sys/dkio.h> 296 297 #include <sys/nvme.h> 298 299 #ifdef __x86 300 #include <sys/x86_archext.h> 301 #endif 302 303 #include "nvme_reg.h" 304 #include "nvme_var.h" 305 306 /* 307 * Assertions to make sure that we've properly captured various aspects of the 308 * packed structures and haven't broken them during updates. 309 */ 310 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000); 311 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256); 312 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512); 313 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520); 314 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768); 315 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792); 316 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048); 317 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072); 318 319 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000); 320 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32); 321 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92); 322 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104); 323 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128); 324 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384); 325 326 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000); 327 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32); 328 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64); 329 330 331 /* NVMe spec version supported */ 332 static const int nvme_version_major = 1; 333 334 /* tunable for admin command timeout in seconds, default is 1s */ 335 int nvme_admin_cmd_timeout = 1; 336 337 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */ 338 int nvme_format_cmd_timeout = 600; 339 340 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */ 341 int nvme_commit_save_cmd_timeout = 15; 342 343 /* 344 * tunable for the size of arbitrary vendor specific admin commands, 345 * default is 16MiB. 346 */ 347 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24; 348 349 /* 350 * tunable for the max timeout of arbitary vendor specific admin commands, 351 * default is 60s. 
352 */ 353 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60; 354 355 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); 356 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); 357 static int nvme_quiesce(dev_info_t *); 358 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *); 359 static int nvme_setup_interrupts(nvme_t *, int, int); 360 static void nvme_release_interrupts(nvme_t *); 361 static uint_t nvme_intr(caddr_t, caddr_t); 362 363 static void nvme_shutdown(nvme_t *, int, boolean_t); 364 static boolean_t nvme_reset(nvme_t *, boolean_t); 365 static int nvme_init(nvme_t *); 366 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int); 367 static void nvme_free_cmd(nvme_cmd_t *); 368 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t, 369 bd_xfer_t *); 370 static void nvme_admin_cmd(nvme_cmd_t *, int); 371 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *); 372 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *); 373 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *); 374 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int); 375 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *); 376 static void nvme_wait_cmd(nvme_cmd_t *, uint_t); 377 static void nvme_wakeup_cmd(void *); 378 static void nvme_async_event_task(void *); 379 380 static int nvme_check_unknown_cmd_status(nvme_cmd_t *); 381 static int nvme_check_vendor_cmd_status(nvme_cmd_t *); 382 static int nvme_check_integrity_cmd_status(nvme_cmd_t *); 383 static int nvme_check_specific_cmd_status(nvme_cmd_t *); 384 static int nvme_check_generic_cmd_status(nvme_cmd_t *); 385 static inline int nvme_check_cmd_status(nvme_cmd_t *); 386 387 static int nvme_abort_cmd(nvme_cmd_t *, uint_t); 388 static void nvme_async_event(nvme_t *); 389 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t, 390 uint8_t, boolean_t, uint8_t); 391 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t, 392 ...); 393 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **); 394 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t, 395 uint32_t *); 396 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *, 397 void **, size_t *); 398 static int nvme_write_cache_set(nvme_t *, boolean_t); 399 static int nvme_set_nqueues(nvme_t *); 400 401 static void nvme_free_dma(nvme_dma_t *); 402 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *, 403 nvme_dma_t **); 404 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t, 405 nvme_dma_t **); 406 static void nvme_free_qpair(nvme_qpair_t *); 407 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t); 408 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t); 409 410 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t); 411 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t); 412 static inline uint64_t nvme_get64(nvme_t *, uintptr_t); 413 static inline uint32_t nvme_get32(nvme_t *, uintptr_t); 414 415 static boolean_t nvme_check_regs_hdl(nvme_t *); 416 static boolean_t nvme_check_dma_hdl(nvme_dma_t *); 417 418 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t); 419 420 static void nvme_bd_xfer_done(void *); 421 static void nvme_bd_driveinfo(void *, bd_drive_t *); 422 static int nvme_bd_mediainfo(void *, bd_media_t *); 423 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t); 424 static int nvme_bd_read(void *, bd_xfer_t *); 425 
static int nvme_bd_write(void *, bd_xfer_t *); 426 static int nvme_bd_sync(void *, bd_xfer_t *); 427 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *); 428 static int nvme_bd_free_space(void *, bd_xfer_t *); 429 430 static int nvme_prp_dma_constructor(void *, void *, int); 431 static void nvme_prp_dma_destructor(void *, void *); 432 433 static void nvme_prepare_devid(nvme_t *, uint32_t); 434 435 /* DDI UFM callbacks */ 436 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t, 437 ddi_ufm_image_t *); 438 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t, 439 ddi_ufm_slot_t *); 440 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *); 441 442 static int nvme_open(dev_t *, int, int, cred_t *); 443 static int nvme_close(dev_t, int, int, cred_t *); 444 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 445 446 static void nvme_changed_ns(nvme_t *, int); 447 448 static ddi_ufm_ops_t nvme_ufm_ops = { 449 NULL, 450 nvme_ufm_fill_image, 451 nvme_ufm_fill_slot, 452 nvme_ufm_getcaps 453 }; 454 455 #define NVME_MINOR_INST_SHIFT 9 456 #define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid)) 457 #define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT) 458 #define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1)) 459 #define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2) 460 #define NVME_IS_VENDOR_SPECIFIC_CMD(x) (((x) >= 0xC0) && ((x) <= 0xFF)) 461 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MIN 0xC0 462 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MAX 0xFF 463 #define NVME_IS_VENDOR_SPECIFIC_LOGPAGE(x) \ 464 (((x) >= NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) && \ 465 ((x) <= NVME_VENDOR_SPECIFIC_LOGPAGE_MAX)) 466 467 /* 468 * NVMe versions 1.3 and later actually support log pages up to UINT32_MAX 469 * DWords in size. However, revision 1.3 also modified the layout of the Get Log 470 * Page command significantly relative to version 1.2, including changing 471 * reserved bits, adding new bitfields, and requiring the use of command DWord 472 * 11 to fully specify the size of the log page (the lower and upper 16 bits of 473 * the number of DWords in the page are split between DWord 10 and DWord 11, 474 * respectively). 475 * 476 * All of these impose significantly different layout requirements on the 477 * `nvme_getlogpage_t` type. This could be solved with two different types, or a 478 * complicated/nested union with the two versions as the overlying members. Both 479 * of these are reasonable, if a bit convoluted. However, these is no current 480 * need for such large pages, or a way to test them, as most log pages actually 481 * fit within the current size limit. So for simplicity, we retain the size cap 482 * from version 1.2. 483 * 484 * Note that the number of DWords is zero-based, so we add 1. It is subtracted 485 * to form a zero-based value in `nvme_get_logpage`. 486 */ 487 #define NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE \ 488 (((1 << 12) + 1) * sizeof (uint32_t)) 489 490 static void *nvme_state; 491 static kmem_cache_t *nvme_cmd_cache; 492 493 /* 494 * DMA attributes for queue DMA memory 495 * 496 * Queue DMA memory must be page aligned. The maximum length of a queue is 497 * 65536 entries, and an entry can be 64 bytes long. 
498 */ 499 static ddi_dma_attr_t nvme_queue_dma_attr = { 500 .dma_attr_version = DMA_ATTR_V0, 501 .dma_attr_addr_lo = 0, 502 .dma_attr_addr_hi = 0xffffffffffffffffULL, 503 .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1, 504 .dma_attr_align = 0x1000, 505 .dma_attr_burstsizes = 0x7ff, 506 .dma_attr_minxfer = 0x1000, 507 .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), 508 .dma_attr_seg = 0xffffffffffffffffULL, 509 .dma_attr_sgllen = 1, 510 .dma_attr_granular = 1, 511 .dma_attr_flags = 0, 512 }; 513 514 /* 515 * DMA attributes for transfers using Physical Region Page (PRP) entries 516 * 517 * A PRP entry describes one page of DMA memory using the page size specified 518 * in the controller configuration's memory page size register (CC.MPS). It uses 519 * a 64bit base address aligned to this page size. There is no limitation on 520 * chaining PRPs together for arbitrarily large DMA transfers. 521 */ 522 static ddi_dma_attr_t nvme_prp_dma_attr = { 523 .dma_attr_version = DMA_ATTR_V0, 524 .dma_attr_addr_lo = 0, 525 .dma_attr_addr_hi = 0xffffffffffffffffULL, 526 .dma_attr_count_max = 0xfff, 527 .dma_attr_align = 0x1000, 528 .dma_attr_burstsizes = 0x7ff, 529 .dma_attr_minxfer = 0x1000, 530 .dma_attr_maxxfer = 0x1000, 531 .dma_attr_seg = 0xfff, 532 .dma_attr_sgllen = -1, 533 .dma_attr_granular = 1, 534 .dma_attr_flags = 0, 535 }; 536 537 /* 538 * DMA attributes for transfers using scatter/gather lists 539 * 540 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a 541 * 32bit length field. SGL Segment and SGL Last Segment entries require the 542 * length to be a multiple of 16 bytes. 543 */ 544 static ddi_dma_attr_t nvme_sgl_dma_attr = { 545 .dma_attr_version = DMA_ATTR_V0, 546 .dma_attr_addr_lo = 0, 547 .dma_attr_addr_hi = 0xffffffffffffffffULL, 548 .dma_attr_count_max = 0xffffffffUL, 549 .dma_attr_align = 1, 550 .dma_attr_burstsizes = 0x7ff, 551 .dma_attr_minxfer = 0x10, 552 .dma_attr_maxxfer = 0xfffffffffULL, 553 .dma_attr_seg = 0xffffffffffffffffULL, 554 .dma_attr_sgllen = -1, 555 .dma_attr_granular = 0x10, 556 .dma_attr_flags = 0 557 }; 558 559 static ddi_device_acc_attr_t nvme_reg_acc_attr = { 560 .devacc_attr_version = DDI_DEVICE_ATTR_V0, 561 .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, 562 .devacc_attr_dataorder = DDI_STRICTORDER_ACC 563 }; 564 565 static struct cb_ops nvme_cb_ops = { 566 .cb_open = nvme_open, 567 .cb_close = nvme_close, 568 .cb_strategy = nodev, 569 .cb_print = nodev, 570 .cb_dump = nodev, 571 .cb_read = nodev, 572 .cb_write = nodev, 573 .cb_ioctl = nvme_ioctl, 574 .cb_devmap = nodev, 575 .cb_mmap = nodev, 576 .cb_segmap = nodev, 577 .cb_chpoll = nochpoll, 578 .cb_prop_op = ddi_prop_op, 579 .cb_str = 0, 580 .cb_flag = D_NEW | D_MP, 581 .cb_rev = CB_REV, 582 .cb_aread = nodev, 583 .cb_awrite = nodev 584 }; 585 586 static struct dev_ops nvme_dev_ops = { 587 .devo_rev = DEVO_REV, 588 .devo_refcnt = 0, 589 .devo_getinfo = ddi_no_info, 590 .devo_identify = nulldev, 591 .devo_probe = nulldev, 592 .devo_attach = nvme_attach, 593 .devo_detach = nvme_detach, 594 .devo_reset = nodev, 595 .devo_cb_ops = &nvme_cb_ops, 596 .devo_bus_ops = NULL, 597 .devo_power = NULL, 598 .devo_quiesce = nvme_quiesce, 599 }; 600 601 static struct modldrv nvme_modldrv = { 602 .drv_modops = &mod_driverops, 603 .drv_linkinfo = "NVMe v1.1b", 604 .drv_dev_ops = &nvme_dev_ops 605 }; 606 607 static struct modlinkage nvme_modlinkage = { 608 .ml_rev = MODREV_1, 609 .ml_linkage = { &nvme_modldrv, NULL } 610 }; 611 612 static bd_ops_t nvme_bd_ops = { 613 
.o_version = BD_OPS_CURRENT_VERSION, 614 .o_drive_info = nvme_bd_driveinfo, 615 .o_media_info = nvme_bd_mediainfo, 616 .o_devid_init = nvme_bd_devid, 617 .o_sync_cache = nvme_bd_sync, 618 .o_read = nvme_bd_read, 619 .o_write = nvme_bd_write, 620 .o_free_space = nvme_bd_free_space, 621 }; 622 623 /* 624 * This list will hold commands that have timed out and couldn't be aborted. 625 * As we don't know what the hardware may still do with the DMA memory we can't 626 * free them, so we'll keep them forever on this list where we can easily look 627 * at them with mdb. 628 */ 629 static struct list nvme_lost_cmds; 630 static kmutex_t nvme_lc_mutex; 631 632 int 633 _init(void) 634 { 635 int error; 636 637 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1); 638 if (error != DDI_SUCCESS) 639 return (error); 640 641 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache", 642 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 643 644 mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL); 645 list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t), 646 offsetof(nvme_cmd_t, nc_list)); 647 648 bd_mod_init(&nvme_dev_ops); 649 650 error = mod_install(&nvme_modlinkage); 651 if (error != DDI_SUCCESS) { 652 ddi_soft_state_fini(&nvme_state); 653 mutex_destroy(&nvme_lc_mutex); 654 list_destroy(&nvme_lost_cmds); 655 bd_mod_fini(&nvme_dev_ops); 656 } 657 658 return (error); 659 } 660 661 int 662 _fini(void) 663 { 664 int error; 665 666 if (!list_is_empty(&nvme_lost_cmds)) 667 return (DDI_FAILURE); 668 669 error = mod_remove(&nvme_modlinkage); 670 if (error == DDI_SUCCESS) { 671 ddi_soft_state_fini(&nvme_state); 672 kmem_cache_destroy(nvme_cmd_cache); 673 mutex_destroy(&nvme_lc_mutex); 674 list_destroy(&nvme_lost_cmds); 675 bd_mod_fini(&nvme_dev_ops); 676 } 677 678 return (error); 679 } 680 681 int 682 _info(struct modinfo *modinfop) 683 { 684 return (mod_info(&nvme_modlinkage, modinfop)); 685 } 686 687 static inline void 688 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val) 689 { 690 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 691 692 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 693 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val); 694 } 695 696 static inline void 697 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val) 698 { 699 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 700 701 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 702 ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val); 703 } 704 705 static inline uint64_t 706 nvme_get64(nvme_t *nvme, uintptr_t reg) 707 { 708 uint64_t val; 709 710 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 711 712 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 713 val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); 714 715 return (val); 716 } 717 718 static inline uint32_t 719 nvme_get32(nvme_t *nvme, uintptr_t reg) 720 { 721 uint32_t val; 722 723 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 724 725 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 726 val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); 727 728 return (val); 729 } 730 731 static boolean_t 732 nvme_check_regs_hdl(nvme_t *nvme) 733 { 734 ddi_fm_error_t error; 735 736 ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); 737 738 if (error.fme_status != DDI_FM_OK) 739 return (B_TRUE); 740 741 return (B_FALSE); 742 } 743 744 static boolean_t 745 nvme_check_dma_hdl(nvme_dma_t *dma) 746 { 747 ddi_fm_error_t error; 748 749 if (dma == NULL) 750 return (B_FALSE); 751 752 ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); 753 754 if (error.fme_status != DDI_FM_OK) 755 
return (B_TRUE); 756 757 return (B_FALSE); 758 } 759 760 static void 761 nvme_free_dma_common(nvme_dma_t *dma) 762 { 763 if (dma->nd_dmah != NULL) 764 (void) ddi_dma_unbind_handle(dma->nd_dmah); 765 if (dma->nd_acch != NULL) 766 ddi_dma_mem_free(&dma->nd_acch); 767 if (dma->nd_dmah != NULL) 768 ddi_dma_free_handle(&dma->nd_dmah); 769 } 770 771 static void 772 nvme_free_dma(nvme_dma_t *dma) 773 { 774 nvme_free_dma_common(dma); 775 kmem_free(dma, sizeof (*dma)); 776 } 777 778 /* ARGSUSED */ 779 static void 780 nvme_prp_dma_destructor(void *buf, void *private) 781 { 782 nvme_dma_t *dma = (nvme_dma_t *)buf; 783 784 nvme_free_dma_common(dma); 785 } 786 787 static int 788 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma, 789 size_t len, uint_t flags, ddi_dma_attr_t *dma_attr) 790 { 791 if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, 792 &dma->nd_dmah) != DDI_SUCCESS) { 793 /* 794 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and 795 * the only other possible error is DDI_DMA_BADATTR which 796 * indicates a driver bug which should cause a panic. 797 */ 798 dev_err(nvme->n_dip, CE_PANIC, 799 "!failed to get DMA handle, check DMA attributes"); 800 return (DDI_FAILURE); 801 } 802 803 /* 804 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified 805 * or the flags are conflicting, which isn't the case here. 806 */ 807 (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, 808 DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, 809 &dma->nd_len, &dma->nd_acch); 810 811 if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, 812 dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, 813 &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { 814 dev_err(nvme->n_dip, CE_WARN, 815 "!failed to bind DMA memory"); 816 atomic_inc_32(&nvme->n_dma_bind_err); 817 nvme_free_dma_common(dma); 818 return (DDI_FAILURE); 819 } 820 821 return (DDI_SUCCESS); 822 } 823 824 static int 825 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, 826 ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) 827 { 828 nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); 829 830 if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) != 831 DDI_SUCCESS) { 832 *ret = NULL; 833 kmem_free(dma, sizeof (nvme_dma_t)); 834 return (DDI_FAILURE); 835 } 836 837 bzero(dma->nd_memp, dma->nd_len); 838 839 *ret = dma; 840 return (DDI_SUCCESS); 841 } 842 843 /* ARGSUSED */ 844 static int 845 nvme_prp_dma_constructor(void *buf, void *private, int flags) 846 { 847 nvme_dma_t *dma = (nvme_dma_t *)buf; 848 nvme_t *nvme = (nvme_t *)private; 849 850 dma->nd_dmah = NULL; 851 dma->nd_acch = NULL; 852 853 if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize, 854 DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) { 855 return (-1); 856 } 857 858 ASSERT(dma->nd_ncookie == 1); 859 860 dma->nd_cached = B_TRUE; 861 862 return (0); 863 } 864 865 static int 866 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, 867 uint_t flags, nvme_dma_t **dma) 868 { 869 uint32_t len = nentry * qe_len; 870 ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; 871 872 len = roundup(len, nvme->n_pagesize); 873 874 if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) 875 != DDI_SUCCESS) { 876 dev_err(nvme->n_dip, CE_WARN, 877 "!failed to get DMA memory for queue"); 878 goto fail; 879 } 880 881 if ((*dma)->nd_ncookie != 1) { 882 dev_err(nvme->n_dip, CE_WARN, 883 "!got too many cookies for queue DMA"); 884 goto fail; 885 } 886 887 return (DDI_SUCCESS); 888 889 fail: 890 if 
(*dma) { 891 nvme_free_dma(*dma); 892 *dma = NULL; 893 } 894 895 return (DDI_FAILURE); 896 } 897 898 static void 899 nvme_free_cq(nvme_cq_t *cq) 900 { 901 mutex_destroy(&cq->ncq_mutex); 902 903 if (cq->ncq_cmd_taskq != NULL) 904 taskq_destroy(cq->ncq_cmd_taskq); 905 906 if (cq->ncq_dma != NULL) 907 nvme_free_dma(cq->ncq_dma); 908 909 kmem_free(cq, sizeof (*cq)); 910 } 911 912 static void 913 nvme_free_qpair(nvme_qpair_t *qp) 914 { 915 int i; 916 917 mutex_destroy(&qp->nq_mutex); 918 sema_destroy(&qp->nq_sema); 919 920 if (qp->nq_sqdma != NULL) 921 nvme_free_dma(qp->nq_sqdma); 922 923 if (qp->nq_active_cmds > 0) 924 for (i = 0; i != qp->nq_nentry; i++) 925 if (qp->nq_cmd[i] != NULL) 926 nvme_free_cmd(qp->nq_cmd[i]); 927 928 if (qp->nq_cmd != NULL) 929 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); 930 931 kmem_free(qp, sizeof (nvme_qpair_t)); 932 } 933 934 /* 935 * Destroy the pre-allocated cq array, but only free individual completion 936 * queues from the given starting index. 937 */ 938 static void 939 nvme_destroy_cq_array(nvme_t *nvme, uint_t start) 940 { 941 uint_t i; 942 943 for (i = start; i < nvme->n_cq_count; i++) 944 if (nvme->n_cq[i] != NULL) 945 nvme_free_cq(nvme->n_cq[i]); 946 947 kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count); 948 } 949 950 static int 951 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, 952 uint_t nthr) 953 { 954 nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); 955 char name[64]; /* large enough for the taskq name */ 956 957 mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, 958 DDI_INTR_PRI(nvme->n_intr_pri)); 959 960 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), 961 DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS) 962 goto fail; 963 964 cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp; 965 cq->ncq_nentry = nentry; 966 cq->ncq_id = idx; 967 cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); 968 969 /* 970 * Each completion queue has its own command taskq. 971 */ 972 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", 973 ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); 974 975 cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, 976 TASKQ_PREPOPULATE); 977 978 if (cq->ncq_cmd_taskq == NULL) { 979 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " 980 "taskq for cq %u", idx); 981 goto fail; 982 } 983 984 *cqp = cq; 985 return (DDI_SUCCESS); 986 987 fail: 988 nvme_free_cq(cq); 989 *cqp = NULL; 990 991 return (DDI_FAILURE); 992 } 993 994 /* 995 * Create the n_cq array big enough to hold "ncq" completion queues. 996 * If the array already exists it will be re-sized (but only larger). 997 * The admin queue is included in this array, which boosts the 998 * max number of entries to UINT16_MAX + 1. 
999 */ 1000 static int 1001 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) 1002 { 1003 nvme_cq_t **cq; 1004 uint_t i, cq_count; 1005 1006 ASSERT3U(ncq, >, nvme->n_cq_count); 1007 1008 cq = nvme->n_cq; 1009 cq_count = nvme->n_cq_count; 1010 1011 nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP); 1012 nvme->n_cq_count = ncq; 1013 1014 for (i = 0; i < cq_count; i++) 1015 nvme->n_cq[i] = cq[i]; 1016 1017 for (; i < nvme->n_cq_count; i++) 1018 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != 1019 DDI_SUCCESS) 1020 goto fail; 1021 1022 if (cq != NULL) 1023 kmem_free(cq, sizeof (*cq) * cq_count); 1024 1025 return (DDI_SUCCESS); 1026 1027 fail: 1028 nvme_destroy_cq_array(nvme, cq_count); 1029 /* 1030 * Restore the original array 1031 */ 1032 nvme->n_cq_count = cq_count; 1033 nvme->n_cq = cq; 1034 1035 return (DDI_FAILURE); 1036 } 1037 1038 static int 1039 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, 1040 uint_t idx) 1041 { 1042 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); 1043 uint_t cq_idx; 1044 1045 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, 1046 DDI_INTR_PRI(nvme->n_intr_pri)); 1047 1048 /* 1049 * The NVMe spec defines that a full queue has one empty (unused) slot; 1050 * initialize the semaphore accordingly. 1051 */ 1052 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL); 1053 1054 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), 1055 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) 1056 goto fail; 1057 1058 /* 1059 * idx == 0 is adminq, those above 0 are shared io completion queues. 1060 */ 1061 cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1); 1062 qp->nq_cq = nvme->n_cq[cq_idx]; 1063 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; 1064 qp->nq_nentry = nentry; 1065 1066 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); 1067 1068 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); 1069 qp->nq_next_cmd = 0; 1070 1071 *nqp = qp; 1072 return (DDI_SUCCESS); 1073 1074 fail: 1075 nvme_free_qpair(qp); 1076 *nqp = NULL; 1077 1078 return (DDI_FAILURE); 1079 } 1080 1081 static nvme_cmd_t * 1082 nvme_alloc_cmd(nvme_t *nvme, int kmflag) 1083 { 1084 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); 1085 1086 if (cmd == NULL) 1087 return (cmd); 1088 1089 bzero(cmd, sizeof (nvme_cmd_t)); 1090 1091 cmd->nc_nvme = nvme; 1092 1093 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, 1094 DDI_INTR_PRI(nvme->n_intr_pri)); 1095 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); 1096 1097 return (cmd); 1098 } 1099 1100 static void 1101 nvme_free_cmd(nvme_cmd_t *cmd) 1102 { 1103 /* Don't free commands on the lost commands list. 
*/ 1104 if (list_link_active(&cmd->nc_list)) 1105 return; 1106 1107 if (cmd->nc_dma) { 1108 nvme_free_dma(cmd->nc_dma); 1109 cmd->nc_dma = NULL; 1110 } 1111 1112 if (cmd->nc_prp) { 1113 kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp); 1114 cmd->nc_prp = NULL; 1115 } 1116 1117 cv_destroy(&cmd->nc_cv); 1118 mutex_destroy(&cmd->nc_mutex); 1119 1120 kmem_cache_free(nvme_cmd_cache, cmd); 1121 } 1122 1123 static void 1124 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1125 { 1126 sema_p(&qp->nq_sema); 1127 nvme_submit_cmd_common(qp, cmd); 1128 } 1129 1130 static int 1131 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1132 { 1133 if (cmd->nc_nvme->n_dead) { 1134 return (EIO); 1135 } 1136 1137 if (sema_tryp(&qp->nq_sema) == 0) 1138 return (EAGAIN); 1139 1140 nvme_submit_cmd_common(qp, cmd); 1141 return (0); 1142 } 1143 1144 static void 1145 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1146 { 1147 nvme_reg_sqtdbl_t tail = { 0 }; 1148 1149 mutex_enter(&qp->nq_mutex); 1150 cmd->nc_completed = B_FALSE; 1151 1152 /* 1153 * Now that we hold the queue pair lock, we must check whether or not 1154 * the controller has been listed as dead (e.g. was removed due to 1155 * hotplug). This is necessary as otherwise we could race with 1156 * nvme_remove_callback(). Because this has not been enqueued, we don't 1157 * call nvme_unqueue_cmd(), which is why we must manually decrement the 1158 * semaphore. 1159 */ 1160 if (cmd->nc_nvme->n_dead) { 1161 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback, 1162 cmd, TQ_NOSLEEP, &cmd->nc_tqent); 1163 sema_v(&qp->nq_sema); 1164 mutex_exit(&qp->nq_mutex); 1165 return; 1166 } 1167 1168 /* 1169 * Try to insert the cmd into the active cmd array at the nq_next_cmd 1170 * slot. If the slot is already occupied advance to the next slot and 1171 * try again. This can happen for long running commands like async event 1172 * requests. 1173 */ 1174 while (qp->nq_cmd[qp->nq_next_cmd] != NULL) 1175 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1176 qp->nq_cmd[qp->nq_next_cmd] = cmd; 1177 1178 qp->nq_active_cmds++; 1179 1180 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; 1181 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); 1182 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, 1183 sizeof (nvme_sqe_t) * qp->nq_sqtail, 1184 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); 1185 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1186 1187 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; 1188 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); 1189 1190 mutex_exit(&qp->nq_mutex); 1191 } 1192 1193 static nvme_cmd_t * 1194 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid) 1195 { 1196 nvme_cmd_t *cmd; 1197 1198 ASSERT(mutex_owned(&qp->nq_mutex)); 1199 ASSERT3S(cid, <, qp->nq_nentry); 1200 1201 cmd = qp->nq_cmd[cid]; 1202 qp->nq_cmd[cid] = NULL; 1203 ASSERT3U(qp->nq_active_cmds, >, 0); 1204 qp->nq_active_cmds--; 1205 sema_v(&qp->nq_sema); 1206 1207 ASSERT3P(cmd, !=, NULL); 1208 ASSERT3P(cmd->nc_nvme, ==, nvme); 1209 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid); 1210 1211 return (cmd); 1212 } 1213 1214 /* 1215 * Get the command tied to the next completed cqe and bump along completion 1216 * queue head counter. 1217 */ 1218 static nvme_cmd_t * 1219 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq) 1220 { 1221 nvme_qpair_t *qp; 1222 nvme_cqe_t *cqe; 1223 nvme_cmd_t *cmd; 1224 1225 ASSERT(mutex_owned(&cq->ncq_mutex)); 1226 1227 cqe = &cq->ncq_cq[cq->ncq_head]; 1228 1229 /* Check phase tag of CQE. 
Hardware inverts it for new entries. */ 1230 if (cqe->cqe_sf.sf_p == cq->ncq_phase) 1231 return (NULL); 1232 1233 qp = nvme->n_ioq[cqe->cqe_sqid]; 1234 1235 mutex_enter(&qp->nq_mutex); 1236 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid); 1237 mutex_exit(&qp->nq_mutex); 1238 1239 ASSERT(cmd->nc_sqid == cqe->cqe_sqid); 1240 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); 1241 1242 qp->nq_sqhead = cqe->cqe_sqhd; 1243 1244 cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry; 1245 1246 /* Toggle phase on wrap-around. */ 1247 if (cq->ncq_head == 0) 1248 cq->ncq_phase = cq->ncq_phase ? 0 : 1; 1249 1250 return (cmd); 1251 } 1252 1253 /* 1254 * Process all completed commands on the io completion queue. 1255 */ 1256 static uint_t 1257 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) 1258 { 1259 nvme_reg_cqhdbl_t head = { 0 }; 1260 nvme_cmd_t *cmd; 1261 uint_t completed = 0; 1262 1263 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1264 DDI_SUCCESS) 1265 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1266 __func__); 1267 1268 mutex_enter(&cq->ncq_mutex); 1269 1270 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1271 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, 1272 TQ_NOSLEEP, &cmd->nc_tqent); 1273 1274 completed++; 1275 } 1276 1277 if (completed > 0) { 1278 /* 1279 * Update the completion queue head doorbell. 1280 */ 1281 head.b.cqhdbl_cqh = cq->ncq_head; 1282 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1283 } 1284 1285 mutex_exit(&cq->ncq_mutex); 1286 1287 return (completed); 1288 } 1289 1290 static nvme_cmd_t * 1291 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) 1292 { 1293 nvme_cq_t *cq = qp->nq_cq; 1294 nvme_reg_cqhdbl_t head = { 0 }; 1295 nvme_cmd_t *cmd; 1296 1297 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 1298 DDI_SUCCESS) 1299 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 1300 __func__); 1301 1302 mutex_enter(&cq->ncq_mutex); 1303 1304 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 1305 head.b.cqhdbl_cqh = cq->ncq_head; 1306 nvme_put32(nvme, cq->ncq_hdbl, head.r); 1307 } 1308 1309 mutex_exit(&cq->ncq_mutex); 1310 1311 return (cmd); 1312 } 1313 1314 static int 1315 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) 1316 { 1317 nvme_cqe_t *cqe = &cmd->nc_cqe; 1318 1319 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1320 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1321 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1322 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1323 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1324 1325 if (cmd->nc_xfer != NULL) 1326 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1327 1328 if (cmd->nc_nvme->n_strict_version) { 1329 cmd->nc_nvme->n_dead = B_TRUE; 1330 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1331 } 1332 1333 return (EIO); 1334 } 1335 1336 static int 1337 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd) 1338 { 1339 nvme_cqe_t *cqe = &cmd->nc_cqe; 1340 1341 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1342 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 1343 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 1344 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 1345 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 1346 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) { 1347 cmd->nc_nvme->n_dead = B_TRUE; 1348 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1349 } 1350 1351 return (EIO); 1352 } 1353 1354 static int 1355 nvme_check_integrity_cmd_status(nvme_cmd_t 
*cmd) 1356 { 1357 nvme_cqe_t *cqe = &cmd->nc_cqe; 1358 1359 switch (cqe->cqe_sf.sf_sc) { 1360 case NVME_CQE_SC_INT_NVM_WRITE: 1361 /* write fail */ 1362 /* TODO: post ereport */ 1363 if (cmd->nc_xfer != NULL) 1364 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1365 return (EIO); 1366 1367 case NVME_CQE_SC_INT_NVM_READ: 1368 /* read fail */ 1369 /* TODO: post ereport */ 1370 if (cmd->nc_xfer != NULL) 1371 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1372 return (EIO); 1373 1374 default: 1375 return (nvme_check_unknown_cmd_status(cmd)); 1376 } 1377 } 1378 1379 static int 1380 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 1381 { 1382 nvme_cqe_t *cqe = &cmd->nc_cqe; 1383 1384 switch (cqe->cqe_sf.sf_sc) { 1385 case NVME_CQE_SC_GEN_SUCCESS: 1386 return (0); 1387 1388 /* 1389 * Errors indicating a bug in the driver should cause a panic. 1390 */ 1391 case NVME_CQE_SC_GEN_INV_OPC: 1392 /* Invalid Command Opcode */ 1393 if (!cmd->nc_dontpanic) 1394 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1395 "programming error: invalid opcode in cmd %p", 1396 (void *)cmd); 1397 return (EINVAL); 1398 1399 case NVME_CQE_SC_GEN_INV_FLD: 1400 /* Invalid Field in Command */ 1401 if (!cmd->nc_dontpanic) 1402 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1403 "programming error: invalid field in cmd %p", 1404 (void *)cmd); 1405 return (EIO); 1406 1407 case NVME_CQE_SC_GEN_ID_CNFL: 1408 /* Command ID Conflict */ 1409 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1410 "cmd ID conflict in cmd %p", (void *)cmd); 1411 return (0); 1412 1413 case NVME_CQE_SC_GEN_INV_NS: 1414 /* Invalid Namespace or Format */ 1415 if (!cmd->nc_dontpanic) 1416 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 1417 "programming error: invalid NS/format in cmd %p", 1418 (void *)cmd); 1419 return (EINVAL); 1420 1421 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 1422 /* LBA Out Of Range */ 1423 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1424 "LBA out of range in cmd %p", (void *)cmd); 1425 return (0); 1426 1427 /* 1428 * Non-fatal errors, handle gracefully. 1429 */ 1430 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 1431 /* Data Transfer Error (DMA) */ 1432 /* TODO: post ereport */ 1433 atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); 1434 if (cmd->nc_xfer != NULL) 1435 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1436 return (EIO); 1437 1438 case NVME_CQE_SC_GEN_INTERNAL_ERR: 1439 /* 1440 * Internal Error. The spec (v1.0, section 4.5.1.2) says 1441 * detailed error information is returned as async event, 1442 * so we pretty much ignore the error here and handle it 1443 * in the async event handler. 1444 */ 1445 atomic_inc_32(&cmd->nc_nvme->n_internal_err); 1446 if (cmd->nc_xfer != NULL) 1447 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1448 return (EIO); 1449 1450 case NVME_CQE_SC_GEN_ABORT_REQUEST: 1451 /* 1452 * Command Abort Requested. This normally happens only when a 1453 * command times out. 1454 */ 1455 /* TODO: post ereport or change blkdev to handle this? 
*/ 1456 atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); 1457 return (ECANCELED); 1458 1459 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 1460 /* Command Aborted due to Power Loss Notification */ 1461 ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); 1462 cmd->nc_nvme->n_dead = B_TRUE; 1463 return (EIO); 1464 1465 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 1466 /* Command Aborted due to SQ Deletion */ 1467 atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); 1468 return (EIO); 1469 1470 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 1471 /* Capacity Exceeded */ 1472 atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); 1473 if (cmd->nc_xfer != NULL) 1474 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 1475 return (EIO); 1476 1477 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 1478 /* Namespace Not Ready */ 1479 atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); 1480 if (cmd->nc_xfer != NULL) 1481 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 1482 return (EIO); 1483 1484 default: 1485 return (nvme_check_unknown_cmd_status(cmd)); 1486 } 1487 } 1488 1489 static int 1490 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 1491 { 1492 nvme_cqe_t *cqe = &cmd->nc_cqe; 1493 1494 switch (cqe->cqe_sf.sf_sc) { 1495 case NVME_CQE_SC_SPC_INV_CQ: 1496 /* Completion Queue Invalid */ 1497 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 1498 atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); 1499 return (EINVAL); 1500 1501 case NVME_CQE_SC_SPC_INV_QID: 1502 /* Invalid Queue Identifier */ 1503 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1504 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 1505 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 1506 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1507 atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); 1508 return (EINVAL); 1509 1510 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 1511 /* Max Queue Size Exceeded */ 1512 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 1513 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1514 atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); 1515 return (EINVAL); 1516 1517 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 1518 /* Abort Command Limit Exceeded */ 1519 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 1520 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1521 "abort command limit exceeded in cmd %p", (void *)cmd); 1522 return (0); 1523 1524 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 1525 /* Async Event Request Limit Exceeded */ 1526 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 1527 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 1528 "async event request limit exceeded in cmd %p", 1529 (void *)cmd); 1530 return (0); 1531 1532 case NVME_CQE_SC_SPC_INV_INT_VECT: 1533 /* Invalid Interrupt Vector */ 1534 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 1535 atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); 1536 return (EINVAL); 1537 1538 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 1539 /* Invalid Log Page */ 1540 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 1541 atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); 1542 return (EINVAL); 1543 1544 case NVME_CQE_SC_SPC_INV_FORMAT: 1545 /* Invalid Format */ 1546 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); 1547 atomic_inc_32(&cmd->nc_nvme->n_inv_format); 1548 if (cmd->nc_xfer != NULL) 1549 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1550 return (EINVAL); 1551 1552 case NVME_CQE_SC_SPC_INV_Q_DEL: 1553 /* Invalid Queue Deletion */ 1554 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 1555 atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); 1556 return (EINVAL); 1557 1558 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 1559 /* Conflicting Attributes */ 1560 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 1561 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1562 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1563 atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); 1564 if (cmd->nc_xfer != NULL) 1565 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1566 return (EINVAL); 1567 1568 case NVME_CQE_SC_SPC_NVM_INV_PROT: 1569 /* Invalid Protection Information */ 1570 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 1571 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 1572 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1573 atomic_inc_32(&cmd->nc_nvme->n_inv_prot); 1574 if (cmd->nc_xfer != NULL) 1575 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1576 return (EINVAL); 1577 1578 case NVME_CQE_SC_SPC_NVM_READONLY: 1579 /* Write to Read Only Range */ 1580 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 1581 atomic_inc_32(&cmd->nc_nvme->n_readonly); 1582 if (cmd->nc_xfer != NULL) 1583 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 1584 return (EROFS); 1585 1586 case NVME_CQE_SC_SPC_INV_FW_SLOT: 1587 /* Invalid Firmware Slot */ 1588 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1589 return (EINVAL); 1590 1591 case NVME_CQE_SC_SPC_INV_FW_IMG: 1592 /* Invalid Firmware Image */ 1593 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1594 return (EINVAL); 1595 1596 case NVME_CQE_SC_SPC_FW_RESET: 1597 /* Conventional Reset Required */ 1598 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1599 return (0); 1600 1601 case NVME_CQE_SC_SPC_FW_NSSR: 1602 /* NVMe Subsystem Reset Required */ 1603 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1604 return (0); 1605 1606 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 1607 /* Activation Requires Reset */ 1608 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1609 return (0); 1610 1611 case NVME_CQE_SC_SPC_FW_MTFA: 1612 /* Activation Requires Maximum Time Violation */ 1613 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1614 return (EAGAIN); 1615 1616 case NVME_CQE_SC_SPC_FW_PROHIBITED: 1617 /* Activation Prohibited */ 1618 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 1619 return (EINVAL); 1620 1621 case NVME_CQE_SC_SPC_FW_OVERLAP: 1622 /* Overlapping Firmware Ranges */ 1623 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD); 1624 return (EINVAL); 1625 1626 default: 1627 return (nvme_check_unknown_cmd_status(cmd)); 1628 } 1629 } 1630 1631 static inline int 1632 nvme_check_cmd_status(nvme_cmd_t *cmd) 1633 { 1634 nvme_cqe_t *cqe = &cmd->nc_cqe; 1635 1636 /* 1637 * Take a shortcut if the controller is dead, or if 1638 * command status indicates no error. 
1639 */ 1640 if (cmd->nc_nvme->n_dead) 1641 return (EIO); 1642 1643 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1644 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 1645 return (0); 1646 1647 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 1648 return (nvme_check_generic_cmd_status(cmd)); 1649 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 1650 return (nvme_check_specific_cmd_status(cmd)); 1651 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 1652 return (nvme_check_integrity_cmd_status(cmd)); 1653 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 1654 return (nvme_check_vendor_cmd_status(cmd)); 1655 1656 return (nvme_check_unknown_cmd_status(cmd)); 1657 } 1658 1659 static int 1660 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec) 1661 { 1662 nvme_t *nvme = abort_cmd->nc_nvme; 1663 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1664 nvme_abort_cmd_t ac = { 0 }; 1665 int ret = 0; 1666 1667 sema_p(&nvme->n_abort_sema); 1668 1669 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid; 1670 ac.b.ac_sqid = abort_cmd->nc_sqid; 1671 1672 cmd->nc_sqid = 0; 1673 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 1674 cmd->nc_callback = nvme_wakeup_cmd; 1675 cmd->nc_sqe.sqe_cdw10 = ac.r; 1676 1677 /* 1678 * Send the ABORT to the hardware. The ABORT command will return _after_ 1679 * the aborted command has completed (aborted or otherwise), but since 1680 * we still hold the aborted command's mutex its callback hasn't been 1681 * processed yet. 1682 */ 1683 nvme_admin_cmd(cmd, sec); 1684 sema_v(&nvme->n_abort_sema); 1685 1686 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 1687 dev_err(nvme->n_dip, CE_WARN, 1688 "!ABORT failed with sct = %x, sc = %x", 1689 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 1690 atomic_inc_32(&nvme->n_abort_failed); 1691 } else { 1692 dev_err(nvme->n_dip, CE_WARN, 1693 "!ABORT of command %d/%d %ssuccessful", 1694 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid, 1695 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : ""); 1696 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0) 1697 atomic_inc_32(&nvme->n_cmd_aborted); 1698 } 1699 1700 nvme_free_cmd(cmd); 1701 return (ret); 1702 } 1703 1704 /* 1705 * nvme_wait_cmd -- wait for command completion or timeout 1706 * 1707 * In case of a serious error or a timeout of the abort command the hardware 1708 * will be declared dead and FMA will be notified. 1709 */ 1710 static void 1711 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec) 1712 { 1713 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC); 1714 nvme_t *nvme = cmd->nc_nvme; 1715 nvme_reg_csts_t csts; 1716 nvme_qpair_t *qp; 1717 1718 ASSERT(mutex_owned(&cmd->nc_mutex)); 1719 1720 while (!cmd->nc_completed) { 1721 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) 1722 break; 1723 } 1724 1725 if (cmd->nc_completed) 1726 return; 1727 1728 /* 1729 * The command timed out. 1730 * 1731 * Check controller for fatal status, any errors associated with the 1732 * register or DMA handle, or for a double timeout (abort command timed 1733 * out). If necessary log a warning and call FMA. 
1734 */ 1735 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 1736 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 1737 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 1738 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 1739 atomic_inc_32(&nvme->n_cmd_timeout); 1740 1741 if (csts.b.csts_cfs || 1742 nvme_check_regs_hdl(nvme) || 1743 nvme_check_dma_hdl(cmd->nc_dma) || 1744 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 1745 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1746 nvme->n_dead = B_TRUE; 1747 } else if (nvme_abort_cmd(cmd, sec) == 0) { 1748 /* 1749 * If the abort succeeded the command should complete 1750 * immediately with an appropriate status. 1751 */ 1752 while (!cmd->nc_completed) 1753 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 1754 1755 return; 1756 } 1757 1758 qp = nvme->n_ioq[cmd->nc_sqid]; 1759 1760 mutex_enter(&qp->nq_mutex); 1761 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 1762 mutex_exit(&qp->nq_mutex); 1763 1764 /* 1765 * As we don't know what the presumed dead hardware might still do with 1766 * the DMA memory, we'll put the command on the lost commands list if it 1767 * has any DMA memory. 1768 */ 1769 if (cmd->nc_dma != NULL) { 1770 mutex_enter(&nvme_lc_mutex); 1771 list_insert_head(&nvme_lost_cmds, cmd); 1772 mutex_exit(&nvme_lc_mutex); 1773 } 1774 } 1775 1776 static void 1777 nvme_wakeup_cmd(void *arg) 1778 { 1779 nvme_cmd_t *cmd = arg; 1780 1781 mutex_enter(&cmd->nc_mutex); 1782 cmd->nc_completed = B_TRUE; 1783 cv_signal(&cmd->nc_cv); 1784 mutex_exit(&cmd->nc_mutex); 1785 } 1786 1787 static void 1788 nvme_async_event_task(void *arg) 1789 { 1790 nvme_cmd_t *cmd = arg; 1791 nvme_t *nvme = cmd->nc_nvme; 1792 nvme_error_log_entry_t *error_log = NULL; 1793 nvme_health_log_t *health_log = NULL; 1794 nvme_nschange_list_t *nslist = NULL; 1795 size_t logsize = 0; 1796 nvme_async_event_t event; 1797 1798 /* 1799 * Check for errors associated with the async request itself. The only 1800 * command-specific error is "async event limit exceeded", which 1801 * indicates a programming error in the driver and causes a panic in 1802 * nvme_check_cmd_status(). 1803 * 1804 * Other possible errors are various scenarios where the async request 1805 * was aborted, or internal errors in the device. Internal errors are 1806 * reported to FMA, the command aborts need no special handling here. 1807 * 1808 * And finally, at least qemu nvme does not support async events, 1809 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 1810 * will avoid posting async events. 1811 */ 1812 1813 if (nvme_check_cmd_status(cmd) != 0) { 1814 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 1815 "!async event request returned failure, sct = %x, " 1816 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 1817 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 1818 cmd->nc_cqe.cqe_sf.sf_m); 1819 1820 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1821 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 1822 cmd->nc_nvme->n_dead = B_TRUE; 1823 ddi_fm_service_impact(cmd->nc_nvme->n_dip, 1824 DDI_SERVICE_LOST); 1825 } 1826 1827 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1828 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 1829 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 1830 nvme->n_async_event_supported = B_FALSE; 1831 } 1832 1833 nvme_free_cmd(cmd); 1834 return; 1835 } 1836 1837 event.r = cmd->nc_cqe.cqe_dw0; 1838 1839 /* Clear CQE and re-submit the async request. 
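 * Re-arming the request before the event is processed keeps an async
 * event request outstanding at the controller, so that further events
 * can still be reported while this one is handled.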
*/ 1840 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 1841 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 1842 1843 switch (event.b.ae_type) { 1844 case NVME_ASYNC_TYPE_ERROR: 1845 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 1846 (void) nvme_get_logpage(nvme, B_FALSE, 1847 (void **)&error_log, &logsize, event.b.ae_logpage); 1848 } else { 1849 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1850 "async event reply: %d", event.b.ae_logpage); 1851 atomic_inc_32(&nvme->n_wrong_logpage); 1852 } 1853 1854 switch (event.b.ae_info) { 1855 case NVME_ASYNC_ERROR_INV_SQ: 1856 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1857 "invalid submission queue"); 1858 return; 1859 1860 case NVME_ASYNC_ERROR_INV_DBL: 1861 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 1862 "invalid doorbell write value"); 1863 return; 1864 1865 case NVME_ASYNC_ERROR_DIAGFAIL: 1866 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 1867 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1868 nvme->n_dead = B_TRUE; 1869 atomic_inc_32(&nvme->n_diagfail_event); 1870 break; 1871 1872 case NVME_ASYNC_ERROR_PERSISTENT: 1873 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 1874 "device error"); 1875 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1876 nvme->n_dead = B_TRUE; 1877 atomic_inc_32(&nvme->n_persistent_event); 1878 break; 1879 1880 case NVME_ASYNC_ERROR_TRANSIENT: 1881 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 1882 "device error"); 1883 /* TODO: send ereport */ 1884 atomic_inc_32(&nvme->n_transient_event); 1885 break; 1886 1887 case NVME_ASYNC_ERROR_FW_LOAD: 1888 dev_err(nvme->n_dip, CE_WARN, 1889 "!firmware image load error"); 1890 atomic_inc_32(&nvme->n_fw_load_event); 1891 break; 1892 } 1893 break; 1894 1895 case NVME_ASYNC_TYPE_HEALTH: 1896 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 1897 (void) nvme_get_logpage(nvme, B_FALSE, 1898 (void **)&health_log, &logsize, event.b.ae_logpage, 1899 -1); 1900 } else { 1901 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 1902 "async event reply: %d", event.b.ae_logpage); 1903 atomic_inc_32(&nvme->n_wrong_logpage); 1904 } 1905 1906 switch (event.b.ae_info) { 1907 case NVME_ASYNC_HEALTH_RELIABILITY: 1908 dev_err(nvme->n_dip, CE_WARN, 1909 "!device reliability compromised"); 1910 /* TODO: send ereport */ 1911 atomic_inc_32(&nvme->n_reliability_event); 1912 break; 1913 1914 case NVME_ASYNC_HEALTH_TEMPERATURE: 1915 dev_err(nvme->n_dip, CE_WARN, 1916 "!temperature above threshold"); 1917 /* TODO: send ereport */ 1918 atomic_inc_32(&nvme->n_temperature_event); 1919 break; 1920 1921 case NVME_ASYNC_HEALTH_SPARE: 1922 dev_err(nvme->n_dip, CE_WARN, 1923 "!spare space below threshold"); 1924 /* TODO: send ereport */ 1925 atomic_inc_32(&nvme->n_spare_event); 1926 break; 1927 } 1928 break; 1929 1930 case NVME_ASYNC_TYPE_NOTICE: 1931 switch (event.b.ae_info) { 1932 case NVME_ASYNC_NOTICE_NS_CHANGE: 1933 dev_err(nvme->n_dip, CE_NOTE, 1934 "namespace attribute change event, " 1935 "logpage = %x", event.b.ae_logpage); 1936 atomic_inc_32(&nvme->n_notice_event); 1937 1938 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) 1939 break; 1940 1941 if (nvme_get_logpage(nvme, B_FALSE, (void **)&nslist, 1942 &logsize, event.b.ae_logpage, -1) != 0) { 1943 break; 1944 } 1945 1946 if (nslist->nscl_ns[0] == UINT32_MAX) { 1947 dev_err(nvme->n_dip, CE_CONT, 1948 "more than %u namespaces have changed.\n", 1949 NVME_NSCHANGE_LIST_SIZE); 1950 break; 1951 } 1952 1953 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) { 1954 uint32_t nsid = nslist->nscl_ns[i]; 1955 1956 
if (nsid == 0) /* end of list */ 1957 break; 1958 nvme_changed_ns(nvme, nsid); 1959 } 1960 1961 break; 1962 1963 case NVME_ASYNC_NOTICE_FW_ACTIVATE: 1964 dev_err(nvme->n_dip, CE_NOTE, 1965 "firmware activation starting, " 1966 "logpage = %x", event.b.ae_logpage); 1967 atomic_inc_32(&nvme->n_notice_event); 1968 break; 1969 1970 case NVME_ASYNC_NOTICE_TELEMETRY: 1971 dev_err(nvme->n_dip, CE_NOTE, 1972 "telemetry log changed, " 1973 "logpage = %x", event.b.ae_logpage); 1974 atomic_inc_32(&nvme->n_notice_event); 1975 break; 1976 1977 case NVME_ASYNC_NOTICE_NS_ASYMM: 1978 dev_err(nvme->n_dip, CE_NOTE, 1979 "asymmetric namespace access change, " 1980 "logpage = %x", event.b.ae_logpage); 1981 atomic_inc_32(&nvme->n_notice_event); 1982 break; 1983 1984 case NVME_ASYNC_NOTICE_LATENCYLOG: 1985 dev_err(nvme->n_dip, CE_NOTE, 1986 "predictable latency event aggregate log change, " 1987 "logpage = %x", event.b.ae_logpage); 1988 atomic_inc_32(&nvme->n_notice_event); 1989 break; 1990 1991 case NVME_ASYNC_NOTICE_LBASTATUS: 1992 dev_err(nvme->n_dip, CE_NOTE, 1993 "LBA status information alert, " 1994 "logpage = %x", event.b.ae_logpage); 1995 atomic_inc_32(&nvme->n_notice_event); 1996 break; 1997 1998 case NVME_ASYNC_NOTICE_ENDURANCELOG: 1999 dev_err(nvme->n_dip, CE_NOTE, 2000 "endurance group event aggregate log page change, " 2001 "logpage = %x", event.b.ae_logpage); 2002 atomic_inc_32(&nvme->n_notice_event); 2003 break; 2004 2005 default: 2006 dev_err(nvme->n_dip, CE_WARN, 2007 "!unknown notice async event received, " 2008 "info = %x, logpage = %x", event.b.ae_info, 2009 event.b.ae_logpage); 2010 atomic_inc_32(&nvme->n_unknown_event); 2011 break; 2012 } 2013 break; 2014 2015 case NVME_ASYNC_TYPE_VENDOR: 2016 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 2017 "received, info = %x, logpage = %x", event.b.ae_info, 2018 event.b.ae_logpage); 2019 atomic_inc_32(&nvme->n_vendor_event); 2020 break; 2021 2022 default: 2023 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 2024 "type = %x, info = %x, logpage = %x", event.b.ae_type, 2025 event.b.ae_info, event.b.ae_logpage); 2026 atomic_inc_32(&nvme->n_unknown_event); 2027 break; 2028 } 2029 2030 if (error_log != NULL) 2031 kmem_free(error_log, logsize); 2032 2033 if (health_log != NULL) 2034 kmem_free(health_log, logsize); 2035 2036 if (nslist != NULL) 2037 kmem_free(nslist, logsize); 2038 } 2039 2040 static void 2041 nvme_admin_cmd(nvme_cmd_t *cmd, int sec) 2042 { 2043 mutex_enter(&cmd->nc_mutex); 2044 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd); 2045 nvme_wait_cmd(cmd, sec); 2046 mutex_exit(&cmd->nc_mutex); 2047 } 2048 2049 static void 2050 nvme_async_event(nvme_t *nvme) 2051 { 2052 nvme_cmd_t *cmd; 2053 2054 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2055 cmd->nc_sqid = 0; 2056 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 2057 cmd->nc_callback = nvme_async_event_task; 2058 cmd->nc_dontpanic = B_TRUE; 2059 2060 nvme_submit_admin_cmd(nvme->n_adminq, cmd); 2061 } 2062 2063 static int 2064 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf, 2065 boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses) 2066 { 2067 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2068 nvme_format_nvm_t format_nvm = { 0 }; 2069 int ret; 2070 2071 format_nvm.b.fm_lbaf = lbaf & 0xf; 2072 format_nvm.b.fm_ms = ms ? 1 : 0; 2073 format_nvm.b.fm_pi = pi & 0x7; 2074 format_nvm.b.fm_pil = pil ? 
1 : 0; 2075 format_nvm.b.fm_ses = ses & 0x7; 2076 2077 cmd->nc_sqid = 0; 2078 cmd->nc_callback = nvme_wakeup_cmd; 2079 cmd->nc_sqe.sqe_nsid = nsid; 2080 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 2081 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 2082 2083 /* 2084 * Some devices like Samsung SM951 don't allow formatting of all 2085 * namespaces in one command. Handle that gracefully. 2086 */ 2087 if (nsid == (uint32_t)-1) 2088 cmd->nc_dontpanic = B_TRUE; 2089 /* 2090 * If this format request was initiated by the user, then don't allow a 2091 * programmer error to panic the system. 2092 */ 2093 if (user) 2094 cmd->nc_dontpanic = B_TRUE; 2095 2096 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 2097 2098 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2099 dev_err(nvme->n_dip, CE_WARN, 2100 "!FORMAT failed with sct = %x, sc = %x", 2101 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2102 } 2103 2104 nvme_free_cmd(cmd); 2105 return (ret); 2106 } 2107 2108 /* 2109 * The `bufsize` parameter is usually an output parameter, set by this routine 2110 * when filling in the supported types of logpages from the device. However, for 2111 * vendor-specific pages, it is an input parameter, and must be set 2112 * appropriately by callers. 2113 */ 2114 static int 2115 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 2116 uint8_t logpage, ...) 2117 { 2118 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2119 nvme_getlogpage_t getlogpage = { 0 }; 2120 va_list ap; 2121 int ret; 2122 2123 va_start(ap, logpage); 2124 2125 cmd->nc_sqid = 0; 2126 cmd->nc_callback = nvme_wakeup_cmd; 2127 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 2128 2129 if (user) 2130 cmd->nc_dontpanic = B_TRUE; 2131 2132 getlogpage.b.lp_lid = logpage; 2133 2134 switch (logpage) { 2135 case NVME_LOGPAGE_ERROR: 2136 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2137 *bufsize = MIN(NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE, 2138 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); 2139 break; 2140 2141 case NVME_LOGPAGE_HEALTH: 2142 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2143 *bufsize = sizeof (nvme_health_log_t); 2144 break; 2145 2146 case NVME_LOGPAGE_FWSLOT: 2147 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2148 *bufsize = sizeof (nvme_fwslot_log_t); 2149 break; 2150 2151 case NVME_LOGPAGE_NSCHANGE: 2152 cmd->nc_sqe.sqe_nsid = (uint32_t)-1; 2153 *bufsize = sizeof (nvme_nschange_list_t); 2154 break; 2155 2156 default: 2157 /* 2158 * This intentionally only checks against the minimum valid 2159 * log page ID. `logpage` is a uint8_t, and `0xFF` is a valid 2160 * page ID, so this one-sided check avoids a compiler error 2161 * about a check that's always true. 
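 * For vendor specific pages (NVME_VENDOR_SPECIFIC_LOGPAGE_MIN and up)
 * the namespace ID is taken from the variable argument list and
 * *bufsize is used as an input that the caller must have set, as
 * noted in the comment above this function.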
2162 */ 2163 if (logpage < NVME_VENDOR_SPECIFIC_LOGPAGE_MIN) { 2164 dev_err(nvme->n_dip, CE_WARN, 2165 "!unknown log page requested: %d", logpage); 2166 atomic_inc_32(&nvme->n_unknown_logpage); 2167 ret = EINVAL; 2168 goto fail; 2169 } 2170 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); 2171 } 2172 2173 va_end(ap); 2174 2175 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; 2176 2177 cmd->nc_sqe.sqe_cdw10 = getlogpage.r; 2178 2179 if (nvme_zalloc_dma(nvme, *bufsize, 2180 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2181 dev_err(nvme->n_dip, CE_WARN, 2182 "!nvme_zalloc_dma failed for GET LOG PAGE"); 2183 ret = ENOMEM; 2184 goto fail; 2185 } 2186 2187 if ((ret = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 2188 goto fail; 2189 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2190 2191 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2192 dev_err(nvme->n_dip, CE_WARN, 2193 "!GET LOG PAGE failed with sct = %x, sc = %x", 2194 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2195 goto fail; 2196 } 2197 2198 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2199 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2200 2201 fail: 2202 nvme_free_cmd(cmd); 2203 2204 return (ret); 2205 } 2206 2207 static int 2208 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf) 2209 { 2210 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2211 int ret; 2212 2213 if (buf == NULL) 2214 return (EINVAL); 2215 2216 cmd->nc_sqid = 0; 2217 cmd->nc_callback = nvme_wakeup_cmd; 2218 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 2219 cmd->nc_sqe.sqe_nsid = nsid; 2220 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; 2221 2222 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 2223 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2224 dev_err(nvme->n_dip, CE_WARN, 2225 "!nvme_zalloc_dma failed for IDENTIFY"); 2226 ret = ENOMEM; 2227 goto fail; 2228 } 2229 2230 if (cmd->nc_dma->nd_ncookie > 2) { 2231 dev_err(nvme->n_dip, CE_WARN, 2232 "!too many DMA cookies for IDENTIFY"); 2233 atomic_inc_32(&nvme->n_too_many_cookies); 2234 ret = ENOMEM; 2235 goto fail; 2236 } 2237 2238 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 2239 if (cmd->nc_dma->nd_ncookie > 1) { 2240 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2241 &cmd->nc_dma->nd_cookie); 2242 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2243 cmd->nc_dma->nd_cookie.dmac_laddress; 2244 } 2245 2246 if (user) 2247 cmd->nc_dontpanic = B_TRUE; 2248 2249 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2250 2251 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2252 dev_err(nvme->n_dip, CE_WARN, 2253 "!IDENTIFY failed with sct = %x, sc = %x", 2254 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2255 goto fail; 2256 } 2257 2258 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 2259 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 2260 2261 fail: 2262 nvme_free_cmd(cmd); 2263 2264 return (ret); 2265 } 2266 2267 static int 2268 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2269 uint32_t val, uint32_t *res) 2270 { 2271 _NOTE(ARGUNUSED(nsid)); 2272 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2273 int ret = EINVAL; 2274 2275 ASSERT(res != NULL); 2276 2277 cmd->nc_sqid = 0; 2278 cmd->nc_callback = nvme_wakeup_cmd; 2279 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 2280 cmd->nc_sqe.sqe_cdw10 = feature; 2281 cmd->nc_sqe.sqe_cdw11 = val; 2282 2283 if (user) 2284 cmd->nc_dontpanic = B_TRUE; 2285 2286 switch (feature) { 2287 case NVME_FEAT_WRITE_CACHE: 2288 if 
(!nvme->n_write_cache_present) 2289 goto fail; 2290 break; 2291 2292 case NVME_FEAT_NQUEUES: 2293 break; 2294 2295 default: 2296 goto fail; 2297 } 2298 2299 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2300 2301 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2302 dev_err(nvme->n_dip, CE_WARN, 2303 "!SET FEATURES %d failed with sct = %x, sc = %x", 2304 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2305 cmd->nc_cqe.cqe_sf.sf_sc); 2306 goto fail; 2307 } 2308 2309 *res = cmd->nc_cqe.cqe_dw0; 2310 2311 fail: 2312 nvme_free_cmd(cmd); 2313 return (ret); 2314 } 2315 2316 static int 2317 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 2318 uint32_t *res, void **buf, size_t *bufsize) 2319 { 2320 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2321 int ret = EINVAL; 2322 2323 ASSERT(res != NULL); 2324 2325 if (bufsize != NULL) 2326 *bufsize = 0; 2327 2328 cmd->nc_sqid = 0; 2329 cmd->nc_callback = nvme_wakeup_cmd; 2330 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; 2331 cmd->nc_sqe.sqe_cdw10 = feature; 2332 cmd->nc_sqe.sqe_cdw11 = *res; 2333 2334 /* 2335 * For some of the optional features there doesn't seem to be a method 2336 * of detecting whether it is supported other than using it. This will 2337 * cause "Invalid Field in Command" error, which is normally considered 2338 * a programming error. Set the nc_dontpanic flag to override the panic 2339 * in nvme_check_generic_cmd_status(). 2340 */ 2341 switch (feature) { 2342 case NVME_FEAT_ARBITRATION: 2343 case NVME_FEAT_POWER_MGMT: 2344 case NVME_FEAT_TEMPERATURE: 2345 case NVME_FEAT_ERROR: 2346 case NVME_FEAT_NQUEUES: 2347 case NVME_FEAT_INTR_COAL: 2348 case NVME_FEAT_INTR_VECT: 2349 case NVME_FEAT_WRITE_ATOM: 2350 case NVME_FEAT_ASYNC_EVENT: 2351 break; 2352 2353 case NVME_FEAT_WRITE_CACHE: 2354 if (!nvme->n_write_cache_present) 2355 goto fail; 2356 break; 2357 2358 case NVME_FEAT_LBA_RANGE: 2359 if (!nvme->n_lba_range_supported) 2360 goto fail; 2361 2362 cmd->nc_dontpanic = B_TRUE; 2363 cmd->nc_sqe.sqe_nsid = nsid; 2364 ASSERT(bufsize != NULL); 2365 *bufsize = NVME_LBA_RANGE_BUFSIZE; 2366 break; 2367 2368 case NVME_FEAT_AUTO_PST: 2369 if (!nvme->n_auto_pst_supported) 2370 goto fail; 2371 2372 ASSERT(bufsize != NULL); 2373 *bufsize = NVME_AUTO_PST_BUFSIZE; 2374 break; 2375 2376 case NVME_FEAT_PROGRESS: 2377 if (!nvme->n_progress_supported) 2378 goto fail; 2379 2380 cmd->nc_dontpanic = B_TRUE; 2381 break; 2382 2383 default: 2384 goto fail; 2385 } 2386 2387 if (user) 2388 cmd->nc_dontpanic = B_TRUE; 2389 2390 if (bufsize != NULL && *bufsize != 0) { 2391 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 2392 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 2393 dev_err(nvme->n_dip, CE_WARN, 2394 "!nvme_zalloc_dma failed for GET FEATURES"); 2395 ret = ENOMEM; 2396 goto fail; 2397 } 2398 2399 if (cmd->nc_dma->nd_ncookie > 2) { 2400 dev_err(nvme->n_dip, CE_WARN, 2401 "!too many DMA cookies for GET FEATURES"); 2402 atomic_inc_32(&nvme->n_too_many_cookies); 2403 ret = ENOMEM; 2404 goto fail; 2405 } 2406 2407 cmd->nc_sqe.sqe_dptr.d_prp[0] = 2408 cmd->nc_dma->nd_cookie.dmac_laddress; 2409 if (cmd->nc_dma->nd_ncookie > 1) { 2410 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 2411 &cmd->nc_dma->nd_cookie); 2412 cmd->nc_sqe.sqe_dptr.d_prp[1] = 2413 cmd->nc_dma->nd_cookie.dmac_laddress; 2414 } 2415 } 2416 2417 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2418 2419 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2420 boolean_t known = B_TRUE; 2421 2422 /* Check if this is unsupported optional feature */ 2423 if (cmd->nc_cqe.cqe_sf.sf_sct == 
NVME_CQE_SCT_GENERIC && 2424 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) { 2425 switch (feature) { 2426 case NVME_FEAT_LBA_RANGE: 2427 nvme->n_lba_range_supported = B_FALSE; 2428 break; 2429 case NVME_FEAT_PROGRESS: 2430 nvme->n_progress_supported = B_FALSE; 2431 break; 2432 default: 2433 known = B_FALSE; 2434 break; 2435 } 2436 } else { 2437 known = B_FALSE; 2438 } 2439 2440 /* Report the error otherwise */ 2441 if (!known) { 2442 dev_err(nvme->n_dip, CE_WARN, 2443 "!GET FEATURES %d failed with sct = %x, sc = %x", 2444 feature, cmd->nc_cqe.cqe_sf.sf_sct, 2445 cmd->nc_cqe.cqe_sf.sf_sc); 2446 } 2447 2448 goto fail; 2449 } 2450 2451 if (bufsize != NULL && *bufsize != 0) { 2452 ASSERT(buf != NULL); 2453 *buf = kmem_alloc(*bufsize, KM_SLEEP); 2454 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 2455 } 2456 2457 *res = cmd->nc_cqe.cqe_dw0; 2458 2459 fail: 2460 nvme_free_cmd(cmd); 2461 return (ret); 2462 } 2463 2464 static int 2465 nvme_write_cache_set(nvme_t *nvme, boolean_t enable) 2466 { 2467 nvme_write_cache_t nwc = { 0 }; 2468 2469 if (enable) 2470 nwc.b.wc_wce = 1; 2471 2472 return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE, 2473 nwc.r, &nwc.r)); 2474 } 2475 2476 static int 2477 nvme_set_nqueues(nvme_t *nvme) 2478 { 2479 nvme_nqueues_t nq = { 0 }; 2480 int ret; 2481 2482 /* 2483 * The default is to allocate one completion queue per vector. 2484 */ 2485 if (nvme->n_completion_queues == -1) 2486 nvme->n_completion_queues = nvme->n_intr_cnt; 2487 2488 /* 2489 * There is no point in having more completion queues than 2490 * interrupt vectors. 2491 */ 2492 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2493 nvme->n_intr_cnt); 2494 2495 /* 2496 * The default is to use one submission queue per completion queue. 2497 */ 2498 if (nvme->n_submission_queues == -1) 2499 nvme->n_submission_queues = nvme->n_completion_queues; 2500 2501 /* 2502 * There is no point in having more completion queues than 2503 * submission queues. 2504 */ 2505 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2506 nvme->n_submission_queues); 2507 2508 ASSERT(nvme->n_submission_queues > 0); 2509 ASSERT(nvme->n_completion_queues > 0); 2510 2511 nq.b.nq_nsq = nvme->n_submission_queues - 1; 2512 nq.b.nq_ncq = nvme->n_completion_queues - 1; 2513 2514 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r, 2515 &nq.r); 2516 2517 if (ret == 0) { 2518 /* 2519 * Never use more than the requested number of queues.
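 * The controller reports the number of queues it actually allocated
 * (zero-based) in dw0 of the completion entry, which
 * nvme_set_features() returns through nq.r. For example, if 8
 * submission and 8 completion queues were requested but the
 * controller only grants 4 of each, nq_nsq + 1 == nq_ncq + 1 == 4 and
 * both driver counts are clamped to 4.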
2520 */ 2521 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 2522 nq.b.nq_nsq + 1); 2523 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 2524 nq.b.nq_ncq + 1); 2525 } 2526 2527 return (ret); 2528 } 2529 2530 static int 2531 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 2532 { 2533 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2534 nvme_create_queue_dw10_t dw10 = { 0 }; 2535 nvme_create_cq_dw11_t c_dw11 = { 0 }; 2536 int ret; 2537 2538 dw10.b.q_qid = cq->ncq_id; 2539 dw10.b.q_qsize = cq->ncq_nentry - 1; 2540 2541 c_dw11.b.cq_pc = 1; 2542 c_dw11.b.cq_ien = 1; 2543 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 2544 2545 cmd->nc_sqid = 0; 2546 cmd->nc_callback = nvme_wakeup_cmd; 2547 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 2548 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2549 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 2550 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 2551 2552 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2553 2554 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2555 dev_err(nvme->n_dip, CE_WARN, 2556 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2557 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2558 } 2559 2560 nvme_free_cmd(cmd); 2561 2562 return (ret); 2563 } 2564 2565 static int 2566 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 2567 { 2568 nvme_cq_t *cq = qp->nq_cq; 2569 nvme_cmd_t *cmd; 2570 nvme_create_queue_dw10_t dw10 = { 0 }; 2571 nvme_create_sq_dw11_t s_dw11 = { 0 }; 2572 int ret; 2573 2574 /* 2575 * It is possible to have more qpairs than completion queues, 2576 * and when the idx > ncq_id, that completion queue is shared 2577 * and has already been created. 2578 */ 2579 if (idx <= cq->ncq_id && 2580 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 2581 return (DDI_FAILURE); 2582 2583 dw10.b.q_qid = idx; 2584 dw10.b.q_qsize = qp->nq_nentry - 1; 2585 2586 s_dw11.b.sq_pc = 1; 2587 s_dw11.b.sq_cqid = cq->ncq_id; 2588 2589 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2590 cmd->nc_sqid = 0; 2591 cmd->nc_callback = nvme_wakeup_cmd; 2592 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2593 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2594 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2595 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2596 2597 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 2598 2599 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 2600 dev_err(nvme->n_dip, CE_WARN, 2601 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2602 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2603 } 2604 2605 nvme_free_cmd(cmd); 2606 2607 return (ret); 2608 } 2609 2610 static boolean_t 2611 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2612 { 2613 nvme_reg_csts_t csts; 2614 int i; 2615 2616 nvme_put32(nvme, NVME_REG_CC, 0); 2617 2618 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2619 if (csts.b.csts_rdy == 1) { 2620 nvme_put32(nvme, NVME_REG_CC, 0); 2621 for (i = 0; i != nvme->n_timeout * 10; i++) { 2622 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2623 if (csts.b.csts_rdy == 0) 2624 break; 2625 2626 if (quiesce) 2627 drv_usecwait(50000); 2628 else 2629 delay(drv_usectohz(50000)); 2630 } 2631 } 2632 2633 nvme_put32(nvme, NVME_REG_AQA, 0); 2634 nvme_put32(nvme, NVME_REG_ASQ, 0); 2635 nvme_put32(nvme, NVME_REG_ACQ, 0); 2636 2637 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2638 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 2639 } 2640 2641 static void 2642 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2643 { 2644 nvme_reg_cc_t cc; 2645 nvme_reg_csts_t csts; 2646 int i; 2647 2648 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2649 2650 cc.r = nvme_get32(nvme, NVME_REG_CC); 2651 cc.b.cc_shn = mode & 0x3; 2652 nvme_put32(nvme, NVME_REG_CC, cc.r); 2653 2654 for (i = 0; i != 10; i++) { 2655 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2656 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2657 break; 2658 2659 if (quiesce) 2660 drv_usecwait(100000); 2661 else 2662 delay(drv_usectohz(100000)); 2663 } 2664 } 2665 2666 2667 static void 2668 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2669 { 2670 /* 2671 * Section 7.7 of the spec describes how to get a unique ID for 2672 * the controller: the vendor ID, the model name and the serial 2673 * number shall be unique when combined. 2674 * 2675 * If a namespace has no EUI64 we use the above and add the hex 2676 * namespace ID to get a unique ID for the namespace. 2677 */ 2678 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2679 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2680 2681 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2682 bcopy(nvme->n_idctl->id_serial, serial, 2683 sizeof (nvme->n_idctl->id_serial)); 2684 2685 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2686 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2687 2688 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2689 nvme->n_idctl->id_vid, model, serial, nsid); 2690 } 2691 2692 static void 2693 nvme_changed_ns(nvme_t *nvme, int nsid) 2694 { 2695 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2696 nvme_identify_nsid_t *idns, *oidns; 2697 2698 dev_err(nvme->n_dip, CE_NOTE, "!namespace %u (%s) has changed.", 2699 nsid, ns->ns_name); 2700 2701 if (ns->ns_ignore) 2702 return; 2703 2704 /* 2705 * The namespace has changed in some way. At present, we only update 2706 * the device capacity and trigger blkdev to check the device state. 2707 */ 2708 2709 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2710 dev_err(nvme->n_dip, CE_WARN, 2711 "!failed to identify namespace %d", nsid); 2712 return; 2713 } 2714 2715 oidns = ns->ns_idns; 2716 ns->ns_idns = idns; 2717 kmem_free(oidns, sizeof (nvme_identify_nsid_t)); 2718 2719 ns->ns_block_count = idns->id_nsize; 2720 ns->ns_block_size = 2721 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2722 ns->ns_best_block_size = ns->ns_block_size; 2723 2724 bd_state_change(ns->ns_bd_hdl); 2725 } 2726 2727 static int 2728 nvme_init_ns(nvme_t *nvme, int nsid) 2729 { 2730 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2731 nvme_identify_nsid_t *idns; 2732 boolean_t was_ignored; 2733 int last_rp; 2734 2735 ns->ns_nvme = nvme; 2736 2737 if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { 2738 dev_err(nvme->n_dip, CE_WARN, 2739 "!failed to identify namespace %d", nsid); 2740 return (DDI_FAILURE); 2741 } 2742 2743 ns->ns_idns = idns; 2744 ns->ns_id = nsid; 2745 ns->ns_block_count = idns->id_nsize; 2746 ns->ns_block_size = 2747 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2748 ns->ns_best_block_size = ns->ns_block_size; 2749 2750 /* 2751 * Get the EUI64 if present. Use it for devid and device node names. 
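 * The EUI64 is rendered as 16 lower-case hex digits for the node
 * name, e.g. an (illustrative) EUI64 of 00 25 38 59 71 b0 c2 d1
 * becomes "0025385971b0c2d1". Namespaces without an EUI64 fall back
 * to the decimal namespace ID and the devid built by
 * nvme_prepare_devid().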
2752 */ 2753 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2754 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2755 2756 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2757 if (*(uint64_t *)ns->ns_eui64 != 0) { 2758 uint8_t *eui64 = ns->ns_eui64; 2759 2760 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2761 "%02x%02x%02x%02x%02x%02x%02x%02x", 2762 eui64[0], eui64[1], eui64[2], eui64[3], 2763 eui64[4], eui64[5], eui64[6], eui64[7]); 2764 } else { 2765 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2766 ns->ns_id); 2767 2768 nvme_prepare_devid(nvme, ns->ns_id); 2769 } 2770 2771 /* 2772 * Find the LBA format with no metadata and the best relative 2773 * performance. A value of 3 means "degraded", 0 is best. 2774 */ 2775 last_rp = 3; 2776 for (int j = 0; j <= idns->id_nlbaf; j++) { 2777 if (idns->id_lbaf[j].lbaf_lbads == 0) 2778 break; 2779 if (idns->id_lbaf[j].lbaf_ms != 0) 2780 continue; 2781 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2782 continue; 2783 last_rp = idns->id_lbaf[j].lbaf_rp; 2784 ns->ns_best_block_size = 2785 1 << idns->id_lbaf[j].lbaf_lbads; 2786 } 2787 2788 if (ns->ns_best_block_size < nvme->n_min_block_size) 2789 ns->ns_best_block_size = nvme->n_min_block_size; 2790 2791 was_ignored = ns->ns_ignore; 2792 2793 /* 2794 * We currently don't support namespaces that use either: 2795 * - protection information 2796 * - illegal block size (< 512) 2797 */ 2798 if (idns->id_dps.dp_pinfo) { 2799 dev_err(nvme->n_dip, CE_WARN, 2800 "!ignoring namespace %d, unsupported feature: " 2801 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 2802 ns->ns_ignore = B_TRUE; 2803 } else if (ns->ns_block_size < 512) { 2804 dev_err(nvme->n_dip, CE_WARN, 2805 "!ignoring namespace %d, unsupported block size %"PRIu64, 2806 nsid, (uint64_t)ns->ns_block_size); 2807 ns->ns_ignore = B_TRUE; 2808 } else { 2809 ns->ns_ignore = B_FALSE; 2810 } 2811 2812 /* 2813 * Keep a count of namespaces which are attachable. 2814 * See comments in nvme_bd_driveinfo() to understand its effect. 2815 */ 2816 if (was_ignored) { 2817 /* 2818 * Previously ignored, but now not. Count it. 2819 */ 2820 if (!ns->ns_ignore) 2821 nvme->n_namespaces_attachable++; 2822 } else { 2823 /* 2824 * Wasn't ignored previously, but now needs to be. 2825 * Discount it. 
2826 */ 2827 if (ns->ns_ignore) 2828 nvme->n_namespaces_attachable--; 2829 } 2830 2831 return (DDI_SUCCESS); 2832 } 2833 2834 static int 2835 nvme_init(nvme_t *nvme) 2836 { 2837 nvme_reg_cc_t cc = { 0 }; 2838 nvme_reg_aqa_t aqa = { 0 }; 2839 nvme_reg_asq_t asq = { 0 }; 2840 nvme_reg_acq_t acq = { 0 }; 2841 nvme_reg_cap_t cap; 2842 nvme_reg_vs_t vs; 2843 nvme_reg_csts_t csts; 2844 int i = 0; 2845 uint16_t nqueues; 2846 uint_t tq_threads; 2847 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2848 char *vendor, *product; 2849 2850 /* Check controller version */ 2851 vs.r = nvme_get32(nvme, NVME_REG_VS); 2852 nvme->n_version.v_major = vs.b.vs_mjr; 2853 nvme->n_version.v_minor = vs.b.vs_mnr; 2854 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 2855 nvme->n_version.v_major, nvme->n_version.v_minor); 2856 2857 if (nvme->n_version.v_major > nvme_version_major) { 2858 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 2859 nvme_version_major); 2860 if (nvme->n_strict_version) 2861 goto fail; 2862 } 2863 2864 /* retrieve controller configuration */ 2865 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2866 2867 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2868 dev_err(nvme->n_dip, CE_WARN, 2869 "!NVM command set not supported by hardware"); 2870 goto fail; 2871 } 2872 2873 nvme->n_nssr_supported = cap.b.cap_nssrs; 2874 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2875 nvme->n_timeout = cap.b.cap_to; 2876 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2877 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2878 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2879 2880 /* 2881 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2882 * the base page size of 4k (1<<12), so add 12 here to get the real 2883 * page size value. 2884 */ 2885 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2886 cap.b.cap_mpsmax + 12); 2887 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2888 2889 /* 2890 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 2891 */ 2892 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2893 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2894 2895 /* 2896 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2897 * Maxxfer may be increased after we identified the controller limits. 2898 */ 2899 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2900 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2901 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2902 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2903 2904 /* 2905 * Reset controller if it's still in ready state. 2906 */ 2907 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2908 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2909 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2910 nvme->n_dead = B_TRUE; 2911 goto fail; 2912 } 2913 2914 /* 2915 * Create the cq array with one completion queue to be assigned 2916 * to the admin queue pair and a limited number of taskqs (4). 2917 */ 2918 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 2919 DDI_SUCCESS) { 2920 dev_err(nvme->n_dip, CE_WARN, 2921 "!failed to pre-allocate admin completion queue"); 2922 goto fail; 2923 } 2924 /* 2925 * Create the admin queue pair. 
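 * Its base addresses and length are programmed into the ASQ, ACQ and
 * AQA registers below before the controller is enabled.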
2926 */ 2927 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2928 != DDI_SUCCESS) { 2929 dev_err(nvme->n_dip, CE_WARN, 2930 "!unable to allocate admin qpair"); 2931 goto fail; 2932 } 2933 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2934 nvme->n_ioq[0] = nvme->n_adminq; 2935 2936 nvme->n_progress |= NVME_ADMIN_QUEUE; 2937 2938 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2939 "admin-queue-len", nvme->n_admin_queue_len); 2940 2941 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 2942 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 2943 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 2944 2945 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 2946 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 2947 2948 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 2949 nvme_put64(nvme, NVME_REG_ASQ, asq); 2950 nvme_put64(nvme, NVME_REG_ACQ, acq); 2951 2952 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 2953 cc.b.cc_css = 0; /* use NVM command set */ 2954 cc.b.cc_mps = nvme->n_pageshift - 12; 2955 cc.b.cc_shn = 0; /* no shutdown in progress */ 2956 cc.b.cc_en = 1; /* enable controller */ 2957 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 2958 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 2959 2960 nvme_put32(nvme, NVME_REG_CC, cc.r); 2961 2962 /* 2963 * Wait for the controller to become ready. 2964 */ 2965 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2966 if (csts.b.csts_rdy == 0) { 2967 for (i = 0; i != nvme->n_timeout * 10; i++) { 2968 delay(drv_usectohz(50000)); 2969 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2970 2971 if (csts.b.csts_cfs == 1) { 2972 dev_err(nvme->n_dip, CE_WARN, 2973 "!controller fatal status at init"); 2974 ddi_fm_service_impact(nvme->n_dip, 2975 DDI_SERVICE_LOST); 2976 nvme->n_dead = B_TRUE; 2977 goto fail; 2978 } 2979 2980 if (csts.b.csts_rdy == 1) 2981 break; 2982 } 2983 } 2984 2985 if (csts.b.csts_rdy == 0) { 2986 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2987 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2988 nvme->n_dead = B_TRUE; 2989 goto fail; 2990 } 2991 2992 /* 2993 * Assume an abort command limit of 1. We'll destroy and re-init 2994 * that later when we know the true abort command limit. 2995 */ 2996 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2997 2998 /* 2999 * Set up initial interrupt for admin queue. 3000 */ 3001 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 3002 != DDI_SUCCESS) && 3003 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 3004 != DDI_SUCCESS) && 3005 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 3006 != DDI_SUCCESS)) { 3007 dev_err(nvme->n_dip, CE_WARN, 3008 "!failed to setup initial interrupt"); 3009 goto fail; 3010 } 3011 3012 /* 3013 * Post an asynchronous event command to catch errors. 3014 * We assume the asynchronous events are supported as required by 3015 * specification (Figure 40 in section 5 of NVMe 1.2). 3016 * However, since at least qemu does not follow the specification, 3017 * we need a mechanism to protect ourselves. 
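 * If an async event request is rejected with "invalid opcode" and DNR
 * set, nvme_async_event_task() clears n_async_event_supported and no
 * further such commands are posted.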
3018 */ 3019 nvme->n_async_event_supported = B_TRUE; 3020 nvme_async_event(nvme); 3021 3022 /* 3023 * Identify Controller 3024 */ 3025 if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) { 3026 dev_err(nvme->n_dip, CE_WARN, 3027 "!failed to identify controller"); 3028 goto fail; 3029 } 3030 3031 /* 3032 * Get Vendor & Product ID 3033 */ 3034 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 3035 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 3036 sata_split_model(model, &vendor, &product); 3037 3038 if (vendor == NULL) 3039 nvme->n_vendor = strdup("NVMe"); 3040 else 3041 nvme->n_vendor = strdup(vendor); 3042 3043 nvme->n_product = strdup(product); 3044 3045 /* 3046 * Get controller limits. 3047 */ 3048 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 3049 MIN(nvme->n_admin_queue_len / 10, 3050 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 3051 3052 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3053 "async-event-limit", nvme->n_async_event_limit); 3054 3055 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 3056 3057 /* 3058 * Reinitialize the semaphore with the true abort command limit 3059 * supported by the hardware. It's not necessary to disable interrupts 3060 * as only command aborts use the semaphore, and no commands are 3061 * executed or aborted while we're here. 3062 */ 3063 sema_destroy(&nvme->n_abort_sema); 3064 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 3065 SEMA_DRIVER, NULL); 3066 3067 nvme->n_progress |= NVME_CTRL_LIMITS; 3068 3069 if (nvme->n_idctl->id_mdts == 0) 3070 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 3071 else 3072 nvme->n_max_data_transfer_size = 3073 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 3074 3075 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 3076 3077 /* 3078 * Limit n_max_data_transfer_size to what we can handle in one PRP. 3079 * Chained PRPs are currently unsupported. 3080 * 3081 * This is a no-op on hardware which doesn't support a transfer size 3082 * big enough to require chained PRPs. 3083 */ 3084 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 3085 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 3086 3087 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 3088 3089 /* 3090 * Make sure the minimum/maximum queue entry sizes are not 3091 * larger/smaller than the default. 3092 */ 3093 3094 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 3095 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 3096 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 3097 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 3098 goto fail; 3099 3100 /* 3101 * Check for the presence of a Volatile Write Cache. If present, 3102 * enable or disable based on the value of the property 3103 * volatile-write-cache-enable (default is enabled). 3104 */ 3105 nvme->n_write_cache_present = 3106 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 3107 3108 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3109 "volatile-write-cache-present", 3110 nvme->n_write_cache_present ? 1 : 0); 3111 3112 if (!nvme->n_write_cache_present) { 3113 nvme->n_write_cache_enabled = B_FALSE; 3114 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 3115 != 0) { 3116 dev_err(nvme->n_dip, CE_WARN, 3117 "!failed to %sable volatile write cache", 3118 nvme->n_write_cache_enabled ? 
"en" : "dis"); 3119 /* 3120 * Assume the cache is (still) enabled. 3121 */ 3122 nvme->n_write_cache_enabled = B_TRUE; 3123 } 3124 3125 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 3126 "volatile-write-cache-enable", 3127 nvme->n_write_cache_enabled ? 1 : 0); 3128 3129 /* 3130 * Assume LBA Range Type feature is supported. If it isn't this 3131 * will be set to B_FALSE by nvme_get_features(). 3132 */ 3133 nvme->n_lba_range_supported = B_TRUE; 3134 3135 /* 3136 * Check support for Autonomous Power State Transition. 3137 */ 3138 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 3139 nvme->n_auto_pst_supported = 3140 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 3141 3142 /* 3143 * Assume Software Progress Marker feature is supported. If it isn't 3144 * this will be set to B_FALSE by nvme_get_features(). 3145 */ 3146 nvme->n_progress_supported = B_TRUE; 3147 3148 /* 3149 * Identify Namespaces 3150 */ 3151 nvme->n_namespace_count = nvme->n_idctl->id_nn; 3152 3153 if (nvme->n_namespace_count == 0) { 3154 dev_err(nvme->n_dip, CE_WARN, 3155 "!controllers without namespaces are not supported"); 3156 goto fail; 3157 } 3158 3159 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 3160 dev_err(nvme->n_dip, CE_WARN, 3161 "!too many namespaces: %d, limiting to %d\n", 3162 nvme->n_namespace_count, NVME_MINOR_MAX); 3163 nvme->n_namespace_count = NVME_MINOR_MAX; 3164 } 3165 3166 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 3167 nvme->n_namespace_count, KM_SLEEP); 3168 3169 for (i = 0; i != nvme->n_namespace_count; i++) { 3170 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, 3171 NULL); 3172 nvme->n_ns[i].ns_ignore = B_TRUE; 3173 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) 3174 goto fail; 3175 } 3176 3177 /* 3178 * Try to set up MSI/MSI-X interrupts. 3179 */ 3180 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 3181 != 0) { 3182 nvme_release_interrupts(nvme); 3183 3184 nqueues = MIN(UINT16_MAX, ncpus); 3185 3186 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 3187 nqueues) != DDI_SUCCESS) && 3188 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 3189 nqueues) != DDI_SUCCESS)) { 3190 dev_err(nvme->n_dip, CE_WARN, 3191 "!failed to setup MSI/MSI-X interrupts"); 3192 goto fail; 3193 } 3194 } 3195 3196 /* 3197 * Create I/O queue pairs. 3198 */ 3199 3200 if (nvme_set_nqueues(nvme) != 0) { 3201 dev_err(nvme->n_dip, CE_WARN, 3202 "!failed to set number of I/O queues to %d", 3203 nvme->n_intr_cnt); 3204 goto fail; 3205 } 3206 3207 /* 3208 * Reallocate I/O queue array 3209 */ 3210 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 3211 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 3212 (nvme->n_submission_queues + 1), KM_SLEEP); 3213 nvme->n_ioq[0] = nvme->n_adminq; 3214 3215 /* 3216 * There should always be at least as many submission queues 3217 * as completion queues. 3218 */ 3219 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 3220 3221 nvme->n_ioq_count = nvme->n_submission_queues; 3222 3223 nvme->n_io_squeue_len = 3224 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 3225 3226 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 3227 nvme->n_io_squeue_len); 3228 3229 /* 3230 * Pre-allocate completion queues. 3231 * When there are the same number of submission and completion 3232 * queues there is no value in having a larger completion 3233 * queue length. 
3234 */ 3235 if (nvme->n_submission_queues == nvme->n_completion_queues) 3236 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3237 nvme->n_io_squeue_len); 3238 3239 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 3240 nvme->n_max_queue_entries); 3241 3242 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len", 3243 nvme->n_io_cqueue_len); 3244 3245 /* 3246 * Assign the equal quantity of taskq threads to each completion 3247 * queue, capping the total number of threads to the number 3248 * of CPUs. 3249 */ 3250 tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues; 3251 3252 /* 3253 * In case the calculation above is zero, we need at least one 3254 * thread per completion queue. 3255 */ 3256 tq_threads = MAX(1, tq_threads); 3257 3258 if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1, 3259 nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) { 3260 dev_err(nvme->n_dip, CE_WARN, 3261 "!failed to pre-allocate completion queues"); 3262 goto fail; 3263 } 3264 3265 /* 3266 * If we use less completion queues than interrupt vectors return 3267 * some of the interrupt vectors back to the system. 3268 */ 3269 if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) { 3270 nvme_release_interrupts(nvme); 3271 3272 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 3273 nvme->n_completion_queues + 1) != DDI_SUCCESS) { 3274 dev_err(nvme->n_dip, CE_WARN, 3275 "!failed to reduce number of interrupts"); 3276 goto fail; 3277 } 3278 } 3279 3280 /* 3281 * Alloc & register I/O queue pairs 3282 */ 3283 3284 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3285 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len, 3286 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 3287 dev_err(nvme->n_dip, CE_WARN, 3288 "!unable to allocate I/O qpair %d", i); 3289 goto fail; 3290 } 3291 3292 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) { 3293 dev_err(nvme->n_dip, CE_WARN, 3294 "!unable to create I/O qpair %d", i); 3295 goto fail; 3296 } 3297 } 3298 3299 /* 3300 * Post more asynchronous events commands to reduce event reporting 3301 * latency as suggested by the spec. 3302 */ 3303 if (nvme->n_async_event_supported) { 3304 for (i = 1; i != nvme->n_async_event_limit; i++) 3305 nvme_async_event(nvme); 3306 } 3307 3308 return (DDI_SUCCESS); 3309 3310 fail: 3311 (void) nvme_reset(nvme, B_FALSE); 3312 return (DDI_FAILURE); 3313 } 3314 3315 static uint_t 3316 nvme_intr(caddr_t arg1, caddr_t arg2) 3317 { 3318 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 3319 nvme_t *nvme = (nvme_t *)arg1; 3320 int inum = (int)(uintptr_t)arg2; 3321 int ccnt = 0; 3322 int qnum; 3323 3324 if (inum >= nvme->n_intr_cnt) 3325 return (DDI_INTR_UNCLAIMED); 3326 3327 if (nvme->n_dead) 3328 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ? 3329 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); 3330 3331 /* 3332 * The interrupt vector a queue uses is calculated as queue_idx % 3333 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 3334 * in steps of n_intr_cnt to process all queues using this vector. 3335 */ 3336 for (qnum = inum; 3337 qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL; 3338 qnum += nvme->n_intr_cnt) { 3339 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]); 3340 } 3341 3342 return (ccnt > 0 ? 
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 3343 } 3344 3345 static void 3346 nvme_release_interrupts(nvme_t *nvme) 3347 { 3348 int i; 3349 3350 for (i = 0; i < nvme->n_intr_cnt; i++) { 3351 if (nvme->n_inth[i] == NULL) 3352 break; 3353 3354 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3355 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 3356 else 3357 (void) ddi_intr_disable(nvme->n_inth[i]); 3358 3359 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 3360 (void) ddi_intr_free(nvme->n_inth[i]); 3361 } 3362 3363 kmem_free(nvme->n_inth, nvme->n_inth_sz); 3364 nvme->n_inth = NULL; 3365 nvme->n_inth_sz = 0; 3366 3367 nvme->n_progress &= ~NVME_INTERRUPTS; 3368 } 3369 3370 static int 3371 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 3372 { 3373 int nintrs, navail, count; 3374 int ret; 3375 int i; 3376 3377 if (nvme->n_intr_types == 0) { 3378 ret = ddi_intr_get_supported_types(nvme->n_dip, 3379 &nvme->n_intr_types); 3380 if (ret != DDI_SUCCESS) { 3381 dev_err(nvme->n_dip, CE_WARN, 3382 "!%s: ddi_intr_get_supported types failed", 3383 __func__); 3384 return (ret); 3385 } 3386 #ifdef __x86 3387 if (get_hwenv() == HW_VMWARE) 3388 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 3389 #endif 3390 } 3391 3392 if ((nvme->n_intr_types & intr_type) == 0) 3393 return (DDI_FAILURE); 3394 3395 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 3396 if (ret != DDI_SUCCESS) { 3397 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 3398 __func__); 3399 return (ret); 3400 } 3401 3402 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 3403 if (ret != DDI_SUCCESS) { 3404 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 3405 __func__); 3406 return (ret); 3407 } 3408 3409 /* We want at most one interrupt per queue pair. 
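 * For example, if 16 queue pairs are requested and 32 vectors are
 * available, only 16 are allocated; if fewer vectors than queue pairs
 * are available, queues end up sharing vectors (see nvme_intr()).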
*/ 3410 if (navail > nqpairs) 3411 navail = nqpairs; 3412 3413 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 3414 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 3415 3416 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 3417 &count, 0); 3418 if (ret != DDI_SUCCESS) { 3419 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 3420 __func__); 3421 goto fail; 3422 } 3423 3424 nvme->n_intr_cnt = count; 3425 3426 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 3427 if (ret != DDI_SUCCESS) { 3428 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 3429 __func__); 3430 goto fail; 3431 } 3432 3433 for (i = 0; i < count; i++) { 3434 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 3435 (void *)nvme, (void *)(uintptr_t)i); 3436 if (ret != DDI_SUCCESS) { 3437 dev_err(nvme->n_dip, CE_WARN, 3438 "!%s: ddi_intr_add_handler failed", __func__); 3439 goto fail; 3440 } 3441 } 3442 3443 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 3444 3445 for (i = 0; i < count; i++) { 3446 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 3447 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 3448 else 3449 ret = ddi_intr_enable(nvme->n_inth[i]); 3450 3451 if (ret != DDI_SUCCESS) { 3452 dev_err(nvme->n_dip, CE_WARN, 3453 "!%s: enabling interrupt %d failed", __func__, i); 3454 goto fail; 3455 } 3456 } 3457 3458 nvme->n_intr_type = intr_type; 3459 3460 nvme->n_progress |= NVME_INTERRUPTS; 3461 3462 return (DDI_SUCCESS); 3463 3464 fail: 3465 nvme_release_interrupts(nvme); 3466 3467 return (ret); 3468 } 3469 3470 static int 3471 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 3472 { 3473 _NOTE(ARGUNUSED(arg)); 3474 3475 pci_ereport_post(dip, fm_error, NULL); 3476 return (fm_error->fme_status); 3477 } 3478 3479 static void 3480 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, 3481 void *b) 3482 { 3483 nvme_t *nvme = a; 3484 3485 nvme->n_dead = B_TRUE; 3486 3487 /* 3488 * Fail all outstanding commands, including those in the admin queue 3489 * (queue 0). 3490 */ 3491 for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { 3492 nvme_qpair_t *qp = nvme->n_ioq[i]; 3493 3494 mutex_enter(&qp->nq_mutex); 3495 for (size_t j = 0; j < qp->nq_nentry; j++) { 3496 nvme_cmd_t *cmd = qp->nq_cmd[j]; 3497 nvme_cmd_t *u_cmd; 3498 3499 if (cmd == NULL) { 3500 continue; 3501 } 3502 3503 /* 3504 * Since we have the queue lock held the entire time we 3505 * iterate over it, it's not possible for the queue to 3506 * change underneath us. Thus, we don't need to check 3507 * that the return value of nvme_unqueue_cmd matches the 3508 * requested cmd to unqueue. 3509 */ 3510 u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 3511 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, 3512 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 3513 3514 ASSERT3P(u_cmd, ==, cmd); 3515 } 3516 mutex_exit(&qp->nq_mutex); 3517 } 3518 } 3519 3520 static int 3521 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 3522 { 3523 nvme_t *nvme; 3524 int instance; 3525 int nregs; 3526 off_t regsize; 3527 int i; 3528 char name[32]; 3529 bd_ops_t ops = nvme_bd_ops; 3530 3531 if (cmd != DDI_ATTACH) 3532 return (DDI_FAILURE); 3533 3534 instance = ddi_get_instance(dip); 3535 3536 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 3537 return (DDI_FAILURE); 3538 3539 nvme = ddi_get_soft_state(nvme_state, instance); 3540 ddi_set_driver_private(dip, nvme); 3541 nvme->n_dip = dip; 3542 3543 /* Set up event handlers for hot removal. 
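 * On surprise removal nvme_remove_callback() marks the controller
 * dead and fails all commands still queued on the admin and I/O
 * queues.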
*/ 3544 if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, 3545 &nvme->n_rm_cookie) != DDI_SUCCESS) { 3546 goto fail; 3547 } 3548 if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, 3549 nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != 3550 DDI_SUCCESS) { 3551 goto fail; 3552 } 3553 3554 mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); 3555 3556 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3557 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 3558 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 3559 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 3560 B_TRUE : B_FALSE; 3561 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3562 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 3563 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3564 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 3565 /* 3566 * Double up the default for completion queues in case of 3567 * queue sharing. 3568 */ 3569 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3570 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 3571 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3572 DDI_PROP_DONTPASS, "async-event-limit", 3573 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 3574 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3575 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 3576 B_TRUE : B_FALSE; 3577 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3578 DDI_PROP_DONTPASS, "min-phys-block-size", 3579 NVME_DEFAULT_MIN_BLOCK_SIZE); 3580 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3581 DDI_PROP_DONTPASS, "max-submission-queues", -1); 3582 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 3583 DDI_PROP_DONTPASS, "max-completion-queues", -1); 3584 3585 if (!ISP2(nvme->n_min_block_size) || 3586 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 3587 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 3588 "using default %d", ISP2(nvme->n_min_block_size) ? 3589 "too low" : "not a power of 2", 3590 NVME_DEFAULT_MIN_BLOCK_SIZE); 3591 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3592 } 3593 3594 if (nvme->n_submission_queues != -1 && 3595 (nvme->n_submission_queues < 1 || 3596 nvme->n_submission_queues > UINT16_MAX)) { 3597 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 3598 "valid. Must be [1..%d]", nvme->n_submission_queues, 3599 UINT16_MAX); 3600 nvme->n_submission_queues = -1; 3601 } 3602 3603 if (nvme->n_completion_queues != -1 && 3604 (nvme->n_completion_queues < 1 || 3605 nvme->n_completion_queues > UINT16_MAX)) { 3606 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 3607 "valid. 
Must be [1..%d]", nvme->n_completion_queues, 3608 UINT16_MAX); 3609 nvme->n_completion_queues = -1; 3610 } 3611 3612 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 3613 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 3614 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 3615 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 3616 3617 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 3618 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 3619 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 3620 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 3621 3622 if (nvme->n_async_event_limit < 1) 3623 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 3624 3625 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 3626 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 3627 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 3628 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 3629 3630 /* 3631 * Set up FMA support. 3632 */ 3633 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 3634 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 3635 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 3636 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 3637 3638 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 3639 3640 if (nvme->n_fm_cap) { 3641 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 3642 nvme->n_reg_acc_attr.devacc_attr_access = 3643 DDI_FLAGERR_ACC; 3644 3645 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 3646 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3647 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 3648 } 3649 3650 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3651 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3652 pci_ereport_setup(dip); 3653 3654 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3655 ddi_fm_handler_register(dip, nvme_fm_errcb, 3656 (void *)nvme); 3657 } 3658 3659 nvme->n_progress |= NVME_FMA_INIT; 3660 3661 /* 3662 * The spec defines several register sets. Only the controller 3663 * registers (set 1) are currently used. 3664 */ 3665 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 3666 nregs < 2 || 3667 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 3668 goto fail; 3669 3670 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 3671 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 3672 dev_err(dip, CE_WARN, "!failed to map regset 1"); 3673 goto fail; 3674 } 3675 3676 nvme->n_progress |= NVME_REGS_MAPPED; 3677 3678 /* 3679 * Create PRP DMA cache 3680 */ 3681 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 3682 ddi_driver_name(dip), ddi_get_instance(dip)); 3683 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 3684 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 3685 NULL, (void *)nvme, NULL, 0); 3686 3687 if (nvme_init(nvme) != DDI_SUCCESS) 3688 goto fail; 3689 3690 if (!nvme->n_idctl->id_oncs.on_dset_mgmt) 3691 ops.o_free_space = NULL; 3692 3693 /* 3694 * Initialize the driver with the UFM subsystem 3695 */ 3696 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 3697 &nvme->n_ufmh, nvme) != 0) { 3698 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 3699 goto fail; 3700 } 3701 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 3702 ddi_ufm_update(nvme->n_ufmh); 3703 nvme->n_progress |= NVME_UFM_INIT; 3704 3705 /* 3706 * Attach the blkdev driver for each namespace. 
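 * A character minor node is created for every namespace, including
 * ignored ones, but a blkdev handle is only allocated and attached
 * for namespaces that are not ignored.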
3707 */ 3708 for (i = 0; i != nvme->n_namespace_count; i++) { 3709 if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, 3710 S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), 3711 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 3712 dev_err(dip, CE_WARN, 3713 "!failed to create minor node for namespace %d", i); 3714 goto fail; 3715 } 3716 3717 if (nvme->n_ns[i].ns_ignore) 3718 continue; 3719 3720 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], 3721 &ops, &nvme->n_prp_dma_attr, KM_SLEEP); 3722 3723 if (nvme->n_ns[i].ns_bd_hdl == NULL) { 3724 dev_err(dip, CE_WARN, 3725 "!failed to get blkdev handle for namespace %d", i); 3726 goto fail; 3727 } 3728 3729 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) 3730 != DDI_SUCCESS) { 3731 dev_err(dip, CE_WARN, 3732 "!failed to attach blkdev handle for namespace %d", 3733 i); 3734 goto fail; 3735 } 3736 } 3737 3738 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 3739 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) 3740 != DDI_SUCCESS) { 3741 dev_err(dip, CE_WARN, "nvme_attach: " 3742 "cannot create devctl minor node"); 3743 goto fail; 3744 } 3745 3746 return (DDI_SUCCESS); 3747 3748 fail: 3749 /* attach successful anyway so that FMA can retire the device */ 3750 if (nvme->n_dead) 3751 return (DDI_SUCCESS); 3752 3753 (void) nvme_detach(dip, DDI_DETACH); 3754 3755 return (DDI_FAILURE); 3756 } 3757 3758 static int 3759 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 3760 { 3761 int instance, i; 3762 nvme_t *nvme; 3763 3764 if (cmd != DDI_DETACH) 3765 return (DDI_FAILURE); 3766 3767 instance = ddi_get_instance(dip); 3768 3769 nvme = ddi_get_soft_state(nvme_state, instance); 3770 3771 if (nvme == NULL) 3772 return (DDI_FAILURE); 3773 3774 ddi_remove_minor_node(dip, "devctl"); 3775 mutex_destroy(&nvme->n_minor.nm_mutex); 3776 3777 if (nvme->n_ns) { 3778 for (i = 0; i != nvme->n_namespace_count; i++) { 3779 ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); 3780 mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); 3781 3782 if (nvme->n_ns[i].ns_bd_hdl) { 3783 (void) bd_detach_handle( 3784 nvme->n_ns[i].ns_bd_hdl); 3785 bd_free_handle(nvme->n_ns[i].ns_bd_hdl); 3786 } 3787 3788 if (nvme->n_ns[i].ns_idns) 3789 kmem_free(nvme->n_ns[i].ns_idns, 3790 sizeof (nvme_identify_nsid_t)); 3791 if (nvme->n_ns[i].ns_devid) 3792 strfree(nvme->n_ns[i].ns_devid); 3793 } 3794 3795 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 3796 nvme->n_namespace_count); 3797 } 3798 if (nvme->n_progress & NVME_UFM_INIT) { 3799 ddi_ufm_fini(nvme->n_ufmh); 3800 mutex_destroy(&nvme->n_fwslot_mutex); 3801 } 3802 3803 if (nvme->n_progress & NVME_INTERRUPTS) 3804 nvme_release_interrupts(nvme); 3805 3806 for (i = 0; i < nvme->n_cq_count; i++) { 3807 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) 3808 taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); 3809 } 3810 3811 if (nvme->n_ioq_count > 0) { 3812 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3813 if (nvme->n_ioq[i] != NULL) { 3814 /* TODO: send destroy queue commands */ 3815 nvme_free_qpair(nvme->n_ioq[i]); 3816 } 3817 } 3818 3819 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 3820 (nvme->n_ioq_count + 1)); 3821 } 3822 3823 if (nvme->n_prp_cache != NULL) { 3824 kmem_cache_destroy(nvme->n_prp_cache); 3825 } 3826 3827 if (nvme->n_progress & NVME_REGS_MAPPED) { 3828 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 3829 (void) nvme_reset(nvme, B_FALSE); 3830 } 3831 3832 if (nvme->n_progress & NVME_CTRL_LIMITS) 3833 sema_destroy(&nvme->n_abort_sema); 3834 3835 if (nvme->n_progress & NVME_ADMIN_QUEUE) 3836 
		nvme_free_qpair(nvme->n_adminq);

	if (nvme->n_cq_count > 0) {
		nvme_destroy_cq_array(nvme, 0);
		nvme->n_cq = NULL;
		nvme->n_cq_count = 0;
	}

	if (nvme->n_idctl)
		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_progress & NVME_REGS_MAPPED)
		ddi_regs_map_free(&nvme->n_regh);

	if (nvme->n_progress & NVME_FMA_INIT) {
		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_unregister(nvme->n_dip);

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_teardown(nvme->n_dip);

		ddi_fm_fini(nvme->n_dip);
	}

	if (nvme->n_vendor != NULL)
		strfree(nvme->n_vendor);

	if (nvme->n_product != NULL)
		strfree(nvme->n_product);

	/* Clean up hot removal event handler. */
	if (nvme->n_ev_rm_cb_id != NULL) {
		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
	}
	nvme->n_ev_rm_cb_id = NULL;

	ddi_soft_state_free(nvme_state, instance);

	return (DDI_SUCCESS);
}

static int
nvme_quiesce(dev_info_t *dip)
{
	int instance;
	nvme_t *nvme;

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);

	(void) nvme_reset(nvme, B_TRUE);

	return (DDI_SUCCESS);
}

static int
nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
{
	nvme_t *nvme = cmd->nc_nvme;
	uint_t nprp_per_page, nprp;
	uint64_t *prp;
	const ddi_dma_cookie_t *cookie;
	uint_t idx;
	uint_t ncookies = ddi_dma_ncookies(dma);

	if (ncookies == 0)
		return (DDI_FAILURE);

	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
		return (DDI_FAILURE);
	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;

	if (ncookies == 1) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
		return (DDI_SUCCESS);
	} else if (ncookies == 2) {
		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
			return (DDI_FAILURE);
		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
		return (DDI_SUCCESS);
	}

	/*
	 * At this point, we're always operating on cookies at
	 * index >= 1 and writing the addresses of those cookies
	 * into a new page. The address of that page is stored
	 * as the second PRP entry.
	 */
	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
	ASSERT(nprp_per_page > 0);

	/*
	 * We currently don't support chained PRPs and set up our DMA
	 * attributes to reflect that. If we still get an I/O request
	 * that needs a chained PRP something is very wrong. Account
	 * for the first cookie here, which we've placed in d_prp[0].
	 */
	nprp = howmany(ncookies - 1, nprp_per_page);
	VERIFY(nprp == 1);

	/*
	 * Allocate a page of pointers, in which we'll write the
	 * addresses of cookies 1 to `ncookies`.
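	 * The page comes from the per-instance PRP kmem cache created in
	 * nvme_attach(); its DMA cookie address is placed in d_prp[1] just
	 * below.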
3946 */ 3947 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 3948 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 3949 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress; 3950 3951 prp = (uint64_t *)cmd->nc_prp->nd_memp; 3952 for (idx = 1; idx < ncookies; idx++) { 3953 if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL) 3954 return (DDI_FAILURE); 3955 *prp++ = cookie->dmac_laddress; 3956 } 3957 3958 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 3959 DDI_DMA_SYNC_FORDEV); 3960 return (DDI_SUCCESS); 3961 } 3962 3963 /* 3964 * The maximum number of requests supported for a deallocate request is 3965 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and 3966 * unchanged through at least 1.4a). The definition of nvme_range_t is also 3967 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for 3968 * a deallocate request will fit into the smallest supported namespace page 3969 * (4k). 3970 */ 3971 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096); 3972 3973 static int 3974 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize, 3975 int allocflag) 3976 { 3977 const dkioc_free_list_t *dfl = xfer->x_dfl; 3978 const dkioc_free_list_ext_t *exts = dfl->dfl_exts; 3979 nvme_t *nvme = cmd->nc_nvme; 3980 nvme_range_t *ranges = NULL; 3981 uint_t i; 3982 3983 /* 3984 * The number of ranges in the request is 0s based (that is 3985 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ..., 3986 * word10 == 255 -> 256 ranges). Therefore the allowed values are 3987 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request, 3988 * we either provided bad info in nvme_bd_driveinfo() or there is a bug 3989 * in blkdev. 3990 */ 3991 VERIFY3U(dfl->dfl_num_exts, >, 0); 3992 VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES); 3993 cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff; 3994 3995 cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE; 3996 3997 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag); 3998 if (cmd->nc_prp == NULL) 3999 return (DDI_FAILURE); 4000 4001 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 4002 ranges = (nvme_range_t *)cmd->nc_prp->nd_memp; 4003 4004 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress; 4005 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 4006 4007 for (i = 0; i < dfl->dfl_num_exts; i++) { 4008 uint64_t lba, len; 4009 4010 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize; 4011 len = exts[i].dfle_length / blocksize; 4012 4013 VERIFY3U(len, <=, UINT32_MAX); 4014 4015 /* No context attributes for a deallocate request */ 4016 ranges[i].nr_ctxattr = 0; 4017 ranges[i].nr_len = len; 4018 ranges[i].nr_lba = lba; 4019 } 4020 4021 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 4022 DDI_DMA_SYNC_FORDEV); 4023 4024 return (DDI_SUCCESS); 4025 } 4026 4027 static nvme_cmd_t * 4028 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 4029 { 4030 nvme_t *nvme = ns->ns_nvme; 4031 nvme_cmd_t *cmd; 4032 int allocflag; 4033 4034 /* 4035 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 4036 */ 4037 allocflag = (xfer->x_flags & BD_XFER_POLL) ? 
KM_NOSLEEP : KM_SLEEP; 4038 cmd = nvme_alloc_cmd(nvme, allocflag); 4039 4040 if (cmd == NULL) 4041 return (NULL); 4042 4043 cmd->nc_sqe.sqe_opc = opc; 4044 cmd->nc_callback = nvme_bd_xfer_done; 4045 cmd->nc_xfer = xfer; 4046 4047 switch (opc) { 4048 case NVME_OPC_NVM_WRITE: 4049 case NVME_OPC_NVM_READ: 4050 VERIFY(xfer->x_nblks <= 0x10000); 4051 4052 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4053 4054 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 4055 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 4056 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 4057 4058 if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS) 4059 goto fail; 4060 break; 4061 4062 case NVME_OPC_NVM_FLUSH: 4063 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4064 break; 4065 4066 case NVME_OPC_NVM_DSET_MGMT: 4067 cmd->nc_sqe.sqe_nsid = ns->ns_id; 4068 4069 if (nvme_fill_ranges(cmd, xfer, 4070 (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS) 4071 goto fail; 4072 break; 4073 4074 default: 4075 goto fail; 4076 } 4077 4078 return (cmd); 4079 4080 fail: 4081 nvme_free_cmd(cmd); 4082 return (NULL); 4083 } 4084 4085 static void 4086 nvme_bd_xfer_done(void *arg) 4087 { 4088 nvme_cmd_t *cmd = arg; 4089 bd_xfer_t *xfer = cmd->nc_xfer; 4090 int error = 0; 4091 4092 error = nvme_check_cmd_status(cmd); 4093 nvme_free_cmd(cmd); 4094 4095 bd_xfer_done(xfer, error); 4096 } 4097 4098 static void 4099 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 4100 { 4101 nvme_namespace_t *ns = arg; 4102 nvme_t *nvme = ns->ns_nvme; 4103 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); 4104 4105 /* 4106 * Set the blkdev qcount to the number of submission queues. 4107 * It will then create one waitq/runq pair for each submission 4108 * queue and spread I/O requests across the queues. 4109 */ 4110 drive->d_qcount = nvme->n_ioq_count; 4111 4112 /* 4113 * I/O activity to individual namespaces is distributed across 4114 * each of the d_qcount blkdev queues (which has been set to 4115 * the number of nvme submission queues). d_qsize is the number 4116 * of submitted and not completed I/Os within each queue that blkdev 4117 * will allow before it starts holding them in the waitq. 4118 * 4119 * Each namespace will create a child blkdev instance, for each one 4120 * we try and set the d_qsize so that each namespace gets an 4121 * equal portion of the submission queue. 4122 * 4123 * If post instantiation of the nvme drive, n_namespaces_attachable 4124 * changes and a namespace is attached it could calculate a 4125 * different d_qsize. It may even be that the sum of the d_qsizes is 4126 * now beyond the submission queue size. Should that be the case 4127 * and the I/O rate is such that blkdev attempts to submit more 4128 * I/Os than the size of the submission queue, the excess I/Os 4129 * will be held behind the semaphore nq_sema. 4130 */ 4131 drive->d_qsize = nvme->n_io_squeue_len / ns_count; 4132 4133 /* 4134 * Don't let the queue size drop below the minimum, though. 4135 */ 4136 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); 4137 4138 /* 4139 * d_maxxfer is not set, which means the value is taken from the DMA 4140 * attributes specified to bd_alloc_handle. 
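	 * In this driver those are the PRP DMA attributes passed to
	 * bd_alloc_handle() in nvme_attach(), so the maximum transfer is
	 * effectively bounded by what a single, non-chained PRP list can map.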
4141 */ 4142 4143 drive->d_removable = B_FALSE; 4144 drive->d_hotpluggable = B_FALSE; 4145 4146 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 4147 drive->d_target = ns->ns_id; 4148 drive->d_lun = 0; 4149 4150 drive->d_model = nvme->n_idctl->id_model; 4151 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 4152 drive->d_vendor = nvme->n_vendor; 4153 drive->d_vendor_len = strlen(nvme->n_vendor); 4154 drive->d_product = nvme->n_product; 4155 drive->d_product_len = strlen(nvme->n_product); 4156 drive->d_serial = nvme->n_idctl->id_serial; 4157 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 4158 drive->d_revision = nvme->n_idctl->id_fwrev; 4159 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 4160 4161 /* 4162 * If we support the dataset management command, the only restrictions 4163 * on a discard request are the maximum number of ranges (segments) 4164 * per single request. 4165 */ 4166 if (nvme->n_idctl->id_oncs.on_dset_mgmt) 4167 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES; 4168 } 4169 4170 static int 4171 nvme_bd_mediainfo(void *arg, bd_media_t *media) 4172 { 4173 nvme_namespace_t *ns = arg; 4174 nvme_t *nvme = ns->ns_nvme; 4175 4176 if (nvme->n_dead) { 4177 return (EIO); 4178 } 4179 4180 media->m_nblks = ns->ns_block_count; 4181 media->m_blksize = ns->ns_block_size; 4182 media->m_readonly = B_FALSE; 4183 media->m_solidstate = B_TRUE; 4184 4185 media->m_pblksize = ns->ns_best_block_size; 4186 4187 return (0); 4188 } 4189 4190 static int 4191 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 4192 { 4193 nvme_t *nvme = ns->ns_nvme; 4194 nvme_cmd_t *cmd; 4195 nvme_qpair_t *ioq; 4196 boolean_t poll; 4197 int ret; 4198 4199 if (nvme->n_dead) { 4200 return (EIO); 4201 } 4202 4203 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 4204 if (cmd == NULL) 4205 return (ENOMEM); 4206 4207 cmd->nc_sqid = xfer->x_qnum + 1; 4208 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4209 ioq = nvme->n_ioq[cmd->nc_sqid]; 4210 4211 /* 4212 * Get the polling flag before submitting the command. The command may 4213 * complete immediately after it was submitted, which means we must 4214 * treat both cmd and xfer as if they have been freed already. 4215 */ 4216 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 4217 4218 ret = nvme_submit_io_cmd(ioq, cmd); 4219 4220 if (ret != 0) 4221 return (ret); 4222 4223 if (!poll) 4224 return (0); 4225 4226 do { 4227 cmd = nvme_retrieve_cmd(nvme, ioq); 4228 if (cmd != NULL) 4229 cmd->nc_callback(cmd); 4230 else 4231 drv_usecwait(10); 4232 } while (ioq->nq_active_cmds != 0); 4233 4234 return (0); 4235 } 4236 4237 static int 4238 nvme_bd_read(void *arg, bd_xfer_t *xfer) 4239 { 4240 nvme_namespace_t *ns = arg; 4241 4242 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 4243 } 4244 4245 static int 4246 nvme_bd_write(void *arg, bd_xfer_t *xfer) 4247 { 4248 nvme_namespace_t *ns = arg; 4249 4250 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 4251 } 4252 4253 static int 4254 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 4255 { 4256 nvme_namespace_t *ns = arg; 4257 4258 if (ns->ns_nvme->n_dead) 4259 return (EIO); 4260 4261 /* 4262 * If the volatile write cache is not present or not enabled the FLUSH 4263 * command is a no-op, so we can take a shortcut here. 
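	 * A cache that is absent altogether is reported as ENOTSUP, while a
	 * cache that is present but disabled completes the request
	 * successfully without issuing a FLUSH command.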
4264 */ 4265 if (!ns->ns_nvme->n_write_cache_present) { 4266 bd_xfer_done(xfer, ENOTSUP); 4267 return (0); 4268 } 4269 4270 if (!ns->ns_nvme->n_write_cache_enabled) { 4271 bd_xfer_done(xfer, 0); 4272 return (0); 4273 } 4274 4275 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 4276 } 4277 4278 static int 4279 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 4280 { 4281 nvme_namespace_t *ns = arg; 4282 nvme_t *nvme = ns->ns_nvme; 4283 4284 if (nvme->n_dead) { 4285 return (EIO); 4286 } 4287 4288 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 4289 if (*(uint64_t *)ns->ns_eui64 != 0) { 4290 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 4291 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 4292 } else { 4293 return (ddi_devid_init(devinfo, DEVID_ENCAP, 4294 strlen(ns->ns_devid), ns->ns_devid, devid)); 4295 } 4296 } 4297 4298 static int 4299 nvme_bd_free_space(void *arg, bd_xfer_t *xfer) 4300 { 4301 nvme_namespace_t *ns = arg; 4302 4303 if (xfer->x_dfl == NULL) 4304 return (EINVAL); 4305 4306 if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt) 4307 return (ENOTSUP); 4308 4309 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT)); 4310 } 4311 4312 static int 4313 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 4314 { 4315 #ifndef __lock_lint 4316 _NOTE(ARGUNUSED(cred_p)); 4317 #endif 4318 minor_t minor = getminor(*devp); 4319 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4320 int nsid = NVME_MINOR_NSID(minor); 4321 nvme_minor_state_t *nm; 4322 int rv = 0; 4323 4324 if (otyp != OTYP_CHR) 4325 return (EINVAL); 4326 4327 if (nvme == NULL) 4328 return (ENXIO); 4329 4330 if (nsid > nvme->n_namespace_count) 4331 return (ENXIO); 4332 4333 if (nvme->n_dead) 4334 return (EIO); 4335 4336 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 4337 4338 mutex_enter(&nm->nm_mutex); 4339 if (nm->nm_oexcl) { 4340 rv = EBUSY; 4341 goto out; 4342 } 4343 4344 if (flag & FEXCL) { 4345 if (nm->nm_ocnt != 0) { 4346 rv = EBUSY; 4347 goto out; 4348 } 4349 nm->nm_oexcl = B_TRUE; 4350 } 4351 4352 nm->nm_ocnt++; 4353 4354 out: 4355 mutex_exit(&nm->nm_mutex); 4356 return (rv); 4357 4358 } 4359 4360 static int 4361 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 4362 { 4363 #ifndef __lock_lint 4364 _NOTE(ARGUNUSED(cred_p)); 4365 _NOTE(ARGUNUSED(flag)); 4366 #endif 4367 minor_t minor = getminor(dev); 4368 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 4369 int nsid = NVME_MINOR_NSID(minor); 4370 nvme_minor_state_t *nm; 4371 4372 if (otyp != OTYP_CHR) 4373 return (ENXIO); 4374 4375 if (nvme == NULL) 4376 return (ENXIO); 4377 4378 if (nsid > nvme->n_namespace_count) 4379 return (ENXIO); 4380 4381 nm = nsid == 0 ? 
&nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 4382 4383 mutex_enter(&nm->nm_mutex); 4384 if (nm->nm_oexcl) 4385 nm->nm_oexcl = B_FALSE; 4386 4387 ASSERT(nm->nm_ocnt > 0); 4388 nm->nm_ocnt--; 4389 mutex_exit(&nm->nm_mutex); 4390 4391 return (0); 4392 } 4393 4394 static int 4395 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4396 cred_t *cred_p) 4397 { 4398 _NOTE(ARGUNUSED(cred_p)); 4399 int rv = 0; 4400 void *idctl; 4401 4402 if ((mode & FREAD) == 0) 4403 return (EPERM); 4404 4405 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) 4406 return (EINVAL); 4407 4408 if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0) 4409 return (rv); 4410 4411 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) 4412 != 0) 4413 rv = EFAULT; 4414 4415 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 4416 4417 return (rv); 4418 } 4419 4420 /* 4421 * Execute commands on behalf of the various ioctls. 4422 */ 4423 static int 4424 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr, 4425 uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout) 4426 { 4427 nvme_cmd_t *cmd; 4428 nvme_qpair_t *ioq; 4429 int rv = 0; 4430 4431 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 4432 if (is_admin) { 4433 cmd->nc_sqid = 0; 4434 ioq = nvme->n_adminq; 4435 } else { 4436 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 4437 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 4438 ioq = nvme->n_ioq[cmd->nc_sqid]; 4439 } 4440 4441 /* 4442 * This function is used to facilitate requests from 4443 * userspace, so don't panic if the command fails. This 4444 * is especially true for admin passthru commands, where 4445 * the actual command data structure is entirely defined 4446 * by userspace. 4447 */ 4448 cmd->nc_dontpanic = B_TRUE; 4449 4450 cmd->nc_callback = nvme_wakeup_cmd; 4451 cmd->nc_sqe = *sqe; 4452 4453 if ((rwk & (FREAD | FWRITE)) != 0) { 4454 if (data_addr == NULL) { 4455 rv = EINVAL; 4456 goto free_cmd; 4457 } 4458 4459 if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ, 4460 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 4461 dev_err(nvme->n_dip, CE_WARN, 4462 "!nvme_zalloc_dma failed for nvme_ioc_cmd()"); 4463 4464 rv = ENOMEM; 4465 goto free_cmd; 4466 } 4467 4468 if ((rv = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0) 4469 goto free_cmd; 4470 4471 if ((rwk & FWRITE) != 0) { 4472 if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp, 4473 data_len, rwk & FKIOCTL) != 0) { 4474 rv = EFAULT; 4475 goto free_cmd; 4476 } 4477 } 4478 } 4479 4480 if (is_admin) { 4481 nvme_admin_cmd(cmd, timeout); 4482 } else { 4483 mutex_enter(&cmd->nc_mutex); 4484 4485 rv = nvme_submit_io_cmd(ioq, cmd); 4486 4487 if (rv == EAGAIN) { 4488 mutex_exit(&cmd->nc_mutex); 4489 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 4490 "!nvme_ioc_cmd() failed, I/O Q full"); 4491 goto free_cmd; 4492 } 4493 4494 nvme_wait_cmd(cmd, timeout); 4495 4496 mutex_exit(&cmd->nc_mutex); 4497 } 4498 4499 if (cqe != NULL) 4500 *cqe = cmd->nc_cqe; 4501 4502 if ((rv = nvme_check_cmd_status(cmd)) != 0) { 4503 dev_err(nvme->n_dip, CE_WARN, 4504 "!nvme_ioc_cmd() failed with sct = %x, sc = %x", 4505 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 4506 4507 goto free_cmd; 4508 } 4509 4510 if ((rwk & FREAD) != 0) { 4511 if (ddi_copyout(cmd->nc_dma->nd_memp, 4512 data_addr, data_len, rwk & FKIOCTL) != 0) 4513 rv = EFAULT; 4514 } 4515 4516 free_cmd: 4517 nvme_free_cmd(cmd); 4518 4519 return (rv); 4520 } 4521 4522 static int 4523 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4524 int mode, cred_t 
*cred_p) 4525 { 4526 _NOTE(ARGUNUSED(nsid, cred_p)); 4527 int rv = 0; 4528 nvme_reg_cap_t cap = { 0 }; 4529 nvme_capabilities_t nc; 4530 4531 if ((mode & FREAD) == 0) 4532 return (EPERM); 4533 4534 if (nioc->n_len < sizeof (nc)) 4535 return (EINVAL); 4536 4537 cap.r = nvme_get64(nvme, NVME_REG_CAP); 4538 4539 /* 4540 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 4541 * specify the base page size of 4k (1<<12), so add 12 here to 4542 * get the real page size value. 4543 */ 4544 nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); 4545 nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); 4546 4547 if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) 4548 rv = EFAULT; 4549 4550 return (rv); 4551 } 4552 4553 static int 4554 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4555 int mode, cred_t *cred_p) 4556 { 4557 _NOTE(ARGUNUSED(cred_p)); 4558 void *log = NULL; 4559 size_t bufsize = 0; 4560 int rv = 0; 4561 4562 if ((mode & FREAD) == 0) 4563 return (EPERM); 4564 4565 switch (nioc->n_arg) { 4566 case NVME_LOGPAGE_ERROR: 4567 if (nsid != 0) 4568 return (EINVAL); 4569 break; 4570 case NVME_LOGPAGE_HEALTH: 4571 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) 4572 return (EINVAL); 4573 4574 if (nsid == 0) 4575 nsid = (uint32_t)-1; 4576 4577 break; 4578 case NVME_LOGPAGE_FWSLOT: 4579 if (nsid != 0) 4580 return (EINVAL); 4581 break; 4582 default: 4583 if (!NVME_IS_VENDOR_SPECIFIC_LOGPAGE(nioc->n_arg)) 4584 return (EINVAL); 4585 if (nioc->n_len > NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE) { 4586 dev_err(nvme->n_dip, CE_NOTE, "!Vendor-specific log " 4587 "page size exceeds device maximum supported size: " 4588 "%lu", NVME_VENDOR_SPECIFIC_LOGPAGE_MAX_SIZE); 4589 return (EINVAL); 4590 } 4591 if (nioc->n_len == 0) 4592 return (EINVAL); 4593 bufsize = nioc->n_len; 4594 if (nsid == 0) 4595 nsid = (uint32_t)-1; 4596 } 4597 4598 if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid) 4599 != DDI_SUCCESS) 4600 return (EIO); 4601 4602 if (nioc->n_len < bufsize) { 4603 kmem_free(log, bufsize); 4604 return (EINVAL); 4605 } 4606 4607 if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) 4608 rv = EFAULT; 4609 4610 nioc->n_len = bufsize; 4611 kmem_free(log, bufsize); 4612 4613 return (rv); 4614 } 4615 4616 static int 4617 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4618 int mode, cred_t *cred_p) 4619 { 4620 _NOTE(ARGUNUSED(cred_p)); 4621 void *buf = NULL; 4622 size_t bufsize = 0; 4623 uint32_t res = 0; 4624 uint8_t feature; 4625 int rv = 0; 4626 4627 if ((mode & FREAD) == 0) 4628 return (EPERM); 4629 4630 if ((nioc->n_arg >> 32) > 0xff) 4631 return (EINVAL); 4632 4633 feature = (uint8_t)(nioc->n_arg >> 32); 4634 4635 switch (feature) { 4636 case NVME_FEAT_ARBITRATION: 4637 case NVME_FEAT_POWER_MGMT: 4638 case NVME_FEAT_ERROR: 4639 case NVME_FEAT_NQUEUES: 4640 case NVME_FEAT_INTR_COAL: 4641 case NVME_FEAT_WRITE_ATOM: 4642 case NVME_FEAT_ASYNC_EVENT: 4643 case NVME_FEAT_PROGRESS: 4644 if (nsid != 0) 4645 return (EINVAL); 4646 break; 4647 4648 case NVME_FEAT_TEMPERATURE: 4649 if (nsid != 0) 4650 return (EINVAL); 4651 res = nioc->n_arg & 0xffffffffUL; 4652 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) { 4653 nvme_temp_threshold_t tt; 4654 4655 tt.r = res; 4656 if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER && 4657 tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) { 4658 return (EINVAL); 4659 } 4660 4661 if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) { 4662 return (EINVAL); 4663 } 4664 } else if (res != 0) { 4665 return (ENOTSUP); 4666 } 4667 break; 
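	/*
	 * The interrupt vector configuration feature takes the vector index
	 * in the lower 32 bits of the argument; it must refer to a vector
	 * that was actually allocated by the driver.
	 */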
4668 4669 case NVME_FEAT_INTR_VECT: 4670 if (nsid != 0) 4671 return (EINVAL); 4672 4673 res = nioc->n_arg & 0xffffffffUL; 4674 if (res >= nvme->n_intr_cnt) 4675 return (EINVAL); 4676 break; 4677 4678 case NVME_FEAT_LBA_RANGE: 4679 if (nvme->n_lba_range_supported == B_FALSE) 4680 return (EINVAL); 4681 4682 if (nsid == 0 || 4683 nsid > nvme->n_namespace_count) 4684 return (EINVAL); 4685 4686 break; 4687 4688 case NVME_FEAT_WRITE_CACHE: 4689 if (nsid != 0) 4690 return (EINVAL); 4691 4692 if (!nvme->n_write_cache_present) 4693 return (EINVAL); 4694 4695 break; 4696 4697 case NVME_FEAT_AUTO_PST: 4698 if (nsid != 0) 4699 return (EINVAL); 4700 4701 if (!nvme->n_auto_pst_supported) 4702 return (EINVAL); 4703 4704 break; 4705 4706 default: 4707 return (EINVAL); 4708 } 4709 4710 rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf, 4711 &bufsize); 4712 if (rv != 0) 4713 return (rv); 4714 4715 if (nioc->n_len < bufsize) { 4716 kmem_free(buf, bufsize); 4717 return (EINVAL); 4718 } 4719 4720 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) 4721 rv = EFAULT; 4722 4723 kmem_free(buf, bufsize); 4724 nioc->n_arg = res; 4725 nioc->n_len = bufsize; 4726 4727 return (rv); 4728 } 4729 4730 static int 4731 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4732 cred_t *cred_p) 4733 { 4734 _NOTE(ARGUNUSED(nsid, mode, cred_p)); 4735 4736 if ((mode & FREAD) == 0) 4737 return (EPERM); 4738 4739 nioc->n_arg = nvme->n_intr_cnt; 4740 return (0); 4741 } 4742 4743 static int 4744 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4745 cred_t *cred_p) 4746 { 4747 _NOTE(ARGUNUSED(nsid, cred_p)); 4748 int rv = 0; 4749 4750 if ((mode & FREAD) == 0) 4751 return (EPERM); 4752 4753 if (nioc->n_len < sizeof (nvme->n_version)) 4754 return (ENOMEM); 4755 4756 if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, 4757 sizeof (nvme->n_version), mode) != 0) 4758 rv = EFAULT; 4759 4760 return (rv); 4761 } 4762 4763 static int 4764 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4765 cred_t *cred_p) 4766 { 4767 _NOTE(ARGUNUSED(mode)); 4768 nvme_format_nvm_t frmt = { 0 }; 4769 int c_nsid = nsid != 0 ? nsid - 1 : 0; 4770 4771 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4772 return (EPERM); 4773 4774 frmt.r = nioc->n_arg & 0xffffffff; 4775 4776 /* 4777 * Check whether the FORMAT NVM command is supported. 4778 */ 4779 if (nvme->n_idctl->id_oacs.oa_format == 0) 4780 return (ENOTSUP); 4781 4782 /* 4783 * Don't allow format or secure erase of individual namespace if that 4784 * would cause a format or secure erase of all namespaces. 4785 */ 4786 if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) 4787 return (EINVAL); 4788 4789 if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && 4790 nvme->n_idctl->id_fna.fn_sec_erase != 0) 4791 return (EINVAL); 4792 4793 /* 4794 * Don't allow formatting with Protection Information. 4795 */ 4796 if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) 4797 return (EINVAL); 4798 4799 /* 4800 * Don't allow formatting using an illegal LBA format, or any LBA format 4801 * that uses metadata. 4802 */ 4803 if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || 4804 nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) 4805 return (EINVAL); 4806 4807 /* 4808 * Don't allow formatting using an illegal Secure Erase setting. 
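	 * In particular, cryptographic erase is only allowed if the
	 * controller advertises support for it in the FNA field of the
	 * identify controller data.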
4809 */ 4810 if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || 4811 (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && 4812 nvme->n_idctl->id_fna.fn_crypt_erase == 0)) 4813 return (EINVAL); 4814 4815 if (nsid == 0) 4816 nsid = (uint32_t)-1; 4817 4818 return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0, 4819 B_FALSE, frmt.b.fm_ses)); 4820 } 4821 4822 static int 4823 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4824 cred_t *cred_p) 4825 { 4826 _NOTE(ARGUNUSED(nioc, mode)); 4827 int rv = 0; 4828 4829 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4830 return (EPERM); 4831 4832 if (nsid == 0) 4833 return (EINVAL); 4834 4835 if (nvme->n_ns[nsid - 1].ns_ignore) 4836 return (0); 4837 4838 rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); 4839 if (rv != DDI_SUCCESS) 4840 rv = EBUSY; 4841 4842 return (rv); 4843 } 4844 4845 static int 4846 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 4847 cred_t *cred_p) 4848 { 4849 _NOTE(ARGUNUSED(nioc, mode)); 4850 nvme_identify_nsid_t *idns; 4851 int rv = 0; 4852 4853 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4854 return (EPERM); 4855 4856 if (nsid == 0) 4857 return (EINVAL); 4858 4859 /* 4860 * Identify namespace again, free old identify data. 4861 */ 4862 idns = nvme->n_ns[nsid - 1].ns_idns; 4863 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 4864 return (EIO); 4865 4866 kmem_free(idns, sizeof (nvme_identify_nsid_t)); 4867 4868 if (nvme->n_ns[nsid - 1].ns_ignore) 4869 return (ENOTSUP); 4870 4871 if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL) 4872 nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle( 4873 &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr, 4874 KM_SLEEP); 4875 4876 rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); 4877 if (rv != DDI_SUCCESS) 4878 rv = EBUSY; 4879 4880 return (rv); 4881 } 4882 4883 static void 4884 nvme_ufm_update(nvme_t *nvme) 4885 { 4886 mutex_enter(&nvme->n_fwslot_mutex); 4887 ddi_ufm_update(nvme->n_ufmh); 4888 if (nvme->n_fwslot != NULL) { 4889 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t)); 4890 nvme->n_fwslot = NULL; 4891 } 4892 mutex_exit(&nvme->n_fwslot_mutex); 4893 } 4894 4895 static int 4896 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4897 int mode, cred_t *cred_p) 4898 { 4899 int rv = 0; 4900 size_t len, copylen; 4901 offset_t offset; 4902 uintptr_t buf; 4903 nvme_cqe_t cqe = { 0 }; 4904 nvme_sqe_t sqe = { 4905 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD 4906 }; 4907 4908 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4909 return (EPERM); 4910 4911 if (nvme->n_idctl->id_oacs.oa_firmware == 0) 4912 return (ENOTSUP); 4913 4914 if (nsid != 0) 4915 return (EINVAL); 4916 4917 /* 4918 * The offset (in n_len) is restricted to the number of DWORDs in 4919 * 32 bits. 4920 */ 4921 if (nioc->n_len > NVME_FW_OFFSETB_MAX) 4922 return (EINVAL); 4923 4924 /* Confirm that both offset and length are a multiple of DWORD bytes */ 4925 if ((nioc->n_len & NVME_DWORD_MASK) != 0 || 4926 (nioc->n_arg & NVME_DWORD_MASK) != 0) 4927 return (EINVAL); 4928 4929 len = nioc->n_len; 4930 offset = nioc->n_arg; 4931 buf = (uintptr_t)nioc->n_buf; 4932 4933 nioc->n_arg = 0; 4934 4935 while (len > 0 && rv == 0) { 4936 /* 4937 * nvme_ioc_cmd() does not use SGLs or PRP lists. 4938 * It is limited to 2 PRPs per NVM command, so limit 4939 * the size of the data to 2 pages. 
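		 * Each pass through this loop therefore transfers at most two
		 * pages of the firmware image, advancing the DWORD-granular
		 * offset as it goes.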
4940 */ 4941 copylen = MIN(2 * nvme->n_pagesize, len); 4942 4943 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1; 4944 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT); 4945 4946 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen, 4947 FWRITE, &cqe, nvme_admin_cmd_timeout); 4948 4949 /* 4950 * Regardless of whether the command succeeded or not, whether 4951 * there's an errno in rv to be returned, we'll return any 4952 * command-specific status code in n_arg. 4953 * 4954 * As n_arg isn't cleared in all other possible code paths 4955 * returning an error, we return the status code as a negative 4956 * value so it can be distinguished easily from whatever value 4957 * was passed in n_arg originally. This of course only works as 4958 * long as arguments passed in n_arg are less than INT64_MAX, 4959 * which they currently are. 4960 */ 4961 if (cqe.cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 4962 nioc->n_arg = (uint64_t)-cqe.cqe_sf.sf_sc; 4963 4964 buf += copylen; 4965 offset += copylen; 4966 len -= copylen; 4967 } 4968 4969 /* 4970 * Let the DDI UFM subsystem know that the firmware information for 4971 * this device has changed. 4972 */ 4973 nvme_ufm_update(nvme); 4974 4975 return (rv); 4976 } 4977 4978 static int 4979 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 4980 int mode, cred_t *cred_p) 4981 { 4982 nvme_firmware_commit_dw10_t fc_dw10 = { 0 }; 4983 uint32_t slot = nioc->n_arg & 0xffffffff; 4984 uint32_t action = nioc->n_arg >> 32; 4985 nvme_cqe_t cqe = { 0 }; 4986 nvme_sqe_t sqe = { 4987 .sqe_opc = NVME_OPC_FW_ACTIVATE 4988 }; 4989 int timeout; 4990 int rv; 4991 4992 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 4993 return (EPERM); 4994 4995 if (nvme->n_idctl->id_oacs.oa_firmware == 0) 4996 return (ENOTSUP); 4997 4998 if (nsid != 0) 4999 return (EINVAL); 5000 5001 /* Validate slot is in range. */ 5002 if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX) 5003 return (EINVAL); 5004 5005 switch (action) { 5006 case NVME_FWC_SAVE: 5007 case NVME_FWC_SAVE_ACTIVATE: 5008 timeout = nvme_commit_save_cmd_timeout; 5009 if (slot == 1 && nvme->n_idctl->id_frmw.fw_readonly) 5010 return (EROFS); 5011 break; 5012 case NVME_FWC_ACTIVATE: 5013 case NVME_FWC_ACTIVATE_IMMED: 5014 timeout = nvme_admin_cmd_timeout; 5015 break; 5016 default: 5017 return (EINVAL); 5018 } 5019 5020 fc_dw10.b.fc_slot = slot; 5021 fc_dw10.b.fc_action = action; 5022 sqe.sqe_cdw10 = fc_dw10.r; 5023 5024 nioc->n_arg = 0; 5025 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout); 5026 5027 /* 5028 * Regardless of whether the command succeeded or not, whether 5029 * there's an errno in rv to be returned, we'll return any 5030 * command-specific status code in n_arg. 5031 * 5032 * As n_arg isn't cleared in all other possible code paths 5033 * returning an error, we return the status code as a negative 5034 * value so it can be distinguished easily from whatever value 5035 * was passed in n_arg originally. This of course only works as 5036 * long as arguments passed in n_arg are less than INT64_MAX, 5037 * which they currently are. 5038 */ 5039 if (cqe.cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 5040 nioc->n_arg = (uint64_t)-cqe.cqe_sf.sf_sc; 5041 5042 /* 5043 * Let the DDI UFM subsystem know that the firmware information for 5044 * this device has changed. 5045 */ 5046 nvme_ufm_update(nvme); 5047 5048 return (rv); 5049 } 5050 5051 /* 5052 * Helper to copy in a passthru command from userspace, handling 5053 * different data models. 
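 * ILP32 callers pass a nvme_passthru_cmd32_t, which is converted field by
 * field into the native nvme_passthru_cmd_t used by the rest of the driver.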
5054 */ 5055 static int 5056 nvme_passthru_copy_cmd_in(const void *buf, nvme_passthru_cmd_t *cmd, int mode) 5057 { 5058 #ifdef _MULTI_DATAMODEL 5059 switch (ddi_model_convert_from(mode & FMODELS)) { 5060 case DDI_MODEL_ILP32: { 5061 nvme_passthru_cmd32_t cmd32; 5062 if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0) 5063 return (-1); 5064 cmd->npc_opcode = cmd32.npc_opcode; 5065 cmd->npc_timeout = cmd32.npc_timeout; 5066 cmd->npc_flags = cmd32.npc_flags; 5067 cmd->npc_cdw12 = cmd32.npc_cdw12; 5068 cmd->npc_cdw13 = cmd32.npc_cdw13; 5069 cmd->npc_cdw14 = cmd32.npc_cdw14; 5070 cmd->npc_cdw15 = cmd32.npc_cdw15; 5071 cmd->npc_buflen = cmd32.npc_buflen; 5072 cmd->npc_buf = cmd32.npc_buf; 5073 break; 5074 } 5075 case DDI_MODEL_NONE: 5076 #endif 5077 if (ddi_copyin(buf, (void*)cmd, sizeof (nvme_passthru_cmd_t), 5078 mode) != 0) 5079 return (-1); 5080 #ifdef _MULTI_DATAMODEL 5081 break; 5082 } 5083 #endif 5084 return (0); 5085 } 5086 5087 /* 5088 * Helper to copy out a passthru command result to userspace, handling 5089 * different data models. 5090 */ 5091 static int 5092 nvme_passthru_copy_cmd_out(const nvme_passthru_cmd_t *cmd, void *buf, int mode) 5093 { 5094 #ifdef _MULTI_DATAMODEL 5095 switch (ddi_model_convert_from(mode & FMODELS)) { 5096 case DDI_MODEL_ILP32: { 5097 nvme_passthru_cmd32_t cmd32; 5098 bzero(&cmd32, sizeof (cmd32)); 5099 cmd32.npc_opcode = cmd->npc_opcode; 5100 cmd32.npc_status = cmd->npc_status; 5101 cmd32.npc_err = cmd->npc_err; 5102 cmd32.npc_timeout = cmd->npc_timeout; 5103 cmd32.npc_flags = cmd->npc_flags; 5104 cmd32.npc_cdw0 = cmd->npc_cdw0; 5105 cmd32.npc_cdw12 = cmd->npc_cdw12; 5106 cmd32.npc_cdw13 = cmd->npc_cdw13; 5107 cmd32.npc_cdw14 = cmd->npc_cdw14; 5108 cmd32.npc_cdw15 = cmd->npc_cdw15; 5109 cmd32.npc_buflen = (size32_t)cmd->npc_buflen; 5110 cmd32.npc_buf = (uintptr32_t)cmd->npc_buf; 5111 if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0) 5112 return (-1); 5113 break; 5114 } 5115 case DDI_MODEL_NONE: 5116 #endif 5117 if (ddi_copyout(cmd, buf, sizeof (nvme_passthru_cmd_t), 5118 mode) != 0) 5119 return (-1); 5120 #ifdef _MULTI_DATAMODEL 5121 break; 5122 } 5123 #endif 5124 return (0); 5125 } 5126 5127 /* 5128 * Run an arbitrary vendor-specific admin command on the device. 5129 */ 5130 static int 5131 nvme_ioctl_passthru(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 5132 cred_t *cred_p) 5133 { 5134 int rv = 0; 5135 uint_t timeout = 0; 5136 int rwk = 0; 5137 nvme_passthru_cmd_t cmd; 5138 size_t expected_passthru_size = 0; 5139 nvme_sqe_t sqe; 5140 nvme_cqe_t cqe; 5141 5142 bzero(&cmd, sizeof (cmd)); 5143 bzero(&sqe, sizeof (sqe)); 5144 bzero(&cqe, sizeof (cqe)); 5145 5146 /* 5147 * Basic checks: permissions, data model, argument size. 5148 */ 5149 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 5150 return (EPERM); 5151 5152 /* 5153 * Compute the expected size of the argument buffer 5154 */ 5155 #ifdef _MULTI_DATAMODEL 5156 switch (ddi_model_convert_from(mode & FMODELS)) { 5157 case DDI_MODEL_ILP32: 5158 expected_passthru_size = sizeof (nvme_passthru_cmd32_t); 5159 break; 5160 case DDI_MODEL_NONE: 5161 #endif 5162 expected_passthru_size = sizeof (nvme_passthru_cmd_t); 5163 #ifdef _MULTI_DATAMODEL 5164 break; 5165 } 5166 #endif 5167 5168 if (nioc->n_len != expected_passthru_size) { 5169 cmd.npc_err = NVME_PASSTHRU_ERR_CMD_SIZE; 5170 rv = EINVAL; 5171 goto out; 5172 } 5173 5174 /* 5175 * Ensure the device supports the standard vendor specific 5176 * admin command format. 
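	 * Support for this is advertised via the NVSCC field of the identify
	 * controller data (id_nvscc.nv_spec).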
5177 */ 5178 if (!nvme->n_idctl->id_nvscc.nv_spec) { 5179 cmd.npc_err = NVME_PASSTHRU_ERR_NOT_SUPPORTED; 5180 rv = ENOTSUP; 5181 goto out; 5182 } 5183 5184 if (nvme_passthru_copy_cmd_in((const void*)nioc->n_buf, &cmd, mode)) 5185 return (EFAULT); 5186 5187 if (!NVME_IS_VENDOR_SPECIFIC_CMD(cmd.npc_opcode)) { 5188 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_OPCODE; 5189 rv = EINVAL; 5190 goto out; 5191 } 5192 5193 /* 5194 * This restriction is not mandated by the spec, so future work 5195 * could relax this if it's necessary to support commands that both 5196 * read and write. 5197 */ 5198 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0 && 5199 (cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) { 5200 cmd.npc_err = NVME_PASSTHRU_ERR_READ_AND_WRITE; 5201 rv = EINVAL; 5202 goto out; 5203 } 5204 if (cmd.npc_timeout > nvme_vendor_specific_admin_cmd_max_timeout) { 5205 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_TIMEOUT; 5206 rv = EINVAL; 5207 goto out; 5208 } 5209 timeout = cmd.npc_timeout; 5210 5211 /* 5212 * Passed-thru command buffer verification: 5213 * - Size is multiple of DWords 5214 * - Non-null iff the length is non-zero 5215 * - Null if neither reading nor writing data. 5216 * - Non-null if reading or writing. 5217 * - Maximum buffer size. 5218 */ 5219 if ((cmd.npc_buflen % sizeof (uint32_t)) != 0) { 5220 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5221 rv = EINVAL; 5222 goto out; 5223 } 5224 if (((void*)cmd.npc_buf != NULL && cmd.npc_buflen == 0) || 5225 ((void*)cmd.npc_buf == NULL && cmd.npc_buflen != 0)) { 5226 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5227 rv = EINVAL; 5228 goto out; 5229 } 5230 if (cmd.npc_flags == 0 && (void*)cmd.npc_buf != NULL) { 5231 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5232 rv = EINVAL; 5233 goto out; 5234 } 5235 if ((cmd.npc_flags != 0) && ((void*)cmd.npc_buf == NULL)) { 5236 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5237 rv = EINVAL; 5238 goto out; 5239 } 5240 if (cmd.npc_buflen > nvme_vendor_specific_admin_cmd_size) { 5241 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5242 rv = EINVAL; 5243 goto out; 5244 } 5245 if ((cmd.npc_buflen >> NVME_DWORD_SHIFT) > UINT32_MAX) { 5246 cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER; 5247 rv = EINVAL; 5248 goto out; 5249 } 5250 5251 sqe.sqe_opc = cmd.npc_opcode; 5252 sqe.sqe_nsid = nsid; 5253 sqe.sqe_cdw10 = (uint32_t)(cmd.npc_buflen >> NVME_DWORD_SHIFT); 5254 sqe.sqe_cdw12 = cmd.npc_cdw12; 5255 sqe.sqe_cdw13 = cmd.npc_cdw13; 5256 sqe.sqe_cdw14 = cmd.npc_cdw14; 5257 sqe.sqe_cdw15 = cmd.npc_cdw15; 5258 if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0) 5259 rwk = FREAD; 5260 else if ((cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) 5261 rwk = FWRITE; 5262 5263 rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void*)cmd.npc_buf, 5264 cmd.npc_buflen, rwk, &cqe, timeout); 5265 cmd.npc_status = cqe.cqe_sf.sf_sc; 5266 cmd.npc_cdw0 = cqe.cqe_dw0; 5267 5268 out: 5269 if (nvme_passthru_copy_cmd_out(&cmd, (void*)nioc->n_buf, mode)) 5270 rv = EFAULT; 5271 return (rv); 5272 } 5273 5274 static int 5275 nvme_ioctl_is_ignored_ns(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 5276 cred_t *cred_p) 5277 { 5278 _NOTE(ARGUNUSED(cred_p)); 5279 5280 if ((mode & FREAD) == 0) 5281 return (EPERM); 5282 5283 if (nsid == 0) 5284 return (EINVAL); 5285 5286 if (nvme->n_ns[nsid - 1].ns_ignore) 5287 nioc->n_arg = 1; 5288 else 5289 nioc->n_arg = 0; 5290 5291 return (0); 5292 } 5293 5294 static int 5295 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 5296 int *rval_p) 5297 { 5298 #ifndef __lock_lint 5299 _NOTE(ARGUNUSED(rval_p)); 
5300 #endif 5301 minor_t minor = getminor(dev); 5302 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 5303 int nsid = NVME_MINOR_NSID(minor); 5304 int rv = 0; 5305 nvme_ioctl_t nioc; 5306 5307 int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { 5308 NULL, 5309 nvme_ioctl_identify, 5310 nvme_ioctl_identify, 5311 nvme_ioctl_capabilities, 5312 nvme_ioctl_get_logpage, 5313 nvme_ioctl_get_features, 5314 nvme_ioctl_intr_cnt, 5315 nvme_ioctl_version, 5316 nvme_ioctl_format, 5317 nvme_ioctl_detach, 5318 nvme_ioctl_attach, 5319 nvme_ioctl_firmware_download, 5320 nvme_ioctl_firmware_commit, 5321 nvme_ioctl_passthru, 5322 nvme_ioctl_is_ignored_ns 5323 }; 5324 5325 if (nvme == NULL) 5326 return (ENXIO); 5327 5328 if (nsid > nvme->n_namespace_count) 5329 return (ENXIO); 5330 5331 if (IS_DEVCTL(cmd)) 5332 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 5333 5334 #ifdef _MULTI_DATAMODEL 5335 switch (ddi_model_convert_from(mode & FMODELS)) { 5336 case DDI_MODEL_ILP32: { 5337 nvme_ioctl32_t nioc32; 5338 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), 5339 mode) != 0) 5340 return (EFAULT); 5341 nioc.n_len = nioc32.n_len; 5342 nioc.n_buf = nioc32.n_buf; 5343 nioc.n_arg = nioc32.n_arg; 5344 break; 5345 } 5346 case DDI_MODEL_NONE: 5347 #endif 5348 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) 5349 != 0) 5350 return (EFAULT); 5351 #ifdef _MULTI_DATAMODEL 5352 break; 5353 } 5354 #endif 5355 5356 if (nvme->n_dead && cmd != NVME_IOC_DETACH) 5357 return (EIO); 5358 5359 5360 if (cmd == NVME_IOC_IDENTIFY_CTRL) { 5361 /* 5362 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and 5363 * attachment point nodes. 5364 */ 5365 nsid = 0; 5366 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { 5367 /* 5368 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it 5369 * will always return identify data for namespace 1. 5370 */ 5371 nsid = 1; 5372 } 5373 5374 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) 5375 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, 5376 cred_p); 5377 else 5378 rv = EINVAL; 5379 5380 #ifdef _MULTI_DATAMODEL 5381 switch (ddi_model_convert_from(mode & FMODELS)) { 5382 case DDI_MODEL_ILP32: { 5383 nvme_ioctl32_t nioc32; 5384 5385 nioc32.n_len = (size32_t)nioc.n_len; 5386 nioc32.n_buf = (uintptr32_t)nioc.n_buf; 5387 nioc32.n_arg = nioc.n_arg; 5388 5389 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), 5390 mode) != 0) 5391 return (EFAULT); 5392 break; 5393 } 5394 case DDI_MODEL_NONE: 5395 #endif 5396 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) 5397 != 0) 5398 return (EFAULT); 5399 #ifdef _MULTI_DATAMODEL 5400 break; 5401 } 5402 #endif 5403 5404 return (rv); 5405 } 5406 5407 /* 5408 * DDI UFM Callbacks 5409 */ 5410 static int 5411 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5412 ddi_ufm_image_t *img) 5413 { 5414 nvme_t *nvme = arg; 5415 5416 if (imgno != 0) 5417 return (EINVAL); 5418 5419 ddi_ufm_image_set_desc(img, "Firmware"); 5420 ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot); 5421 5422 return (0); 5423 } 5424 5425 /* 5426 * Fill out firmware slot information for the requested slot. The firmware 5427 * slot information is gathered by requesting the Firmware Slot Information log 5428 * page. The format of the page is described in section 5.10.1.3. 
5429 * 5430 * We lazily cache the log page on the first call and then invalidate the cache 5431 * data after a successful firmware download or firmware commit command. 5432 * The cached data is protected by a mutex as the state can change 5433 * asynchronous to this callback. 5434 */ 5435 static int 5436 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 5437 uint_t slotno, ddi_ufm_slot_t *slot) 5438 { 5439 nvme_t *nvme = arg; 5440 void *log = NULL; 5441 size_t bufsize; 5442 ddi_ufm_attr_t attr = 0; 5443 char fw_ver[NVME_FWVER_SZ + 1]; 5444 int ret; 5445 5446 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 5447 return (EINVAL); 5448 5449 mutex_enter(&nvme->n_fwslot_mutex); 5450 if (nvme->n_fwslot == NULL) { 5451 ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, 5452 NVME_LOGPAGE_FWSLOT, 0); 5453 if (ret != DDI_SUCCESS || 5454 bufsize != sizeof (nvme_fwslot_log_t)) { 5455 if (log != NULL) 5456 kmem_free(log, bufsize); 5457 mutex_exit(&nvme->n_fwslot_mutex); 5458 return (EIO); 5459 } 5460 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 5461 } 5462 5463 /* 5464 * NVMe numbers firmware slots starting at 1 5465 */ 5466 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 5467 attr |= DDI_UFM_ATTR_ACTIVE; 5468 5469 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 5470 attr |= DDI_UFM_ATTR_WRITEABLE; 5471 5472 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 5473 attr |= DDI_UFM_ATTR_EMPTY; 5474 } else { 5475 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 5476 NVME_FWVER_SZ); 5477 fw_ver[NVME_FWVER_SZ] = '\0'; 5478 ddi_ufm_slot_set_version(slot, fw_ver); 5479 } 5480 mutex_exit(&nvme->n_fwslot_mutex); 5481 5482 ddi_ufm_slot_set_attrs(slot, attr); 5483 5484 return (0); 5485 } 5486 5487 static int 5488 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 5489 { 5490 *caps = DDI_UFM_CAP_REPORT; 5491 return (0); 5492 } 5493