/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2019 Unix Software Ltd.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Racktop Systems.
 * Copyright 2025 Oxide Computer Company.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver targets and is designed to support all NVMe 1.x and NVMe 2.x
 * devices. Features are added to the driver as we encounter devices that
 * require them and as our needs dictate, so some commands or log pages may not
 * take advantage of newer features that devices support at this time. When you
 * encounter such a case, it is generally fine to add that support to the driver
 * as long as you take care to ensure that the requisite device version is met
 * before using it.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 * ---------------
 * Interrupt Usage
 * ---------------
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an interrupt
 * vector and will post them to a taskq for completion processing.
 *
 * ------------------
 * Command Processing
 * ------------------
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
 * to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that can
 * hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a submission queue and the shared state is protected by
 * nq_mutex; the completion queue is protected by ncq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array.
 * The array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot to still
 * be used by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length. Queue overrun is prevented by a semaphore, so
 * a command submission may block if the queue is full.
 *
 * ------------------
 * Polled I/O Support
 * ------------------
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping the driver will just submit a command in the regular
 * way, and then repeatedly attempt a command retrieval until it gets the
 * command back.
 *
 * -----------------
 * Namespace Support
 * -----------------
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev interface
 * for each namespace found. Namespaces can have various attributes to support
 * protection information. This driver does not support any of these and ignores
 * namespaces that have such attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64), and NVMe 1.2 introduced an additional 128bit Namespace Globally
 * Unique Identifier (NGUID). This driver uses either the NGUID or the EUI64
 * if present to generate the devid, and passes the EUI64 to blkdev to use it
 * in the device node names.
 *
 * When a device has more than (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller, additional namespaces will not have minor nodes created.
 * They can still be used and specified through the controller minor node and
 * libnvme. This limit attempts to balance the number of controllers and
 * namespaces while fitting within the constraints of MAXMIN32, i.e. a 32-bit
 * device number, which only has 18 bits for the minor number. See the minor
 * node section for more information.
 *
 * The driver supports namespace management, meaning the ability to create and
 * destroy namespaces, and to attach and detach namespaces from controllers.
 * Each namespace has an associated nvme_ns_state_t, which transitions through
 * several states. The UNALLOCATED, ALLOCATED, and ACTIVE states are states that
 * are defined by the NVMe specification. Not all ACTIVE namespaces may be
 * attached to blkdev(4D) due to the use of features we don't support, for
 * example, metadata protection; such namespaces are ignored and remain ACTIVE.
 * Namespaces that we can use automatically transition to the NOT_IGNORED state.
 * Once they are attached to blkdev they enter the ATTACHED state.
 *
 * A namespace can only transition between these states one step at a time. Each
 * command that transitions a namespace between states has a corresponding array
 * of errnos, for example nvme_ns_delete_states[] and nvme_ctrl_attach_states[].
 * These dictate whether a state-changing command is allowed to proceed based on
 * the namespace's current state, and each entry is a specific error that allows
 * one to understand why something isn't in the proper state. This allows
 * library consumers to determine whether a namespace is already in the state
 * that a command is trying to reach, and thus whether the error can be ignored.
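 *
 * For example (see the state tables further down in this file), a blkdev
 * attach request (NVME_IOC_BD_ATTACH) for a namespace that is merely ACTIVE
 * because it uses unsupported features maps, via
 * nvme_bd_attach_states[NVME_NS_STATE_ACTIVE], to the semantic error
 * NVME_IOCTL_E_UNSUP_ATTACH_NS rather than a generic EINVAL, so a caller can
 * tell exactly why the transition was refused.
 *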
 * The following diagram summarizes namespace transitions:
 *
 *                      +-------------+
 *                      |             |
 *                      | Unallocated |
 *                      |             |
 *                      +-------------+
 *                          |     ^
 *                          |     |
 *  Namespace Management: . .*    * . . .  Namespace Management:
 *  Create                  |     |        Delete
 *  NVME_IOC_NS_CREATE      |     |        NVME_IOC_NS_DELETE
 *                          v     |
 *                      +-------------+
 *                      |             |
 *                      |  Allocated  |
 *                      |             |
 *                      +-------------+
 *                          |     ^
 *                          |     |
 *  Namespace Attachment: . .*    * . . .  Namespace Attachment:
 *  Controller Attach       |     |        Controller Detach
 *  NVME_IOC_CTRL_ATTACH    |     |        NVME_IOC_CTRL_DETACH
 *                          v     |
 *                   +------------+     |
 *                   |            |     |     +----------+
 *                   |   Active   |>----+----<|   Not    |
 *                   |            |--*------->| Ignored  |
 *                   +------------+   .       +----------+
 *                                    .          |     ^
 *         automatic kernel transition           |     |
 *                                               |     * . . blkdev Detach
 *                        blkdev attach . . *    |     |     NVME_IOC_BD_DETACH
 *                        NVME_IOC_BD_ATTACH     |     |
 *                                               v     |
 *                                            +----------+
 *                                            |          |
 *                                            |  blkdev  |
 *                                            | attached |
 *                                            |          |
 *                                            +----------+
 *
 * -----------
 * Minor nodes
 * -----------
 *
 * For each NVMe device the driver exposes one minor node for the controller and
 * one minor node for each namespace. The only operations supported by those
 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * primary control interface for the devices. The character device is a private
 * interface and we attempt stability through libnvme and, more so, nvmeadm.
 *
 * The controller minor node is much more flexible than the namespace minor node
 * and should be preferred. The controller node allows one to target any
 * namespace that the device has, while the namespace minor is limited in what
 * it can acquire. While the namespace minor exists, it should not be relied
 * upon and is not used by libnvme.
 *
 * The minor number space is split in two. We use the lower part to support the
 * controller and namespaces as described above in the 'Namespace Support'
 * section. The second set is used for cloning opens. We set aside one million
 * minors for this purpose. We utilize a cloning open so that we can have
 * per-file_t state. This is how we end up implementing and tracking locking
 * state and related information.
 *
 * When we have this cloned open, then we allocate a new nvme_minor_t which gets
 * its minor number from the nvme_open_minors id_space_t and is stored in the
 * nvme_open_minors_avl. Although open is called on a controller or namespace
 * minor, everything else occurs in the context of one of these ephemeral
 * minors.
 *
 * ------------------------------------
 * ioctls, Errors, and Exclusive Access
 * ------------------------------------
 *
 * All of the logical commands that one can issue are driven through the
 * ioctl(9E) interface. All of our ioctls have a similar shape where they
 * all include the 'nvme_ioctl_common_t' as their first member.
 *
 * This common ioctl structure is used to communicate the namespace that should
 * be targeted. When the namespace is left as 0, then that indicates that it
 * should target whatever the default is of the minor node. For a namespace
 * minor, that will be transparently rewritten to the namespace's namespace id.
 *
 * In addition, the nvme_ioctl_common_t structure also has a standard error
 * return. Our goal in our ioctl path is to ensure that we have useful semantic
 * errors as much as possible. EINVAL, EIO, etc. are all overloaded.
 * Instead, as long as we can copy in our structure, we will set a semantic
 * error. If we have an error from the controller, then that will be included
 * there.
 *
 * Each command has a specific policy that controls whether or not it is allowed
 * on the namespace or controller minor, whether the broadcast namespace is
 * allowed, various settings around what kind of exclusive access is allowed,
 * and more. Each of these is wrapped up in a bit of policy described by the
 * 'nvme_ioctl_check_t' structure.
 *
 * The driver provides a form of exclusion in the form of both a
 * controller-level and namespace-level read and write lock. Most operations do
 * not require a lock (e.g. get log page, identify, etc.), but a few do (e.g.
 * format nvm, firmware related activity, etc.). A read lock guarantees that you
 * can complete your operation without interference, but read locks are not
 * required. If you don't take a read lock and someone comes in with a write
 * lock, then subsequent operations will fail with a semantic error indicating
 * that you were blocked due to this.
 *
 * Here are some of the rules that govern our locks:
 *
 * 1. Writers starve readers. Any readers are allowed to finish when there is a
 *    pending writer; however, all subsequent readers will be blocked upon that
 *    writer.
 * 2. A controller write lock takes priority over all other locks. Put
 *    differently, a controller writer not only starves subsequent controller
 *    readers, but also all namespace read and write locks.
 * 3. Each namespace lock is independent.
 * 4. At most a single namespace lock may be owned.
 * 5. If you own a namespace lock, you may not take a controller lock (to help
 *    with lock ordering).
 * 6. In a similar spirit, if you own a controller write lock, you may not take
 *    any namespace lock. Someone with the controller write lock can perform any
 *    operations that they need to. However, if you have a controller read lock
 *    you may take any namespace lock.
 * 7. There is no ability to upgrade a read lock to a write lock.
 * 8. There is no recursive locking.
 *
 * While there's a lot there to keep track of, the goal of these rules is to
 * constrain things so as to avoid deadlock. This is more complex than the
 * original implementation in the driver which only allowed for an exclusive
 * open that was tied to the thread. The first issue with tying this to the
 * thread was that it didn't work well for software that utilized thread
 * pools, like complex daemons. The second issue is that we want daemons, such
 * as a FRU monitor, to be able to retain a file descriptor to the device
 * without blocking others from taking action except during critical periods.
 *
 * In particular, to enable something like libnvme, we didn't want someone to
 * have to open and close the file descriptor to change what kind of exclusive
 * access they desired.
 *
 * There are two different sets of data structures that we employ for tracking
 * locking information:
 *
 * 1) The nvme_lock_t structure is contained in both the nvme_t and the
 * nvme_namespace_t and tracks the current writer, readers, and pending writers
 * and readers. Each of these lists or the writer pointer all refer to our
 * second data structure.
 *
 * When a lock is owned by a single writer, then the nl_writer field is set to a
 * specific minor's lock data structure. If instead readers are present, then
 * the nl_readers list_t is not empty. An invariant of the system is that if
 * nl_writer is non-NULL, nl_readers must be empty and conversely, if nl_readers
 * is not empty, nl_writer must be NULL.
 *
 * 2) The nvme_minor_lock_info_t exists in the nvme_minor_t. There is one
 * information structure which represents the minor's controller lock and a
 * second one that represents the minor's namespace lock. The members of this
 * are broken into tracking what the current lock is and what it targets. It
 * also has several members that are intended for debugging (nli_last_change,
 * nli_acq_kthread, etc.).
 *
 * While the minor has two different lock information structures, our rules
 * ensure that only one of the two can be pending and that they shouldn't result
 * in a deadlock. When a lock is pending, the caller is sleeping on the minor's
 * nm_cv member.
 *
 * These relationships are represented in the following image which shows a
 * controller write lock being held with pending readers on the controller
 * lock and pending writers on one of the controller's namespaces.
 *
 *  +---------+
 *  | nvme_t  |
 *  |         |
 *  | n_lock -|-------+
 *  | n_ns -+ |       |                       +-----------------------------+
 *  +-------|-+   +-----------------+         | nvme_minor_t                |
 *          |     | nvme_lock_t     |         |                             |
 *          |     |                 |         |  +------------------------+ |
 *          |     | writer        --|----------->| nvme_minor_lock_info_t | |
 *          |     | reader list     |         |  | nm_ctrl_lock           | |
 *          |     | pending writers |         |  +------------------------+ |
 *          |     | pending readers |---+     |  +------------------------+ |
 *          |     +-----------------+   |     |  | nvme_minor_lock_info_t | |
 *          |                           |     |  | nm_ns_lock             | |
 *          |                           |     |  +------------------------+ |
 *          |                           |     +-----------------------------+
 *  +------------------+                |        +-----------------+
 *  | nvme_namespace_t |                |        | nvme_minor_t    |
 *  |                  |                |        |                 |
 *  | ns_lock ---+     |                |        | +-------------+ |
 *  +------------|-----+                +--------->| nm_ctrl_lock| |
 *               |                               | +-------------+ |
 *               v                               +-----------------+
 *  +------------------+                                 ...
 *  | nvme_lock_t      |                        +-----------------+
 *  |                  |                        | nvme_minor_t    |
 *  | writer           |                        |                 |
 *  | reader list      |                        | +-------------+ |
 *  | pending writers -|----------------+       | | nm_ctrl_lock| |
 *  | pending readers  |                |       | +-------------+ |
 *  +------------------+                |       +-----------------+
 *  +-----------------------------+     |       +-----------------------------+
 *  | nvme_minor_t                |     |       | nvme_minor_t                |
 *  |                             |     |       |                             |
 *  |  +------------------------+ |     |       |  +------------------------+ |
 *  |  | nvme_minor_lock_info_t | |     |       |  | nvme_minor_lock_info_t | |
 *  |  | nm_ctrl_lock           | |     |       |  | nm_ctrl_lock           | |
 *  |  +------------------------+ |     |       |  +------------------------+ |
 *  |  +------------------------+ |     v       |  +------------------------+ |
 *  |  | nvme_minor_lock_info_t |-|------------>|  | nvme_minor_lock_info_t | |
 *  |  | nm_ns_lock             | |             |  | nm_ns_lock             | |
 *  |  +------------------------+ |             |  +------------------------+ |
 *  +-----------------------------+             +-----------------------------+
 *
 * ----------------
 * Blkdev Interface
 * ----------------
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system.
 * As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted to
 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 * queues. There is currently no timeout handling of I/O commands.
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace GUID or EUI64, if
 * present, or composed using the device vendor ID, model number, serial number,
 * and the namespace ID.
 *
 * --------------
 * Error Handling
 * --------------
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off;
 * all further requests will return EIO. FMA is then called to fault the device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later when the limit is known more asynchronous event
 * requests are posted to allow quicker reception of error information. When an
 * asynchronous event is posted by the hardware the driver will parse the error
 * status fields and log information or fault the device, depending on the
 * severity of the asynchronous event. The asynchronous event request is then
 * reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The abort command
 * timeout is a separate tunable but the original command timeout will be used
 * if it is greater. If the abort times out too, the driver assumes the device
 * to be dead, fences it off, and calls FMA to retire it. In all other cases
 * the aborted command should return immediately with a status indicating it
 * was aborted, and the driver will wait indefinitely for that to happen. No
 * timeout handling of normal I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put on
 * the nvme_lost_cmds list if it references DMA memory. This will prevent the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 * -------
 * Locking
 * -------
 *
 * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
 * when accessing shared state and submission queue registers, ncq_mutex
 * is held when accessing completion queue state and registers.
 * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
 * mutexes themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this case,
 * the nc_mutex of the command to be aborted must be held across the call to
 * nvme_abort_cmd() to prevent the command from completing while the abort is in
 * progress.
 *
 * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
 * acquired first. More than one nq_mutex is never held by a single thread.
 * The ncq_mutex is only held by nvme_retrieve_cmd() and
 * nvme_process_iocq(). nvme_process_iocq() is only called from the
 * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
 * mutex is non-contentious but is required for implementation completeness
 * and safety.
 *
 * Each nvme_t has an n_admin_stat_mutex that protects the admin command
 * statistics structure. If this is taken in conjunction with any other locks,
 * then it must be taken last.
 *
 * There is one mutex n_minor_mutex which protects all open flags nm_open and
 * exclusive-open thread pointers nm_oexcl of each minor node associated with a
 * controller and its namespaces.
 *
 * In addition, there is a logical namespace management mutex which protects the
 * data about namespaces. When interrogating the metadata of any namespace, this
 * lock must be held. This gets tricky as we need to call into blkdev, which may
 * issue callbacks into us which want this lock, and it is illegal to hold locks
 * across those blkdev calls as otherwise they might lead to deadlock (blkdev
 * leverages ndi_devi_enter()).
 *
 * The lock exposes two levels, one that we call 'NVME' and one 'BDRO' or blkdev
 * read-only. The idea is that most callers will use the NVME level which says
 * this is a full traditional mutex operation. The BDRO level is used by blkdev
 * callback functions and is a promise to only read the data. When a blkdev
 * operation starts, the lock holder will use nvme_mgmt_bd_start(). This
 * strictly speaking drops the mutex, but records that the lock is logically
 * held by the thread that did the start() operation.
 *
 * During this time, other threads (or even the same one) may end up calling
 * into nvme_mgmt_lock(). Only one thread may still hold the lock at any time;
 * however, the BDRO level will be allowed to proceed during this time. This
 * allows us to make consistent progress and honor the blkdev lock ordering
 * requirements, though it is not as straightforward as a simple mutex.
 *
 * ---------------------
 * Quiesce / Fast Reboot
 * ---------------------
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * ------------
 * NVMe Hotplug
 * ------------
 *
 * The driver supports hot removal. The driver uses the NDI event framework
 * to register a callback, nvme_remove_callback, to clean up when a disk is
 * removed. In particular, the driver will unqueue outstanding I/O commands and
 * set n_dead on the softstate to true so that other operations, such as ioctls
 * and command submissions, fail as well.
 *
 * While the callback registration relies on the NDI event framework, the
 * removal event itself is kicked off in the PCIe hotplug framework, when the
 * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
 * device was removed from the slot.
 *
 * The NVMe driver instance itself will remain until the final close of the
 * device.
 *
 * ---------------
 * DDI UFM Support
 * ---------------
 *
 * The driver supports the DDI UFM framework for reporting information about
 * the device's firmware image and slot configuration. This data can be
 * queried by userland software via ioctls to the ufm driver. For more
 * information, see ddi_ufm(9E).
 *
 * --------------------
 * Driver Configuration
 * --------------------
 *
 * The following driver properties can be changed to control some aspects of the
 * driver's operation:
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   major versions to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
 * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 * - max-submission-queues: the maximum number of I/O submission queues.
 * - max-completion-queues: the maximum number of I/O completion queues,
 *   can be less than max-submission-queues, in which case the completion
 *   queues are shared.
 *
 * In addition to the above properties, some device-specific tunables can be
 * configured using the nvme-config-list global property. The value of this
 * property is a list of triplets. The formal syntax is:
 *
 *   nvme-config-list ::= <triplet> [, <triplet>]* ;
 *   <triplet>        ::= "<model>" , "<rev-list>" , "<tuple-list>"
 *   <rev-list>       ::= [ <fwrev> [, <fwrev>]* ]
 *   <tuple-list>     ::= <tunable> [, <tunable>]*
 *   <tunable>        ::= <name> : <value>
 *
 * The <model> and <fwrev> are the strings in nvme_identify_ctrl_t`id_model and
 * nvme_identify_ctrl_t`id_fwrev, respectively. The remainder of <tuple-list>
 * contains one or more tunables to apply to all controllers that match the
 * specified model number and optionally firmware revision. Each <tunable> is a
 * <name> : <value> pair.
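 *
 * As a purely illustrative example (the model string below is made up, and the
 * empty <rev-list> is intended to match any firmware revision per the grammar
 * above), a configuration file entry of the following form would turn off the
 * volatile write cache on all controllers reporting that model:
 *
 *   nvme-config-list = "EXAMPLE-NVME-MODEL", "", "volatile-write-cache:off";
 *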
 * Supported tunables are:
 *
 * - ignore-unknown-vendor-status: can be set to "on" to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 *
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 *
 * - volatile-write-cache: can be set to "on" or "off" to enable or disable the
 *   volatile write cache, if present
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for big-endian systems
 * - support for fast reboot
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/dkio.h>
#include <sys/pci.h>
#include <sys/mkdev.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/*
 * Assertions to make sure that we've properly captured various aspects of the
 * packed structures and haven't broken them during updates.
 */
CTASSERT(sizeof (nvme_identify_ctrl_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);

CTASSERT(sizeof (nvme_identify_nsid_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);

CTASSERT(sizeof (nvme_identify_nsid_list_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(sizeof (nvme_identify_ctrl_list_t) == NVME_IDENTIFY_BUFSIZE);

CTASSERT(sizeof (nvme_identify_primary_caps_t) == NVME_IDENTIFY_BUFSIZE);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);

CTASSERT(sizeof (nvme_nschange_list_t) == 4096);

/* NVMe spec version supported */
static const int nvme_version_major = 2;

/* Tunable for FORMAT NVM command timeout in seconds, default is 600s */
uint32_t nvme_format_cmd_timeout = 600;

/* Tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
uint32_t nvme_commit_save_cmd_timeout = 15;

/*
 * Tunable for the admin command timeout used for commands other than those
 * with their own timeouts defined above; in seconds. While most commands are
 * expected to complete very quickly (sub-second), experience has shown that
 * some controllers can occasionally be a bit slower, and not always consistent
 * in the time taken - times of up to around 4.2s have been observed. Setting
 * this to 15s by default provides headroom.
 */
uint32_t nvme_admin_cmd_timeout = 15;

/*
 * Tunable for abort command timeout in seconds, default is 60s. This timeout
 * is used when issuing an abort command, currently only in response to a
 * different admin command timing out. Aborts always complete after the command
 * that they are attempting to abort so we need to allow enough time for the
 * controller to process the long running command that we are attempting to
 * abort. The abort timeout here is only used if it is greater than the timeout
 * for the command that is being aborted.
 */
uint32_t nvme_abort_cmd_timeout = 60;

/*
 * Tunable for the size of arbitrary vendor specific admin commands,
 * default is 16MiB.
 */
uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;

/*
 * Tunable for the max timeout of arbitrary vendor specific admin commands,
 * default is 60s.
 */
uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;

/*
 * This ID space, AVL, and lock are used for keeping track of minor state across
 * opens between different devices.
 */
static id_space_t *nvme_open_minors;
static avl_tree_t nvme_open_minors_avl;
kmutex_t nvme_open_minors_mutex;

/*
 * Removal taskq used for n_dead callback processing.
 */
taskq_t *nvme_dead_taskq;

/*
 * This enumeration is used in tandem with nvme_mgmt_lock() to describe which
 * form of the lock is being taken. See the theory statement for more context.
 */
typedef enum {
        /*
         * This is the primary form of taking the management lock and indicates
         * that the user intends to do a read/write of it. This should always be
         * used for any ioctl paths or truly anything other than a blkdev
         * information operation.
         */
        NVME_MGMT_LOCK_NVME,
        /*
         * This is a subordinate form of the lock whereby the user is in blkdev
         * callback context and will only intend to read the namespace data.
         */
        NVME_MGMT_LOCK_BDRO
} nvme_mgmt_lock_level_t;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, uint32_t);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *, uint32_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *, uint32_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);
static boolean_t nvme_check_cmd_status_ioctl(nvme_cmd_t *,
    nvme_ioctl_common_t *);

static int nvme_abort_cmd(nvme_cmd_t *, const uint32_t);
static void nvme_async_event(nvme_t *);
static boolean_t nvme_format_nvm(nvme_t *, nvme_ioctl_format_t *);
static boolean_t nvme_get_logpage_int(nvme_t *, boolean_t, void **, size_t *,
    uint8_t);
static boolean_t nvme_identify(nvme_t *, boolean_t, nvme_ioctl_identify_t *,
    void **);
static boolean_t nvme_identify_int(nvme_t *, uint32_t, uint8_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_bd_free_space(void *, bd_xfer_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
    ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
    ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static int nvme_init_ns(nvme_t *, uint32_t);
static boolean_t nvme_bd_attach_ns(nvme_t *, nvme_ioctl_common_t *);
static boolean_t nvme_bd_detach_ns(nvme_t *, nvme_ioctl_common_t *);

static int nvme_minor_comparator(const void *, const void *);

typedef struct {
        nvme_sqe_t *ica_sqe;
        void *ica_data;
        uint32_t ica_data_len;
        uint_t ica_dma_flags;
        int ica_copy_flags;
        uint32_t ica_timeout;
        uint32_t ica_cdw0;
} nvme_ioc_cmd_args_t;
static boolean_t nvme_ioc_cmd(nvme_t *, nvme_ioctl_common_t *,
    nvme_ioc_cmd_args_t *);

static ddi_ufm_ops_t nvme_ufm_ops = {
        NULL,
        nvme_ufm_fill_image,
        nvme_ufm_fill_slot,
        nvme_ufm_getcaps
};

/*
 * Minor numbers are split amongst those used for controllers and for device
 * opens. The number of controller minors is limited based on MAXMIN32 per
 * the theory statement. We allocate 1 million minors as a total guess at a
 * number that'll probably be enough. The starting point of the open minors can
 * be shifted to accommodate future expansion of the NVMe device minors.
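 *
 * As an illustrative example of the split (using the macros below), controller
 * instance 2 exposes its controller minor as NVME_MINOR(2, 0) == 1024 and the
 * minor for namespace ID 1 as NVME_MINOR(2, 1) == 1025, while every cloning
 * open is handed a minor of at least NVME_OPEN_MINOR_MIN, i.e. above MAXMIN32.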
 */
#define NVME_MINOR_INST_SHIFT   9
#define NVME_MINOR(inst, nsid)  (((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define NVME_MINOR_INST(minor)  ((minor) >> NVME_MINOR_INST_SHIFT)
#define NVME_MINOR_NSID(minor)  ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define NVME_MINOR_MAX          (NVME_MINOR(1, 0) - 2)

#define NVME_OPEN_NMINORS       (1024 * 1024)
#define NVME_OPEN_MINOR_MIN     (MAXMIN32 + 1)
#define NVME_OPEN_MINOR_MAX_EXCL (NVME_OPEN_MINOR_MIN + \
    NVME_OPEN_NMINORS)

#define NVME_BUMP_STAT(nvme, stat) \
    atomic_inc_64(&nvme->n_device_stat.nds_ ## stat.value.ui64)

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static const ddi_dma_attr_t nvme_queue_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
        .dma_attr_align         = 0x1000,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x1000,
        .dma_attr_maxxfer       = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
        .dma_attr_seg           = 0xffffffffffffffffULL,
        .dma_attr_sgllen        = 1,
        .dma_attr_granular      = 1,
        .dma_attr_flags         = 0,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers. These DMA
 * attributes will be copied into the nvme_t during nvme_attach() and the
 * dma_attr_maxxfer will be updated.
 */
static const ddi_dma_attr_t nvme_prp_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = 0xfff,
        .dma_attr_align         = 0x1000,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x1000,
        .dma_attr_maxxfer       = 0x1000,
        .dma_attr_seg           = 0xfff,
        .dma_attr_sgllen        = -1,
        .dma_attr_granular      = 1,
        .dma_attr_flags         = 0,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes. While the SGL DMA attributes are copied
 * into the nvme_t, they are not currently used for any I/O.
 */
static const ddi_dma_attr_t nvme_sgl_dma_attr = {
        .dma_attr_version       = DMA_ATTR_V0,
        .dma_attr_addr_lo       = 0,
        .dma_attr_addr_hi       = 0xffffffffffffffffULL,
        .dma_attr_count_max     = 0xffffffffUL,
        .dma_attr_align         = 1,
        .dma_attr_burstsizes    = 0x7ff,
        .dma_attr_minxfer       = 0x10,
        .dma_attr_maxxfer       = 0xfffffffffULL,
        .dma_attr_seg           = 0xffffffffffffffffULL,
        .dma_attr_sgllen        = -1,
        .dma_attr_granular      = 0x10,
        .dma_attr_flags         = 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
        .devacc_attr_version    = DDI_DEVICE_ATTR_V0,
        .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
        .devacc_attr_dataorder = DDI_STRICTORDER_ACC
};

/*
 * ioctl validation policies.
 * These determine which namespaces are allowed or disallowed for various
 * operations. Note, all policy items should be explicitly listed here to help
 * make it clear what our intent is. That is also why some of these are
 * identical or repeated when they cover different ioctls.
 */

/*
 * The controller information ioctl generally contains read-only information
 * about the controller that is sourced from multiple different pieces of
 * information. This does not operate on a namespace and none are accepted.
 */
static const nvme_ioctl_check_t nvme_check_ctrl_info = {
        .nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * The kernel namespace information requires a namespace ID to be specified. It
 * does not allow for the broadcast ID to be specified.
 */
static const nvme_ioctl_check_t nvme_check_ns_info = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Identify commands are allowed to operate on a namespace minor. Unfortunately,
 * the namespace field in identify commands is a bit weird. In particular, some
 * commands need a valid namespace, while others are namespace listing
 * operations, which means illegal namespaces like zero are allowed.
 */
static const nvme_ioctl_check_t nvme_check_identify = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_TRUE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * The get log page command requires the ability to specify namespaces. When
 * targeting the controller, one must use the broadcast NSID.
 */
static const nvme_ioctl_check_t nvme_check_get_logpage = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
        .nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * When getting a feature, we do not want rewriting behavior as most features do
 * not require a namespace to be specified. Specific instances are checked in
 * nvme_validate_get_feature().
 */
static const nvme_ioctl_check_t nvme_check_get_feature = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Format commands must target a namespace. The broadcast namespace must be used
 * when referring to the controller.
 */
static const nvme_ioctl_check_t nvme_check_format = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
        .nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * blkdev and controller attach and detach must always target a namespace.
 * However, the broadcast namespace is not allowed. We still perform rewriting
 * so that specifying the controller node with 0 will be caught.
 */
static const nvme_ioctl_check_t nvme_check_attach_detach = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Namespace creation operations cannot target a namespace as the new namespace
 * ID will be returned in the operation. This operation requires the entire
 * controller lock to be owned as one has to coordinate this operation with all
 * of the actual namespace logic that's present.
 */
static const nvme_ioctl_check_t nvme_check_ns_create = {
        .nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_CTRL
};

/*
 * NVMe namespace delete must always target a namespace. The broadcast namespace
 * isn't allowed. We perform rewriting so that we can catch this.
 * Importantly this only requires holding an exclusive lock on the namespace,
 * not on the whole device like creating a namespace does. Note, we don't allow
 * this on the namespace minor itself as part of our path towards transitioning
 * away from its use.
 */
static const nvme_ioctl_check_t nvme_check_ns_delete = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Firmware operations must not target a namespace and are only allowed from the
 * controller.
 */
static const nvme_ioctl_check_t nvme_check_firmware = {
        .nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
};

/*
 * Passthru commands are an odd set. We only allow them from the primary
 * controller; however, we allow a namespace to be specified in them and allow
 * the broadcast namespace. We do not perform rewriting because we don't know
 * what the semantics are. We explicitly exempt passthru commands from needing
 * an exclusive lock and leave it up to them to tell us the impact of the
 * command and semantics. As this is a privileged interface and the semantics
 * are arbitrary, there's not much we can do without some assistance from the
 * consumer.
 */
static const nvme_ioctl_check_t nvme_check_passthru = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
};

/*
 * Lock operations are allowed to target a namespace, but must not be rewritten.
 * There is no support for the broadcast namespace. This is the only ioctl that
 * should skip exclusive checking as it's used to grant it.
 */
static const nvme_ioctl_check_t nvme_check_locking = {
        .nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
        .nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
        .nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_SKIP
};

/*
 * These data tables indicate how we handle the various states a namespace may
 * be in before we put it through the namespace state transition diagram.
 * Note, namespace creation does not allow one to specify a namespace ID,
 * therefore it doesn't have a set of entries here.
 *
 * See Namespace Support in the theory statement for more information.
 */
static const nvme_ioctl_errno_t nvme_ns_delete_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
};

static const nvme_ioctl_errno_t nvme_ctrl_attach_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
};

static const nvme_ioctl_errno_t nvme_ctrl_detach_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
};

static const nvme_ioctl_errno_t nvme_bd_attach_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_UNSUP_ATTACH_NS,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH,
};

static const nvme_ioctl_errno_t nvme_bd_detach_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_OK,
};

static const nvme_ioctl_errno_t nvme_format_nvm_states[] = {
        [NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
        [NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
        [NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
};

static struct cb_ops nvme_cb_ops = {
        .cb_open        = nvme_open,
        .cb_close       = nvme_close,
        .cb_strategy    = nodev,
        .cb_print       = nodev,
        .cb_dump        = nodev,
        .cb_read        = nodev,
        .cb_write       = nodev,
        .cb_ioctl       = nvme_ioctl,
        .cb_devmap      = nodev,
        .cb_mmap        = nodev,
        .cb_segmap      = nodev,
        .cb_chpoll      = nochpoll,
        .cb_prop_op     = ddi_prop_op,
        .cb_str         = 0,
        .cb_flag        = D_NEW | D_MP,
        .cb_rev         = CB_REV,
        .cb_aread       = nodev,
        .cb_awrite      = nodev
};

static struct dev_ops nvme_dev_ops = {
        .devo_rev       = DEVO_REV,
        .devo_refcnt    = 0,
        .devo_getinfo   = ddi_no_info,
        .devo_identify  = nulldev,
        .devo_probe     = nulldev,
        .devo_attach    = nvme_attach,
        .devo_detach    = nvme_detach,
        .devo_reset     = nodev,
        .devo_cb_ops    = &nvme_cb_ops,
        .devo_bus_ops   = NULL,
        .devo_power     = NULL,
        .devo_quiesce   = nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
        .drv_modops     = &mod_driverops,
        .drv_linkinfo   = "NVMe driver",
        .drv_dev_ops    = &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
        .ml_rev         = MODREV_1,
        .ml_linkage     = { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
        .o_version      = BD_OPS_CURRENT_VERSION,
        .o_drive_info   = nvme_bd_driveinfo,
        .o_media_info   = nvme_bd_mediainfo,
        .o_devid_init   = nvme_bd_devid,
        .o_sync_cache   = nvme_bd_sync,
        .o_read         = nvme_bd_read,
        .o_write        = nvme_bd_write,
        .o_free_space   = nvme_bd_free_space,
};

/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them with mdb.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;

int
_init(void)
{
        int error;

        error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
        if (error != DDI_SUCCESS)
                return (error);

        if ((nvme_open_minors = id_space_create("nvme_open_minors",
            NVME_OPEN_MINOR_MIN, NVME_OPEN_MINOR_MAX_EXCL)) == NULL) {
                ddi_soft_state_fini(&nvme_state);
                return (ENOMEM);
        }

        nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
            sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

        mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
        list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
            offsetof(nvme_cmd_t, nc_list));

        mutex_init(&nvme_open_minors_mutex, NULL, MUTEX_DRIVER, NULL);
        avl_create(&nvme_open_minors_avl, nvme_minor_comparator,
            sizeof (nvme_minor_t), offsetof(nvme_minor_t, nm_avl));

        nvme_dead_taskq = taskq_create("nvme_dead_taskq", 1, minclsyspri, 1, 1,
            TASKQ_PREPOPULATE);

        bd_mod_init(&nvme_dev_ops);

        error = mod_install(&nvme_modlinkage);
        if (error != DDI_SUCCESS) {
                ddi_soft_state_fini(&nvme_state);
                id_space_destroy(nvme_open_minors);
                mutex_destroy(&nvme_lc_mutex);
                list_destroy(&nvme_lost_cmds);
                bd_mod_fini(&nvme_dev_ops);
                mutex_destroy(&nvme_open_minors_mutex);
                avl_destroy(&nvme_open_minors_avl);
                taskq_destroy(nvme_dead_taskq);
        }

        return (error);
}

int
_fini(void)
{
        int error;

        if (!list_is_empty(&nvme_lost_cmds))
                return (DDI_FAILURE);

        error = mod_remove(&nvme_modlinkage);
        if (error == DDI_SUCCESS) {
                ddi_soft_state_fini(&nvme_state);
                id_space_destroy(nvme_open_minors);
                kmem_cache_destroy(nvme_cmd_cache);
                mutex_destroy(&nvme_lc_mutex);
                list_destroy(&nvme_lost_cmds);
                bd_mod_fini(&nvme_dev_ops);
                mutex_destroy(&nvme_open_minors_mutex);
                avl_destroy(&nvme_open_minors_avl);
                taskq_destroy(nvme_dead_taskq);
        }

        return (error);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
        ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

        /*LINTED: E_BAD_PTR_CAST_ALIGN*/
        ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
uint64_t 1282 nvme_get64(nvme_t *nvme, uintptr_t reg) 1283 { 1284 uint64_t val; 1285 1286 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); 1287 1288 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 1289 val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); 1290 1291 return (val); 1292 } 1293 1294 static inline uint32_t 1295 nvme_get32(nvme_t *nvme, uintptr_t reg) 1296 { 1297 uint32_t val; 1298 1299 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); 1300 1301 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 1302 val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); 1303 1304 return (val); 1305 } 1306 1307 static void 1308 nvme_mgmt_lock_fini(nvme_mgmt_lock_t *lock) 1309 { 1310 ASSERT3U(lock->nml_bd_own, ==, 0); 1311 mutex_destroy(&lock->nml_lock); 1312 cv_destroy(&lock->nml_cv); 1313 } 1314 1315 static void 1316 nvme_mgmt_lock_init(nvme_mgmt_lock_t *lock) 1317 { 1318 mutex_init(&lock->nml_lock, NULL, MUTEX_DRIVER, NULL); 1319 cv_init(&lock->nml_cv, NULL, CV_DRIVER, NULL); 1320 lock->nml_bd_own = 0; 1321 } 1322 1323 static void 1324 nvme_mgmt_unlock(nvme_t *nvme) 1325 { 1326 nvme_mgmt_lock_t *lock = &nvme->n_mgmt; 1327 1328 cv_broadcast(&lock->nml_cv); 1329 mutex_exit(&lock->nml_lock); 1330 } 1331 1332 static boolean_t 1333 nvme_mgmt_lock_held(const nvme_t *nvme) 1334 { 1335 return (MUTEX_HELD(&nvme->n_mgmt.nml_lock) != 0); 1336 } 1337 1338 static void 1339 nvme_mgmt_lock(nvme_t *nvme, nvme_mgmt_lock_level_t level) 1340 { 1341 nvme_mgmt_lock_t *lock = &nvme->n_mgmt; 1342 mutex_enter(&lock->nml_lock); 1343 while (lock->nml_bd_own != 0) { 1344 if (level == NVME_MGMT_LOCK_BDRO) 1345 break; 1346 cv_wait(&lock->nml_cv, &lock->nml_lock); 1347 } 1348 } 1349 1350 /* 1351 * This and nvme_mgmt_bd_end() are used to indicate that the driver is going to 1352 * be calling into a re-entrant blkdev related function. We cannot hold the lock 1353 * across such an operation and therefore must indicate that this is logically 1354 * held, while allowing other operations to proceed. This nvme_mgmt_bd_end() may 1355 * only be called by a thread that already holds the nmve_mgmt_lock(). 1356 */ 1357 static void 1358 nvme_mgmt_bd_start(nvme_t *nvme) 1359 { 1360 nvme_mgmt_lock_t *lock = &nvme->n_mgmt; 1361 1362 VERIFY(MUTEX_HELD(&lock->nml_lock)); 1363 VERIFY3U(lock->nml_bd_own, ==, 0); 1364 lock->nml_bd_own = (uintptr_t)curthread; 1365 mutex_exit(&lock->nml_lock); 1366 } 1367 1368 static void 1369 nvme_mgmt_bd_end(nvme_t *nvme) 1370 { 1371 nvme_mgmt_lock_t *lock = &nvme->n_mgmt; 1372 1373 mutex_enter(&lock->nml_lock); 1374 VERIFY3U(lock->nml_bd_own, ==, (uintptr_t)curthread); 1375 lock->nml_bd_own = 0; 1376 } 1377 1378 static boolean_t 1379 nvme_ns_state_check(const nvme_namespace_t *ns, nvme_ioctl_common_t *ioc, 1380 const nvme_ioctl_errno_t states[NVME_NS_NSTATES]) 1381 { 1382 VERIFY(nvme_mgmt_lock_held(ns->ns_nvme)); 1383 VERIFY3U(ns->ns_state, <, NVME_NS_NSTATES); 1384 1385 if (states[ns->ns_state] == NVME_IOCTL_E_OK) { 1386 return (B_TRUE); 1387 } 1388 1389 return (nvme_ioctl_error(ioc, states[ns->ns_state], 0, 0)); 1390 } 1391 1392 /* 1393 * This is a central clearing house for marking an NVMe controller dead and/or 1394 * removed. This takes care of setting the flag, taking care of outstanding 1395 * blocked locks, and sending a DDI FMA impact. This is called from a precarious 1396 * place where locking is suspect. The only guarantee we have is that the nvme_t 1397 * is valid and won't disappear until we return. 
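 *
 * A minimal sketch of the single-winner pattern used below, with the same
 * names as the code (casts and the handling of "removed" omitted):
 *
 *	was_dead = atomic_cas_32(&nvme->n_dead, B_FALSE, B_TRUE);
 *	if (was_dead)
 *		return;
 *
 * Only the caller that actually flips n_dead from B_FALSE to B_TRUE sends the
 * FMA service impact and dispatches nvme_rwlock_ctrl_dead() onto
 * nvme_dead_taskq; later callers see was_dead set and return early, which is
 * what makes this safe to call from multiple error paths concurrently.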
1398 */ 1399 static void 1400 nvme_ctrl_mark_dead(nvme_t *nvme, boolean_t removed) 1401 { 1402 boolean_t was_dead; 1403 1404 /* 1405 * See if we win the race to set things up here. If someone beat us to 1406 * it, we do not do anything. 1407 */ 1408 was_dead = atomic_cas_32((volatile uint32_t *)&nvme->n_dead, B_FALSE, 1409 B_TRUE); 1410 1411 /* 1412 * If we were removed, note this in our death status, regardless of 1413 * whether or not we were already dead. We need to know this so that we 1414 * can decide if it is safe to try and interact the the device in e.g. 1415 * reset and shutdown. 1416 */ 1417 if (removed) { 1418 nvme->n_dead_status = NVME_IOCTL_E_CTRL_GONE; 1419 } 1420 1421 if (was_dead) { 1422 return; 1423 } 1424 1425 /* 1426 * If this was removed, there is no reason to change the service impact. 1427 * Otherwise, we need to change our default return code to indicate that 1428 * the device is truly dead, and not simply gone. 1429 */ 1430 if (!removed) { 1431 ASSERT3U(nvme->n_dead_status, ==, NVME_IOCTL_E_CTRL_DEAD); 1432 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 1433 } 1434 1435 taskq_dispatch_ent(nvme_dead_taskq, nvme_rwlock_ctrl_dead, nvme, 1436 TQ_NOSLEEP, &nvme->n_dead_tqent); 1437 } 1438 1439 static boolean_t 1440 nvme_ctrl_is_gone(const nvme_t *nvme) 1441 { 1442 if (nvme->n_dead && nvme->n_dead_status == NVME_IOCTL_E_CTRL_GONE) 1443 return (B_TRUE); 1444 1445 return (B_FALSE); 1446 } 1447 1448 static boolean_t 1449 nvme_check_regs_hdl(nvme_t *nvme) 1450 { 1451 ddi_fm_error_t error; 1452 1453 ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); 1454 1455 if (error.fme_status != DDI_FM_OK) 1456 return (B_TRUE); 1457 1458 return (B_FALSE); 1459 } 1460 1461 static boolean_t 1462 nvme_check_dma_hdl(nvme_dma_t *dma) 1463 { 1464 ddi_fm_error_t error; 1465 1466 if (dma == NULL) 1467 return (B_FALSE); 1468 1469 ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); 1470 1471 if (error.fme_status != DDI_FM_OK) 1472 return (B_TRUE); 1473 1474 return (B_FALSE); 1475 } 1476 1477 static void 1478 nvme_free_dma_common(nvme_dma_t *dma) 1479 { 1480 if (dma->nd_dmah != NULL) 1481 (void) ddi_dma_unbind_handle(dma->nd_dmah); 1482 if (dma->nd_acch != NULL) 1483 ddi_dma_mem_free(&dma->nd_acch); 1484 if (dma->nd_dmah != NULL) 1485 ddi_dma_free_handle(&dma->nd_dmah); 1486 } 1487 1488 static void 1489 nvme_free_dma(nvme_dma_t *dma) 1490 { 1491 nvme_free_dma_common(dma); 1492 kmem_free(dma, sizeof (*dma)); 1493 } 1494 1495 static void 1496 nvme_prp_dma_destructor(void *buf, void *private __unused) 1497 { 1498 nvme_dma_t *dma = (nvme_dma_t *)buf; 1499 1500 nvme_free_dma_common(dma); 1501 } 1502 1503 static int 1504 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma, 1505 size_t len, uint_t flags, ddi_dma_attr_t *dma_attr) 1506 { 1507 if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, 1508 &dma->nd_dmah) != DDI_SUCCESS) { 1509 /* 1510 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and 1511 * the only other possible error is DDI_DMA_BADATTR which 1512 * indicates a driver bug which should cause a panic. 1513 */ 1514 dev_err(nvme->n_dip, CE_PANIC, 1515 "!failed to get DMA handle, check DMA attributes"); 1516 return (DDI_FAILURE); 1517 } 1518 1519 /* 1520 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified 1521 * or the flags are conflicting, which isn't the case here. 
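	 * The return value is therefore deliberately discarded with a (void)
	 * cast below rather than checked.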
1522 */ 1523 (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, 1524 DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, 1525 &dma->nd_len, &dma->nd_acch); 1526 1527 if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, 1528 dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, 1529 &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { 1530 dev_err(nvme->n_dip, CE_WARN, 1531 "!failed to bind DMA memory"); 1532 NVME_BUMP_STAT(nvme, dma_bind_err); 1533 nvme_free_dma_common(dma); 1534 return (DDI_FAILURE); 1535 } 1536 1537 return (DDI_SUCCESS); 1538 } 1539 1540 static int 1541 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, 1542 ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) 1543 { 1544 nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); 1545 1546 if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) != 1547 DDI_SUCCESS) { 1548 *ret = NULL; 1549 kmem_free(dma, sizeof (nvme_dma_t)); 1550 return (DDI_FAILURE); 1551 } 1552 1553 bzero(dma->nd_memp, dma->nd_len); 1554 1555 *ret = dma; 1556 return (DDI_SUCCESS); 1557 } 1558 1559 static int 1560 nvme_prp_dma_constructor(void *buf, void *private, int flags __unused) 1561 { 1562 nvme_dma_t *dma = (nvme_dma_t *)buf; 1563 nvme_t *nvme = (nvme_t *)private; 1564 1565 dma->nd_dmah = NULL; 1566 dma->nd_acch = NULL; 1567 1568 if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize, 1569 DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) { 1570 return (-1); 1571 } 1572 1573 ASSERT(dma->nd_ncookie == 1); 1574 1575 dma->nd_cached = B_TRUE; 1576 1577 return (0); 1578 } 1579 1580 static int 1581 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, 1582 uint_t flags, nvme_dma_t **dma) 1583 { 1584 uint32_t len = nentry * qe_len; 1585 ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; 1586 1587 len = roundup(len, nvme->n_pagesize); 1588 1589 if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) 1590 != DDI_SUCCESS) { 1591 dev_err(nvme->n_dip, CE_WARN, 1592 "!failed to get DMA memory for queue"); 1593 goto fail; 1594 } 1595 1596 if ((*dma)->nd_ncookie != 1) { 1597 dev_err(nvme->n_dip, CE_WARN, 1598 "!got too many cookies for queue DMA"); 1599 goto fail; 1600 } 1601 1602 return (DDI_SUCCESS); 1603 1604 fail: 1605 if (*dma) { 1606 nvme_free_dma(*dma); 1607 *dma = NULL; 1608 } 1609 1610 return (DDI_FAILURE); 1611 } 1612 1613 static void 1614 nvme_free_cq(nvme_cq_t *cq) 1615 { 1616 mutex_destroy(&cq->ncq_mutex); 1617 1618 if (cq->ncq_cmd_taskq != NULL) 1619 taskq_destroy(cq->ncq_cmd_taskq); 1620 1621 if (cq->ncq_dma != NULL) 1622 nvme_free_dma(cq->ncq_dma); 1623 1624 kmem_free(cq, sizeof (*cq)); 1625 } 1626 1627 static void 1628 nvme_free_qpair(nvme_qpair_t *qp) 1629 { 1630 int i; 1631 1632 mutex_destroy(&qp->nq_mutex); 1633 sema_destroy(&qp->nq_sema); 1634 1635 if (qp->nq_sqdma != NULL) 1636 nvme_free_dma(qp->nq_sqdma); 1637 1638 if (qp->nq_active_cmds > 0) 1639 for (i = 0; i != qp->nq_nentry; i++) 1640 if (qp->nq_cmd[i] != NULL) 1641 nvme_free_cmd(qp->nq_cmd[i]); 1642 1643 if (qp->nq_cmd != NULL) 1644 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); 1645 1646 kmem_free(qp, sizeof (nvme_qpair_t)); 1647 } 1648 1649 /* 1650 * Destroy the pre-allocated cq array, but only free individual completion 1651 * queues from the given starting index. 
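 * For example, the failure path of nvme_create_cq_array() below passes the
 * previous queue count as the starting index, so the queues that existed
 * before the attempted resize are left intact and the old array can be
 * restored.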
1652 */ 1653 static void 1654 nvme_destroy_cq_array(nvme_t *nvme, uint_t start) 1655 { 1656 uint_t i; 1657 1658 for (i = start; i < nvme->n_cq_count; i++) 1659 if (nvme->n_cq[i] != NULL) 1660 nvme_free_cq(nvme->n_cq[i]); 1661 1662 kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count); 1663 } 1664 1665 static int 1666 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx, 1667 uint_t nthr) 1668 { 1669 nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP); 1670 char name[64]; /* large enough for the taskq name */ 1671 1672 mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER, 1673 DDI_INTR_PRI(nvme->n_intr_pri)); 1674 1675 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), 1676 DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS) 1677 goto fail; 1678 1679 cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp; 1680 cq->ncq_nentry = nentry; 1681 cq->ncq_id = idx; 1682 cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx); 1683 1684 /* 1685 * Each completion queue has its own command taskq. 1686 */ 1687 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u", 1688 ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx); 1689 1690 cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX, 1691 TASKQ_PREPOPULATE); 1692 1693 if (cq->ncq_cmd_taskq == NULL) { 1694 dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd " 1695 "taskq for cq %u", idx); 1696 goto fail; 1697 } 1698 1699 *cqp = cq; 1700 return (DDI_SUCCESS); 1701 1702 fail: 1703 nvme_free_cq(cq); 1704 *cqp = NULL; 1705 1706 return (DDI_FAILURE); 1707 } 1708 1709 /* 1710 * Create the n_cq array big enough to hold "ncq" completion queues. 1711 * If the array already exists it will be re-sized (but only larger). 1712 * The admin queue is included in this array, which boosts the 1713 * max number of entries to UINT16_MAX + 1. 1714 */ 1715 static int 1716 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr) 1717 { 1718 nvme_cq_t **cq; 1719 uint_t i, cq_count; 1720 1721 ASSERT3U(ncq, >, nvme->n_cq_count); 1722 1723 cq = nvme->n_cq; 1724 cq_count = nvme->n_cq_count; 1725 1726 nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP); 1727 nvme->n_cq_count = ncq; 1728 1729 for (i = 0; i < cq_count; i++) 1730 nvme->n_cq[i] = cq[i]; 1731 1732 for (; i < nvme->n_cq_count; i++) 1733 if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) != 1734 DDI_SUCCESS) 1735 goto fail; 1736 1737 if (cq != NULL) 1738 kmem_free(cq, sizeof (*cq) * cq_count); 1739 1740 return (DDI_SUCCESS); 1741 1742 fail: 1743 nvme_destroy_cq_array(nvme, cq_count); 1744 /* 1745 * Restore the original array 1746 */ 1747 nvme->n_cq_count = cq_count; 1748 nvme->n_cq = cq; 1749 1750 return (DDI_FAILURE); 1751 } 1752 1753 static int 1754 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, 1755 uint_t idx) 1756 { 1757 nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); 1758 uint_t cq_idx; 1759 1760 mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, 1761 DDI_INTR_PRI(nvme->n_intr_pri)); 1762 1763 /* 1764 * The NVMe spec defines that a full queue has one empty (unused) slot; 1765 * initialize the semaphore accordingly. 1766 */ 1767 sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL); 1768 1769 if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), 1770 DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) 1771 goto fail; 1772 1773 /* 1774 * idx == 0 is adminq, those above 0 are shared io completion queues. 1775 */ 1776 cq_idx = idx == 0 ? 
0 : 1 + (idx - 1) % (nvme->n_cq_count - 1); 1777 qp->nq_cq = nvme->n_cq[cq_idx]; 1778 qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; 1779 qp->nq_nentry = nentry; 1780 1781 qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); 1782 1783 qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); 1784 qp->nq_next_cmd = 0; 1785 1786 *nqp = qp; 1787 return (DDI_SUCCESS); 1788 1789 fail: 1790 nvme_free_qpair(qp); 1791 *nqp = NULL; 1792 1793 return (DDI_FAILURE); 1794 } 1795 1796 /* 1797 * One might reasonably consider that the nvme_cmd_cache should have a cache 1798 * constructor and destructor that takes care of the mutex/cv init/destroy, and 1799 * that nvme_free_cmd should reset more fields such that allocation becomes 1800 * simpler. This is not currently implemented as: 1801 * - nvme_cmd_cache is a global cache, shared across nvme instances and 1802 * therefore there is no easy access to the corresponding nvme_t in the 1803 * constructor to determine the required interrupt priority. 1804 * - Most fields in nvme_cmd_t would need to be zeroed in nvme_free_cmd while 1805 * preserving the mutex/cv. It is easier to able to zero the entire 1806 * structure and then init the mutex/cv only in the unlikely event that we 1807 * want an admin command. 1808 */ 1809 static nvme_cmd_t * 1810 nvme_alloc_cmd(nvme_t *nvme, int kmflag) 1811 { 1812 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); 1813 1814 if (cmd != NULL) { 1815 bzero(cmd, sizeof (nvme_cmd_t)); 1816 cmd->nc_nvme = nvme; 1817 } 1818 1819 return (cmd); 1820 } 1821 1822 static nvme_cmd_t * 1823 nvme_alloc_admin_cmd(nvme_t *nvme, int kmflag) 1824 { 1825 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, kmflag); 1826 1827 if (cmd != NULL) { 1828 cmd->nc_flags |= NVME_CMD_F_USELOCK; 1829 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, 1830 DDI_INTR_PRI(nvme->n_intr_pri)); 1831 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); 1832 } 1833 1834 return (cmd); 1835 } 1836 1837 static void 1838 nvme_free_cmd(nvme_cmd_t *cmd) 1839 { 1840 /* Don't free commands on the lost commands list. */ 1841 if (list_link_active(&cmd->nc_list)) 1842 return; 1843 1844 if (cmd->nc_dma) { 1845 nvme_free_dma(cmd->nc_dma); 1846 cmd->nc_dma = NULL; 1847 } 1848 1849 if (cmd->nc_prp) { 1850 kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp); 1851 cmd->nc_prp = NULL; 1852 } 1853 1854 if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0) { 1855 cv_destroy(&cmd->nc_cv); 1856 mutex_destroy(&cmd->nc_mutex); 1857 } 1858 1859 kmem_cache_free(nvme_cmd_cache, cmd); 1860 } 1861 1862 static void 1863 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd, uint32_t *qtimeoutp) 1864 { 1865 sema_p(&qp->nq_sema); 1866 nvme_submit_cmd_common(qp, cmd, qtimeoutp); 1867 } 1868 1869 static int 1870 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) 1871 { 1872 if (cmd->nc_nvme->n_dead) { 1873 return (EIO); 1874 } 1875 1876 if (sema_tryp(&qp->nq_sema) == 0) 1877 return (EAGAIN); 1878 1879 nvme_submit_cmd_common(qp, cmd, NULL); 1880 return (0); 1881 } 1882 1883 /* 1884 * Common command submission routine. If `qtimeoutp` is not NULL then it will 1885 * be set to the sum of the timeouts of any active commands ahead of the one 1886 * being submitted. 1887 */ 1888 static void 1889 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd, uint32_t *qtimeoutp) 1890 { 1891 nvme_reg_sqtdbl_t tail = { 0 }; 1892 1893 /* 1894 * We don't need to take a lock on cmd since it is not yet enqueued. 
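	 * (Nothing else can reference the command until it is published in
	 * nq_cmd[] under nq_mutex below.)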
1895 */ 1896 cmd->nc_submit_ts = gethrtime(); 1897 cmd->nc_state = NVME_CMD_SUBMITTED; 1898 1899 mutex_enter(&qp->nq_mutex); 1900 1901 /* 1902 * Now that we hold the queue pair lock, we must check whether or not 1903 * the controller has been listed as dead (e.g. was removed due to 1904 * hotplug). This is necessary as otherwise we could race with 1905 * nvme_remove_callback(). Because this has not been enqueued, we don't 1906 * call nvme_unqueue_cmd(), which is why we must manually decrement the 1907 * semaphore. 1908 */ 1909 if (cmd->nc_nvme->n_dead) { 1910 cmd->nc_queue_ts = gethrtime(); 1911 cmd->nc_state = NVME_CMD_QUEUED; 1912 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback, 1913 cmd, TQ_NOSLEEP, &cmd->nc_tqent); 1914 sema_v(&qp->nq_sema); 1915 mutex_exit(&qp->nq_mutex); 1916 return; 1917 } 1918 1919 /* 1920 * Try to insert the cmd into the active cmd array at the nq_next_cmd 1921 * slot. If the slot is already occupied advance to the next slot and 1922 * try again. This can happen for long running commands like async event 1923 * requests. 1924 */ 1925 while (qp->nq_cmd[qp->nq_next_cmd] != NULL) 1926 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1927 qp->nq_cmd[qp->nq_next_cmd] = cmd; 1928 1929 /* 1930 * We keep track of the number of active commands in this queue, and 1931 * the sum of the timeouts for those active commands. 1932 */ 1933 qp->nq_active_cmds++; 1934 if (qtimeoutp != NULL) 1935 *qtimeoutp = qp->nq_active_timeout; 1936 qp->nq_active_timeout += cmd->nc_timeout; 1937 1938 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; 1939 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); 1940 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, 1941 sizeof (nvme_sqe_t) * qp->nq_sqtail, 1942 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); 1943 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; 1944 1945 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; 1946 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); 1947 1948 mutex_exit(&qp->nq_mutex); 1949 } 1950 1951 static nvme_cmd_t * 1952 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid) 1953 { 1954 nvme_cmd_t *cmd; 1955 1956 ASSERT(mutex_owned(&qp->nq_mutex)); 1957 ASSERT3S(cid, <, qp->nq_nentry); 1958 1959 cmd = qp->nq_cmd[cid]; 1960 /* 1961 * Some controllers will erroneously add things to the completion queue 1962 * for which there is no matching outstanding command. If this happens, 1963 * it is almost certainly a controller firmware bug since nq_mutex 1964 * is held across command submission and ringing the queue doorbell, 1965 * and is also held in this function. 1966 * 1967 * If we see such an unexpected command, there is not much we can do. 1968 * These will be logged and counted in nvme_get_completed(), but 1969 * otherwise ignored. 1970 */ 1971 if (cmd == NULL) 1972 return (NULL); 1973 qp->nq_cmd[cid] = NULL; 1974 ASSERT3U(qp->nq_active_cmds, >, 0); 1975 qp->nq_active_cmds--; 1976 ASSERT3U(qp->nq_active_timeout, >=, cmd->nc_timeout); 1977 qp->nq_active_timeout -= cmd->nc_timeout; 1978 sema_v(&qp->nq_sema); 1979 1980 ASSERT3P(cmd, !=, NULL); 1981 ASSERT3P(cmd->nc_nvme, ==, nvme); 1982 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid); 1983 1984 return (cmd); 1985 } 1986 1987 /* 1988 * This is called when an admin abort has failed to complete, once for the 1989 * original command and once for the abort itself. At this point the controller 1990 * has been marked dead. 
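 * (In this scenario both calls come from nvme_wait_cmd(): once for the abort
 * command when the double timeout is detected, and once for the original
 * command after the failed abort returns.)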
The commands are considered lost, de-queued if possible, and placed on a
 * global lost commands list so that they cannot be freed and so that any DMA
 * memory they have is not re-used.
 */
static void
nvme_lost_cmd(nvme_t *nvme, nvme_cmd_t *cmd)
{
	ASSERT(mutex_owned(&cmd->nc_mutex));

	switch (cmd->nc_state) {
	case NVME_CMD_SUBMITTED: {
		nvme_qpair_t *qp = nvme->n_ioq[cmd->nc_sqid];

		/*
		 * The command is still in the submitted state, meaning that we
		 * have not processed a completion queue entry for it. De-queue
		 * should be successful and if the hardware does later report
		 * completion we'll skip it as a command for which we aren't
		 * expecting a response (see nvme_unqueue_cmd()).
		 */
		mutex_enter(&qp->nq_mutex);
		(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
		mutex_exit(&qp->nq_mutex);
	}
	case NVME_CMD_ALLOCATED:
	case NVME_CMD_COMPLETED:
		/*
		 * If the command has not been submitted, or has completed,
		 * there is nothing to do here. In the event of an abort
		 * command timeout, we can end up here in the process of
		 * "losing" the original command. It's possible that command
		 * has actually completed (or been queued on the taskq) in the
		 * interim.
		 */
		break;
	case NVME_CMD_QUEUED:
		/*
		 * The command is on the taskq, awaiting callback. This should
		 * be fairly rapid so wait for completion.
		 */
		while (cmd->nc_state != NVME_CMD_COMPLETED)
			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
		break;
	case NVME_CMD_LOST:
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
		    "%s: command %p already lost", __func__, (void *)cmd);
		break;
	}

	cmd->nc_state = NVME_CMD_LOST;

	mutex_enter(&nvme_lc_mutex);
	list_insert_head(&nvme_lost_cmds, cmd);
	mutex_exit(&nvme_lc_mutex);
}

/*
 * Get the command tied to the next completed cqe and bump along completion
 * queue head counter.
 */
static nvme_cmd_t *
nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
{
	nvme_qpair_t *qp;
	nvme_cqe_t *cqe;
	nvme_cmd_t *cmd;

	ASSERT(mutex_owned(&cq->ncq_mutex));

retry:
	cqe = &cq->ncq_cq[cq->ncq_head];

	/* Check phase tag of CQE. Hardware inverts it for new entries. */
	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
		return (NULL);

	qp = nvme->n_ioq[cqe->cqe_sqid];

	mutex_enter(&qp->nq_mutex);
	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
	mutex_exit(&qp->nq_mutex);

	qp->nq_sqhead = cqe->cqe_sqhd;
	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;

	/* Toggle phase on wrap-around. */
	if (cq->ncq_head == 0)
		cq->ncq_phase = cq->ncq_phase != 0 ? 0 : 1;

	if (cmd == NULL) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!received completion for unknown cid 0x%x", cqe->cqe_cid);
		NVME_BUMP_STAT(nvme, unknown_cid);
		/*
		 * We want to ignore this unexpected completion entry as it
		 * is most likely a result of a bug in the controller firmware.
		 * However, if we return NULL, then callers will assume there
		 * are no more pending commands for this wakeup. Retry to keep
		 * enumerating commands until the phase tag indicates there are
		 * no more and we are really done.
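		 * (The retry loop is guaranteed to terminate: every pass
		 * advances ncq_head, and the phase-tag check at the top of the
		 * function returns NULL as soon as an entry no longer carries
		 * the inverted phase.)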
2090 */ 2091 goto retry; 2092 } 2093 2094 ASSERT3U(cmd->nc_sqid, ==, cqe->cqe_sqid); 2095 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); 2096 2097 return (cmd); 2098 } 2099 2100 /* 2101 * Process all completed commands on the io completion queue. 2102 */ 2103 static uint_t 2104 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq) 2105 { 2106 nvme_reg_cqhdbl_t head = { 0 }; 2107 nvme_cmd_t *cmd; 2108 uint_t completed = 0; 2109 2110 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 2111 DDI_SUCCESS) 2112 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 2113 __func__); 2114 2115 mutex_enter(&cq->ncq_mutex); 2116 2117 while ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 2118 /* 2119 * NVME_CMD_F_USELOCK is applied to all commands which are 2120 * going to be waited for by another thread in nvme_wait_cmd 2121 * and indicates that the lock should be taken before modifying 2122 * protected fields, and that the mutex has been initialised. 2123 * Commands which do not require the mutex to be held have not 2124 * initialised it (to reduce overhead). 2125 */ 2126 if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0) { 2127 mutex_enter(&cmd->nc_mutex); 2128 /* 2129 * The command could have been de-queued as lost while 2130 * we waited on the lock, in which case we drop it. 2131 */ 2132 if (cmd->nc_state == NVME_CMD_LOST) { 2133 mutex_exit(&cmd->nc_mutex); 2134 completed++; 2135 continue; 2136 } 2137 } 2138 cmd->nc_queue_ts = gethrtime(); 2139 cmd->nc_state = NVME_CMD_QUEUED; 2140 if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0) 2141 mutex_exit(&cmd->nc_mutex); 2142 taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd, 2143 TQ_NOSLEEP, &cmd->nc_tqent); 2144 2145 completed++; 2146 } 2147 2148 if (completed > 0) { 2149 /* 2150 * Update the completion queue head doorbell. 2151 */ 2152 head.b.cqhdbl_cqh = cq->ncq_head; 2153 nvme_put32(nvme, cq->ncq_hdbl, head.r); 2154 } 2155 2156 mutex_exit(&cq->ncq_mutex); 2157 2158 return (completed); 2159 } 2160 2161 static nvme_cmd_t * 2162 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) 2163 { 2164 nvme_cq_t *cq = qp->nq_cq; 2165 nvme_reg_cqhdbl_t head = { 0 }; 2166 nvme_cmd_t *cmd; 2167 2168 if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) != 2169 DDI_SUCCESS) 2170 dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s", 2171 __func__); 2172 2173 mutex_enter(&cq->ncq_mutex); 2174 2175 if ((cmd = nvme_get_completed(nvme, cq)) != NULL) { 2176 head.b.cqhdbl_cqh = cq->ncq_head; 2177 nvme_put32(nvme, cq->ncq_hdbl, head.r); 2178 } 2179 2180 mutex_exit(&cq->ncq_mutex); 2181 2182 return (cmd); 2183 } 2184 2185 static int 2186 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) 2187 { 2188 nvme_cqe_t *cqe = &cmd->nc_cqe; 2189 2190 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 2191 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 2192 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 2193 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 2194 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 2195 2196 if (cmd->nc_xfer != NULL) 2197 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 2198 2199 /* 2200 * User commands should never cause us to mark the controller dead. 2201 * Though whether we ever should mark it dead as there currently isn't a 2202 * useful recovery path is another question. 
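	 * (In practice the check below only marks the controller dead when the
	 * command was not flagged NVME_CMD_F_DONTPANIC, which is set for
	 * user-initiated commands among others, and strict version checking is
	 * enabled.)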
2203 */ 2204 if (((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) && 2205 cmd->nc_nvme->n_strict_version) { 2206 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE); 2207 } 2208 2209 return (EIO); 2210 } 2211 2212 static int 2213 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd) 2214 { 2215 nvme_cqe_t *cqe = &cmd->nc_cqe; 2216 2217 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 2218 "!unknown command status received: opc = %x, sqid = %d, cid = %d, " 2219 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc, 2220 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, 2221 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); 2222 if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) { 2223 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE); 2224 } 2225 2226 return (EIO); 2227 } 2228 2229 static int 2230 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd) 2231 { 2232 nvme_cqe_t *cqe = &cmd->nc_cqe; 2233 2234 switch (cqe->cqe_sf.sf_sc) { 2235 case NVME_CQE_SC_INT_NVM_WRITE: 2236 /* write fail */ 2237 /* TODO: post ereport */ 2238 if (cmd->nc_xfer != NULL) 2239 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 2240 return (EIO); 2241 2242 case NVME_CQE_SC_INT_NVM_READ: 2243 /* read fail */ 2244 /* TODO: post ereport */ 2245 if (cmd->nc_xfer != NULL) 2246 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 2247 return (EIO); 2248 2249 default: 2250 return (nvme_check_unknown_cmd_status(cmd)); 2251 } 2252 } 2253 2254 static int 2255 nvme_check_generic_cmd_status(nvme_cmd_t *cmd) 2256 { 2257 nvme_cqe_t *cqe = &cmd->nc_cqe; 2258 2259 switch (cqe->cqe_sf.sf_sc) { 2260 case NVME_CQE_SC_GEN_SUCCESS: 2261 return (0); 2262 2263 /* 2264 * Errors indicating a bug in the driver should cause a panic. 2265 */ 2266 case NVME_CQE_SC_GEN_INV_OPC: 2267 /* Invalid Command Opcode */ 2268 NVME_BUMP_STAT(cmd->nc_nvme, inv_cmd_err); 2269 if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) { 2270 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 2271 "programming error: invalid opcode in cmd %p", 2272 (void *)cmd); 2273 } 2274 return (EINVAL); 2275 2276 case NVME_CQE_SC_GEN_INV_FLD: 2277 /* Invalid Field in Command */ 2278 NVME_BUMP_STAT(cmd->nc_nvme, inv_field_err); 2279 if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) { 2280 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 2281 "programming error: invalid field in cmd %p", 2282 (void *)cmd); 2283 } 2284 return (EIO); 2285 2286 case NVME_CQE_SC_GEN_ID_CNFL: 2287 /* Command ID Conflict */ 2288 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 2289 "cmd ID conflict in cmd %p", (void *)cmd); 2290 return (0); 2291 2292 case NVME_CQE_SC_GEN_INV_NS: 2293 /* Invalid Namespace or Format */ 2294 NVME_BUMP_STAT(cmd->nc_nvme, inv_nsfmt_err); 2295 if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) { 2296 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 2297 "programming error: invalid NS/format in cmd %p", 2298 (void *)cmd); 2299 } 2300 return (EINVAL); 2301 2302 case NVME_CQE_SC_GEN_CMD_SEQ_ERR: 2303 /* 2304 * Command Sequence Error 2305 * 2306 * This can be generated normally by user log page requests that 2307 * come out of order (e.g. getting the persistent event log 2308 * without establishing the context). If the kernel manages this 2309 * on its own then that's problematic. 
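	 * (As in the cases above, NVME_CMD_F_DONTPANIC downgrades the panic to
	 * a plain EINVAL return for such requests.)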
2310 */ 2311 NVME_BUMP_STAT(cmd->nc_nvme, inv_cmdseq_err); 2312 if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) { 2313 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, 2314 "programming error: command sequencing error %p", 2315 (void *)cmd); 2316 } 2317 return (EINVAL); 2318 2319 case NVME_CQE_SC_GEN_NVM_LBA_RANGE: 2320 /* LBA Out Of Range */ 2321 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 2322 "LBA out of range in cmd %p", (void *)cmd); 2323 return (0); 2324 2325 /* 2326 * Non-fatal errors, handle gracefully. 2327 */ 2328 case NVME_CQE_SC_GEN_DATA_XFR_ERR: 2329 /* Data Transfer Error (DMA) */ 2330 /* TODO: post ereport */ 2331 NVME_BUMP_STAT(cmd->nc_nvme, data_xfr_err); 2332 if (cmd->nc_xfer != NULL) 2333 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 2334 return (EIO); 2335 2336 case NVME_CQE_SC_GEN_INTERNAL_ERR: 2337 /* 2338 * Internal Error. The spec (v1.0, section 4.5.1.2) says 2339 * detailed error information is returned as async event, 2340 * so we pretty much ignore the error here and handle it 2341 * in the async event handler. 2342 */ 2343 NVME_BUMP_STAT(cmd->nc_nvme, internal_err); 2344 if (cmd->nc_xfer != NULL) 2345 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 2346 return (EIO); 2347 2348 case NVME_CQE_SC_GEN_ABORT_REQUEST: 2349 /* 2350 * Command Abort Requested. This normally happens only when a 2351 * command times out. 2352 */ 2353 /* TODO: post ereport or change blkdev to handle this? */ 2354 NVME_BUMP_STAT(cmd->nc_nvme, abort_rq_err); 2355 return (ECANCELED); 2356 2357 case NVME_CQE_SC_GEN_ABORT_PWRLOSS: 2358 /* Command Aborted due to Power Loss Notification */ 2359 NVME_BUMP_STAT(cmd->nc_nvme, abort_pwrloss_err); 2360 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE); 2361 return (EIO); 2362 2363 case NVME_CQE_SC_GEN_ABORT_SQ_DEL: 2364 /* Command Aborted due to SQ Deletion */ 2365 NVME_BUMP_STAT(cmd->nc_nvme, abort_sq_del); 2366 return (EIO); 2367 2368 case NVME_CQE_SC_GEN_NVM_CAP_EXC: 2369 /* Capacity Exceeded */ 2370 NVME_BUMP_STAT(cmd->nc_nvme, nvm_cap_exc); 2371 if (cmd->nc_xfer != NULL) 2372 bd_error(cmd->nc_xfer, BD_ERR_MEDIA); 2373 return (EIO); 2374 2375 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: 2376 /* Namespace Not Ready */ 2377 NVME_BUMP_STAT(cmd->nc_nvme, nvm_ns_notrdy); 2378 if (cmd->nc_xfer != NULL) 2379 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 2380 return (EIO); 2381 2382 case NVME_CQE_SC_GEN_NVM_FORMATTING: 2383 /* Format in progress (1.2) */ 2384 if (!NVME_VERSION_ATLEAST(&cmd->nc_nvme->n_version, 1, 2)) 2385 return (nvme_check_unknown_cmd_status(cmd)); 2386 NVME_BUMP_STAT(cmd->nc_nvme, nvm_ns_formatting); 2387 if (cmd->nc_xfer != NULL) 2388 bd_error(cmd->nc_xfer, BD_ERR_NTRDY); 2389 return (EIO); 2390 2391 default: 2392 return (nvme_check_unknown_cmd_status(cmd)); 2393 } 2394 } 2395 2396 static int 2397 nvme_check_specific_cmd_status(nvme_cmd_t *cmd) 2398 { 2399 nvme_cqe_t *cqe = &cmd->nc_cqe; 2400 2401 switch (cqe->cqe_sf.sf_sc) { 2402 case NVME_CQE_SC_SPC_INV_CQ: 2403 /* Completion Queue Invalid */ 2404 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); 2405 NVME_BUMP_STAT(cmd->nc_nvme, inv_cq_err); 2406 return (EINVAL); 2407 2408 case NVME_CQE_SC_SPC_INV_QID: 2409 /* Invalid Queue Identifier */ 2410 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 2411 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || 2412 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || 2413 cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 2414 NVME_BUMP_STAT(cmd->nc_nvme, inv_qid_err); 2415 return (EINVAL); 2416 2417 case NVME_CQE_SC_SPC_MAX_QSZ_EXC: 2418 /* Max Queue Size Exceeded */ 2419 
ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || 2420 cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 2421 NVME_BUMP_STAT(cmd->nc_nvme, max_qsz_exc); 2422 return (EINVAL); 2423 2424 case NVME_CQE_SC_SPC_ABRT_CMD_EXC: 2425 /* Abort Command Limit Exceeded */ 2426 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); 2427 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 2428 "abort command limit exceeded in cmd %p", (void *)cmd); 2429 return (0); 2430 2431 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: 2432 /* Async Event Request Limit Exceeded */ 2433 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); 2434 dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " 2435 "async event request limit exceeded in cmd %p", 2436 (void *)cmd); 2437 return (0); 2438 2439 case NVME_CQE_SC_SPC_INV_INT_VECT: 2440 /* Invalid Interrupt Vector */ 2441 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); 2442 NVME_BUMP_STAT(cmd->nc_nvme, inv_int_vect); 2443 return (EINVAL); 2444 2445 case NVME_CQE_SC_SPC_INV_LOG_PAGE: 2446 /* Invalid Log Page */ 2447 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); 2448 NVME_BUMP_STAT(cmd->nc_nvme, inv_log_page); 2449 return (EINVAL); 2450 2451 case NVME_CQE_SC_SPC_INV_FORMAT: 2452 /* Invalid Format */ 2453 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT || 2454 cmd->nc_sqe.sqe_opc == NVME_OPC_NS_MGMT); 2455 NVME_BUMP_STAT(cmd->nc_nvme, inv_format); 2456 if (cmd->nc_xfer != NULL) 2457 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 2458 return (EINVAL); 2459 2460 case NVME_CQE_SC_SPC_INV_Q_DEL: 2461 /* Invalid Queue Deletion */ 2462 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); 2463 NVME_BUMP_STAT(cmd->nc_nvme, inv_q_del); 2464 return (EINVAL); 2465 2466 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: 2467 /* Conflicting Attributes */ 2468 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || 2469 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 2470 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 2471 NVME_BUMP_STAT(cmd->nc_nvme, cnfl_attr); 2472 if (cmd->nc_xfer != NULL) 2473 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 2474 return (EINVAL); 2475 2476 case NVME_CQE_SC_SPC_NVM_INV_PROT: 2477 /* Invalid Protection Information */ 2478 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || 2479 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || 2480 cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 2481 NVME_BUMP_STAT(cmd->nc_nvme, inv_prot); 2482 if (cmd->nc_xfer != NULL) 2483 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 2484 return (EINVAL); 2485 2486 case NVME_CQE_SC_SPC_NVM_READONLY: 2487 /* Write to Read Only Range */ 2488 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); 2489 NVME_BUMP_STAT(cmd->nc_nvme, readonly); 2490 if (cmd->nc_xfer != NULL) 2491 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); 2492 return (EROFS); 2493 2494 case NVME_CQE_SC_SPC_INV_FW_SLOT: 2495 /* Invalid Firmware Slot */ 2496 NVME_BUMP_STAT(cmd->nc_nvme, inv_fwslot); 2497 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2498 return (EINVAL); 2499 2500 case NVME_CQE_SC_SPC_INV_FW_IMG: 2501 /* Invalid Firmware Image */ 2502 NVME_BUMP_STAT(cmd->nc_nvme, inv_fwimg); 2503 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2504 return (EINVAL); 2505 2506 case NVME_CQE_SC_SPC_FW_RESET: 2507 /* Conventional Reset Required */ 2508 NVME_BUMP_STAT(cmd->nc_nvme, fwact_creset); 2509 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2510 return (0); 2511 2512 case NVME_CQE_SC_SPC_FW_NSSR: 2513 /* NVMe Subsystem Reset Required */ 2514 NVME_BUMP_STAT(cmd->nc_nvme, fwact_nssr); 2515 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2516 return 
(0); 2517 2518 case NVME_CQE_SC_SPC_FW_NEXT_RESET: 2519 /* Activation Requires Reset */ 2520 NVME_BUMP_STAT(cmd->nc_nvme, fwact_reset); 2521 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2522 return (0); 2523 2524 case NVME_CQE_SC_SPC_FW_MTFA: 2525 /* Activation Requires Maximum Time Violation */ 2526 NVME_BUMP_STAT(cmd->nc_nvme, fwact_mtfa); 2527 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2528 return (EAGAIN); 2529 2530 case NVME_CQE_SC_SPC_FW_PROHIBITED: 2531 /* Activation Prohibited */ 2532 NVME_BUMP_STAT(cmd->nc_nvme, fwact_prohibited); 2533 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2534 return (EINVAL); 2535 2536 case NVME_CQE_SC_SPC_FW_OVERLAP: 2537 /* Overlapping Firmware Ranges */ 2538 NVME_BUMP_STAT(cmd->nc_nvme, fw_overlap); 2539 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD || 2540 cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE); 2541 return (EINVAL); 2542 2543 case NVME_CQE_SC_SPC_NS_ATTACHED: 2544 /* Namespace Already Attached */ 2545 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2546 NVME_BUMP_STAT(cmd->nc_nvme, ns_attached); 2547 return (EEXIST); 2548 2549 case NVME_CQE_SC_SPC_NS_PRIV: 2550 /* Namespace Is Private */ 2551 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2552 NVME_BUMP_STAT(cmd->nc_nvme, ns_priv); 2553 return (EACCES); 2554 2555 case NVME_CQE_SC_SPC_NS_NOT_ATTACH: 2556 /* Namespace Not Attached */ 2557 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2558 NVME_BUMP_STAT(cmd->nc_nvme, ns_not_attached); 2559 return (ENOENT); 2560 2561 case NVME_CQE_SC_SPC_INV_CTRL_LIST: 2562 /* Controller List Invalid */ 2563 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2564 NVME_BUMP_STAT(cmd->nc_nvme, ana_attach); 2565 return (EINVAL); 2566 2567 case NVME_CQE_SC_SPC_ANA_ATTACH: 2568 /* ANA Attach Failed */ 2569 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2570 NVME_BUMP_STAT(cmd->nc_nvme, ana_attach); 2571 return (EIO); 2572 2573 case NVME_CQE_SC_SPC_NS_ATTACH_LIM: 2574 /* Namespace Attachment Limit Exceeded */ 2575 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH); 2576 NVME_BUMP_STAT(cmd->nc_nvme, ns_attach_lim); 2577 return (EOVERFLOW); 2578 2579 default: 2580 return (nvme_check_unknown_cmd_status(cmd)); 2581 } 2582 } 2583 2584 static inline int 2585 nvme_check_cmd_status(nvme_cmd_t *cmd) 2586 { 2587 nvme_cqe_t *cqe = &cmd->nc_cqe; 2588 2589 /* 2590 * Take a shortcut if the controller is dead, or if 2591 * command status indicates no error. 2592 */ 2593 if (cmd->nc_nvme->n_dead) 2594 return (EIO); 2595 2596 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2597 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 2598 return (0); 2599 2600 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) 2601 return (nvme_check_generic_cmd_status(cmd)); 2602 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) 2603 return (nvme_check_specific_cmd_status(cmd)); 2604 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) 2605 return (nvme_check_integrity_cmd_status(cmd)); 2606 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) 2607 return (nvme_check_vendor_cmd_status(cmd)); 2608 2609 return (nvme_check_unknown_cmd_status(cmd)); 2610 } 2611 2612 /* 2613 * Check the command status as used by an ioctl path and do not convert it to an 2614 * errno. We still allow all the command status checking to occur, but otherwise 2615 * will pass back the controller error as is. 
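 * The per-category checkers are still invoked below for their side effects
 * (statistics, logging and possibly marking the controller dead), but their
 * errno return values are discarded and the raw sct/sc pair is handed back
 * via nvme_ioctl_error() instead.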
2616 */ 2617 static boolean_t 2618 nvme_check_cmd_status_ioctl(nvme_cmd_t *cmd, nvme_ioctl_common_t *ioc) 2619 { 2620 nvme_cqe_t *cqe = &cmd->nc_cqe; 2621 nvme_t *nvme = cmd->nc_nvme; 2622 2623 if (nvme->n_dead) { 2624 return (nvme_ioctl_error(ioc, nvme->n_dead_status, 0, 0)); 2625 } 2626 2627 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2628 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS) 2629 return (B_TRUE); 2630 2631 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) { 2632 (void) nvme_check_generic_cmd_status(cmd); 2633 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) { 2634 (void) nvme_check_specific_cmd_status(cmd); 2635 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) { 2636 (void) nvme_check_integrity_cmd_status(cmd); 2637 } else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) { 2638 (void) nvme_check_vendor_cmd_status(cmd); 2639 } else { 2640 (void) nvme_check_unknown_cmd_status(cmd); 2641 } 2642 2643 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_ERROR, 2644 cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_sc)); 2645 } 2646 2647 static int 2648 nvme_abort_cmd(nvme_cmd_t *cmd, const uint32_t sec) 2649 { 2650 nvme_t *nvme = cmd->nc_nvme; 2651 nvme_cmd_t *abort_cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 2652 nvme_abort_cmd_t ac = { 0 }; 2653 int ret = 0; 2654 2655 sema_p(&nvme->n_abort_sema); 2656 2657 ac.b.ac_cid = cmd->nc_sqe.sqe_cid; 2658 ac.b.ac_sqid = cmd->nc_sqid; 2659 2660 abort_cmd->nc_sqid = 0; 2661 abort_cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT; 2662 abort_cmd->nc_callback = nvme_wakeup_cmd; 2663 abort_cmd->nc_sqe.sqe_cdw10 = ac.r; 2664 2665 /* 2666 * Send the ABORT to the hardware. The ABORT command will return _after_ 2667 * the aborted command has completed (aborted or otherwise) so we must 2668 * drop the aborted command's lock to allow it to complete. 2669 * We want to allow at least `nvme_abort_cmd_timeout` seconds for the 2670 * abort to be processed, but more if we are aborting a long-running 2671 * command to give that time to complete/abort too. 2672 */ 2673 mutex_exit(&cmd->nc_mutex); 2674 nvme_admin_cmd(abort_cmd, MAX(nvme_abort_cmd_timeout, sec)); 2675 mutex_enter(&cmd->nc_mutex); 2676 2677 sema_v(&nvme->n_abort_sema); 2678 2679 /* BEGIN CSTYLED */ 2680 /* 2681 * If the abort command itself has timed out, it will have been 2682 * de-queued so that its callback will not be called after this point, 2683 * and its state will be NVME_CMD_LOST. 2684 * 2685 * nvme_admin_cmd(abort_cmd) 2686 * -> nvme_wait_cmd(abort_cmd) 2687 * -> nvme_cmd(abort_cmd) 2688 * | -> nvme_admin_cmd(cmd) 2689 * | -> nvme_wait_cmd(cmd) 2690 * | -> nvme_ctrl_mark_dead() 2691 * | -> nvme_lost_cmd(cmd) 2692 * | -> cmd->nc_stat = NVME_CMD_LOST 2693 * and here we are. 2694 */ 2695 /* END CSTYLED */ 2696 if (abort_cmd->nc_state == NVME_CMD_LOST) { 2697 dev_err(nvme->n_dip, CE_WARN, 2698 "!ABORT of command %d/%d timed out", 2699 cmd->nc_sqe.sqe_cid, cmd->nc_sqid); 2700 NVME_BUMP_STAT(nvme, abort_timeout); 2701 ret = EIO; 2702 } else if ((ret = nvme_check_cmd_status(abort_cmd)) != 0) { 2703 dev_err(nvme->n_dip, CE_WARN, 2704 "!ABORT of command %d/%d " 2705 "failed with sct = %x, sc = %x", 2706 cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 2707 abort_cmd->nc_cqe.cqe_sf.sf_sct, 2708 abort_cmd->nc_cqe.cqe_sf.sf_sc); 2709 NVME_BUMP_STAT(nvme, abort_failed); 2710 } else { 2711 boolean_t success = ((abort_cmd->nc_cqe.cqe_dw0 & 1) == 0); 2712 2713 dev_err(nvme->n_dip, CE_WARN, 2714 "!ABORT of command %d/%d %ssuccessful", 2715 cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 2716 success ? 
"" : "un"); 2717 2718 if (success) { 2719 NVME_BUMP_STAT(nvme, abort_successful); 2720 } else { 2721 NVME_BUMP_STAT(nvme, abort_unsuccessful); 2722 } 2723 } 2724 2725 /* 2726 * This abort abort_cmd has either completed or been de-queued as 2727 * lost in nvme_wait_cmd. Either way it's safe to free it here. 2728 */ 2729 nvme_free_cmd(abort_cmd); 2730 2731 return (ret); 2732 } 2733 2734 /* 2735 * nvme_wait_cmd -- wait for command completion or timeout 2736 * 2737 * In case of a serious error or a timeout of the abort command the hardware 2738 * will be declared dead and FMA will be notified. 2739 */ 2740 static void 2741 nvme_wait_cmd(nvme_cmd_t *cmd, uint32_t sec) 2742 { 2743 nvme_t *nvme = cmd->nc_nvme; 2744 nvme_reg_csts_t csts; 2745 2746 ASSERT(mutex_owned(&cmd->nc_mutex)); 2747 2748 while (cmd->nc_state != NVME_CMD_COMPLETED) { 2749 clock_t timeout = ddi_get_lbolt() + 2750 drv_usectohz((long)sec * MICROSEC); 2751 2752 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) { 2753 /* 2754 * If this command is on the task queue then we don't 2755 * consider it to have timed out. We are waiting for 2756 * the callback to be invoked, the timing of which can 2757 * be affected by system load and should not count 2758 * against the device; continue to wait. 2759 * While this doesn't help deal with the possibility of 2760 * a command timing out between being placed on the CQ 2761 * and arriving on the taskq, we expect interrupts to 2762 * run fairly promptly making this a small window. 2763 */ 2764 if (cmd->nc_state != NVME_CMD_QUEUED) 2765 break; 2766 } 2767 } 2768 2769 if (cmd->nc_state == NVME_CMD_COMPLETED) { 2770 DTRACE_PROBE1(nvme_admin_cmd_completed, nvme_cmd_t *, cmd); 2771 nvme_admin_stat_cmd(nvme, cmd); 2772 return; 2773 } 2774 2775 /* 2776 * The command timed out. 2777 */ 2778 2779 DTRACE_PROBE1(nvme_admin_cmd_timeout, nvme_cmd_t *, cmd); 2780 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2781 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, " 2782 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid, 2783 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); 2784 NVME_BUMP_STAT(nvme, cmd_timeout); 2785 2786 /* 2787 * Check controller for fatal status, any errors associated with the 2788 * register or DMA handle, or for a double timeout (abort command timed 2789 * out). If necessary log a warning and call FMA. 2790 */ 2791 if (csts.b.csts_cfs || 2792 nvme_check_regs_hdl(nvme) || 2793 nvme_check_dma_hdl(cmd->nc_dma) || 2794 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { 2795 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE); 2796 nvme_lost_cmd(nvme, cmd); 2797 return; 2798 } 2799 2800 /* Issue an abort for the command that has timed out */ 2801 if (nvme_abort_cmd(cmd, sec) == 0) { 2802 /* 2803 * If the abort completed, whether or not it was 2804 * successful in aborting the command, that command 2805 * will also have completed with an appropriate 2806 * status. 2807 */ 2808 while (cmd->nc_state != NVME_CMD_COMPLETED) 2809 cv_wait(&cmd->nc_cv, &cmd->nc_mutex); 2810 return; 2811 } 2812 2813 /* 2814 * Otherwise, the abort has also timed out or failed, which 2815 * will have marked the controller dead. De-queue the original command 2816 * and add it to the lost commands list. 
2817 */ 2818 VERIFY(cmd->nc_nvme->n_dead); 2819 nvme_lost_cmd(nvme, cmd); 2820 } 2821 2822 static void 2823 nvme_wakeup_cmd(void *arg) 2824 { 2825 nvme_cmd_t *cmd = arg; 2826 2827 ASSERT(cmd->nc_flags & NVME_CMD_F_USELOCK); 2828 2829 mutex_enter(&cmd->nc_mutex); 2830 cmd->nc_state = NVME_CMD_COMPLETED; 2831 cv_signal(&cmd->nc_cv); 2832 mutex_exit(&cmd->nc_mutex); 2833 } 2834 2835 static void 2836 nvme_async_event_task(void *arg) 2837 { 2838 nvme_cmd_t *cmd = arg; 2839 nvme_t *nvme = cmd->nc_nvme; 2840 nvme_error_log_entry_t *error_log = NULL; 2841 nvme_health_log_t *health_log = NULL; 2842 nvme_nschange_list_t *nslist = NULL; 2843 size_t logsize = 0; 2844 nvme_async_event_t event; 2845 2846 /* 2847 * Check for errors associated with the async request itself. The only 2848 * command-specific error is "async event limit exceeded", which 2849 * indicates a programming error in the driver and causes a panic in 2850 * nvme_check_cmd_status(). 2851 * 2852 * Other possible errors are various scenarios where the async request 2853 * was aborted, or internal errors in the device. Internal errors are 2854 * reported to FMA, the command aborts need no special handling here. 2855 * 2856 * And finally, at least qemu nvme does not support async events, 2857 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we 2858 * will avoid posting async events. 2859 */ 2860 2861 if (nvme_check_cmd_status(cmd) != 0) { 2862 dev_err(cmd->nc_nvme->n_dip, CE_WARN, 2863 "!async event request returned failure, sct = 0x%x, " 2864 "sc = 0x%x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, 2865 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, 2866 cmd->nc_cqe.cqe_sf.sf_m); 2867 2868 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2869 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { 2870 nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE); 2871 } 2872 2873 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 2874 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC && 2875 cmd->nc_cqe.cqe_sf.sf_dnr == 1) { 2876 nvme->n_async_event_supported = B_FALSE; 2877 } 2878 2879 nvme_free_cmd(cmd); 2880 return; 2881 } 2882 2883 event.r = cmd->nc_cqe.cqe_dw0; 2884 2885 /* Clear CQE and re-submit the async request. 
*/ 2886 bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); 2887 nvme_submit_admin_cmd(nvme->n_adminq, cmd, NULL); 2888 cmd = NULL; /* cmd can no longer be used after resubmission */ 2889 2890 switch (event.b.ae_type) { 2891 case NVME_ASYNC_TYPE_ERROR: 2892 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { 2893 if (!nvme_get_logpage_int(nvme, B_FALSE, 2894 (void **)&error_log, &logsize, 2895 NVME_LOGPAGE_ERROR)) { 2896 return; 2897 } 2898 } else { 2899 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 2900 "async event reply: type=0x%x logpage=0x%x", 2901 event.b.ae_type, event.b.ae_logpage); 2902 NVME_BUMP_STAT(nvme, wrong_logpage); 2903 return; 2904 } 2905 2906 switch (event.b.ae_info) { 2907 case NVME_ASYNC_ERROR_INV_SQ: 2908 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 2909 "invalid submission queue"); 2910 return; 2911 2912 case NVME_ASYNC_ERROR_INV_DBL: 2913 dev_err(nvme->n_dip, CE_PANIC, "programming error: " 2914 "invalid doorbell write value"); 2915 return; 2916 2917 case NVME_ASYNC_ERROR_DIAGFAIL: 2918 dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); 2919 nvme_ctrl_mark_dead(nvme, B_FALSE); 2920 NVME_BUMP_STAT(nvme, diagfail_event); 2921 break; 2922 2923 case NVME_ASYNC_ERROR_PERSISTENT: 2924 dev_err(nvme->n_dip, CE_WARN, "!persistent internal " 2925 "device error"); 2926 nvme_ctrl_mark_dead(nvme, B_FALSE); 2927 NVME_BUMP_STAT(nvme, persistent_event); 2928 break; 2929 2930 case NVME_ASYNC_ERROR_TRANSIENT: 2931 dev_err(nvme->n_dip, CE_WARN, "!transient internal " 2932 "device error"); 2933 /* TODO: send ereport */ 2934 NVME_BUMP_STAT(nvme, transient_event); 2935 break; 2936 2937 case NVME_ASYNC_ERROR_FW_LOAD: 2938 dev_err(nvme->n_dip, CE_WARN, 2939 "!firmware image load error"); 2940 NVME_BUMP_STAT(nvme, fw_load_event); 2941 break; 2942 } 2943 break; 2944 2945 case NVME_ASYNC_TYPE_HEALTH: 2946 if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { 2947 if (!nvme_get_logpage_int(nvme, B_FALSE, 2948 (void **)&health_log, &logsize, 2949 NVME_LOGPAGE_HEALTH)) { 2950 return; 2951 } 2952 } else { 2953 dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " 2954 "type=0x%x logpage=0x%x", event.b.ae_type, 2955 event.b.ae_logpage); 2956 NVME_BUMP_STAT(nvme, wrong_logpage); 2957 return; 2958 } 2959 2960 switch (event.b.ae_info) { 2961 case NVME_ASYNC_HEALTH_RELIABILITY: 2962 dev_err(nvme->n_dip, CE_WARN, 2963 "!device reliability compromised"); 2964 /* TODO: send ereport */ 2965 NVME_BUMP_STAT(nvme, reliability_event); 2966 break; 2967 2968 case NVME_ASYNC_HEALTH_TEMPERATURE: 2969 dev_err(nvme->n_dip, CE_WARN, 2970 "!temperature above threshold"); 2971 /* TODO: send ereport */ 2972 NVME_BUMP_STAT(nvme, temperature_event); 2973 break; 2974 2975 case NVME_ASYNC_HEALTH_SPARE: 2976 dev_err(nvme->n_dip, CE_WARN, 2977 "!spare space below threshold"); 2978 /* TODO: send ereport */ 2979 NVME_BUMP_STAT(nvme, spare_event); 2980 break; 2981 } 2982 break; 2983 2984 case NVME_ASYNC_TYPE_NOTICE: 2985 switch (event.b.ae_info) { 2986 case NVME_ASYNC_NOTICE_NS_CHANGE: 2987 if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) { 2988 dev_err(nvme->n_dip, CE_WARN, 2989 "!wrong logpage in async event reply: " 2990 "type=0x%x logpage=0x%x", 2991 event.b.ae_type, event.b.ae_logpage); 2992 NVME_BUMP_STAT(nvme, wrong_logpage); 2993 break; 2994 } 2995 2996 dev_err(nvme->n_dip, CE_NOTE, 2997 "namespace attribute change event, " 2998 "logpage = 0x%x", event.b.ae_logpage); 2999 NVME_BUMP_STAT(nvme, notice_event); 3000 3001 if (!nvme_get_logpage_int(nvme, B_FALSE, 3002 (void **)&nslist, &logsize, 3003 NVME_LOGPAGE_NSCHANGE)) { 3004 
break; 3005 } 3006 3007 if (nslist->nscl_ns[0] == UINT32_MAX) { 3008 dev_err(nvme->n_dip, CE_CONT, 3009 "more than %u namespaces have changed.\n", 3010 NVME_NSCHANGE_LIST_SIZE); 3011 break; 3012 } 3013 3014 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 3015 for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) { 3016 uint32_t nsid = nslist->nscl_ns[i]; 3017 nvme_namespace_t *ns; 3018 3019 if (nsid == 0) /* end of list */ 3020 break; 3021 3022 dev_err(nvme->n_dip, CE_NOTE, 3023 "!namespace nvme%d/%u has changed.", 3024 ddi_get_instance(nvme->n_dip), nsid); 3025 3026 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 3027 continue; 3028 3029 ns = nvme_nsid2ns(nvme, nsid); 3030 if (ns->ns_state <= NVME_NS_STATE_NOT_IGNORED) 3031 continue; 3032 3033 nvme_mgmt_bd_start(nvme); 3034 bd_state_change(ns->ns_bd_hdl); 3035 nvme_mgmt_bd_end(nvme); 3036 } 3037 nvme_mgmt_unlock(nvme); 3038 3039 break; 3040 3041 case NVME_ASYNC_NOTICE_FW_ACTIVATE: 3042 dev_err(nvme->n_dip, CE_NOTE, 3043 "firmware activation starting, " 3044 "logpage = 0x%x", event.b.ae_logpage); 3045 NVME_BUMP_STAT(nvme, notice_event); 3046 break; 3047 3048 case NVME_ASYNC_NOTICE_TELEMETRY: 3049 dev_err(nvme->n_dip, CE_NOTE, 3050 "telemetry log changed, " 3051 "logpage = 0x%x", event.b.ae_logpage); 3052 NVME_BUMP_STAT(nvme, notice_event); 3053 break; 3054 3055 case NVME_ASYNC_NOTICE_NS_ASYMM: 3056 dev_err(nvme->n_dip, CE_NOTE, 3057 "asymmetric namespace access change, " 3058 "logpage = 0x%x", event.b.ae_logpage); 3059 NVME_BUMP_STAT(nvme, notice_event); 3060 break; 3061 3062 case NVME_ASYNC_NOTICE_LATENCYLOG: 3063 dev_err(nvme->n_dip, CE_NOTE, 3064 "predictable latency event aggregate log change, " 3065 "logpage = 0x%x", event.b.ae_logpage); 3066 NVME_BUMP_STAT(nvme, notice_event); 3067 break; 3068 3069 case NVME_ASYNC_NOTICE_LBASTATUS: 3070 dev_err(nvme->n_dip, CE_NOTE, 3071 "LBA status information alert, " 3072 "logpage = 0x%x", event.b.ae_logpage); 3073 NVME_BUMP_STAT(nvme, notice_event); 3074 break; 3075 3076 case NVME_ASYNC_NOTICE_ENDURANCELOG: 3077 dev_err(nvme->n_dip, CE_NOTE, 3078 "endurance group event aggregate log page change, " 3079 "logpage = 0x%x", event.b.ae_logpage); 3080 NVME_BUMP_STAT(nvme, notice_event); 3081 break; 3082 3083 default: 3084 dev_err(nvme->n_dip, CE_WARN, 3085 "!unknown notice async event received, " 3086 "info = 0x%x, logpage = 0x%x", event.b.ae_info, 3087 event.b.ae_logpage); 3088 NVME_BUMP_STAT(nvme, unknown_event); 3089 break; 3090 } 3091 break; 3092 3093 case NVME_ASYNC_TYPE_VENDOR: 3094 dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " 3095 "received, info = 0x%x, logpage = 0x%x", event.b.ae_info, 3096 event.b.ae_logpage); 3097 NVME_BUMP_STAT(nvme, vendor_event); 3098 break; 3099 3100 default: 3101 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " 3102 "type = 0x%x, info = 0x%x, logpage = 0x%x", event.b.ae_type, 3103 event.b.ae_info, event.b.ae_logpage); 3104 NVME_BUMP_STAT(nvme, unknown_event); 3105 break; 3106 } 3107 3108 if (error_log != NULL) 3109 kmem_free(error_log, logsize); 3110 3111 if (health_log != NULL) 3112 kmem_free(health_log, logsize); 3113 3114 if (nslist != NULL) 3115 kmem_free(nslist, logsize); 3116 } 3117 3118 static void 3119 nvme_admin_cmd(nvme_cmd_t *cmd, uint32_t sec) 3120 { 3121 uint32_t qtimeout; 3122 3123 ASSERT(cmd->nc_flags & NVME_CMD_F_USELOCK); 3124 3125 mutex_enter(&cmd->nc_mutex); 3126 cmd->nc_timeout = sec; 3127 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd, &qtimeout); 3128 /* 3129 * We will wait for a total of this command's specified timeout 
plus 3130 * the sum of the timeouts of any commands queued ahead of this one. If 3131 * we aren't first in the queue, this will inflate the timeout somewhat 3132 * but these times are not critical and it means that if we get stuck 3133 * behind a long-running command such as a namespace format then we 3134 * won't time out and trigger an abort. 3135 */ 3136 nvme_wait_cmd(cmd, sec + qtimeout); 3137 mutex_exit(&cmd->nc_mutex); 3138 }
3139 3140 static void 3141 nvme_async_event(nvme_t *nvme) 3142 { 3143 nvme_cmd_t *cmd; 3144 3145 cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3146 cmd->nc_sqid = 0; 3147 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; 3148 cmd->nc_callback = nvme_async_event_task; 3149 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 3150 3151 nvme_submit_admin_cmd(nvme->n_adminq, cmd, NULL); 3152 } 3153
3154 /* 3155 * Commands such as format or vendor-unique commands can manipulate or destroy 3156 * the data in a namespace. Before issuing such a command we make sure that none 3157 * of the namespaces that would be impacted are attached to blkdev. 3158 */ 3159 static boolean_t 3160 nvme_no_blkdev_attached(nvme_t *nvme, uint32_t nsid) 3161 { 3162 ASSERT(nvme_mgmt_lock_held(nvme)); 3163 ASSERT3U(nsid, !=, 0); 3164 3165 if (nsid != NVME_NSID_BCAST) { 3166 nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid); 3167 return (ns->ns_state < NVME_NS_STATE_ATTACHED); 3168 } 3169 3170 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) { 3171 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i); 3172 3173 if (ns->ns_state >= NVME_NS_STATE_ATTACHED) { 3174 return (B_FALSE); 3175 } 3176 } 3177 3178 return (B_TRUE); 3179 }
3180 3181 static boolean_t 3182 nvme_format_nvm(nvme_t *nvme, nvme_ioctl_format_t *ioc) 3183 { 3184 nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3185 nvme_format_nvm_t format_nvm = { 0 }; 3186 boolean_t ret; 3187 3188 format_nvm.b.fm_lbaf = bitx32(ioc->nif_lbaf, 3, 0); 3189 format_nvm.b.fm_ses = bitx32(ioc->nif_ses, 2, 0); 3190 3191 cmd->nc_sqid = 0; 3192 cmd->nc_callback = nvme_wakeup_cmd; 3193 cmd->nc_sqe.sqe_nsid = ioc->nif_common.nioc_nsid; 3194 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; 3195 cmd->nc_sqe.sqe_cdw10 = format_nvm.r; 3196
3197 /* 3198 * We don't want to panic on any format commands. There are two reasons 3199 * for this: 3200 * 3201 * 1) All format commands are initiated by users. We don't want to panic 3202 * on user commands. 3203 * 3204 * 2) Several devices like the Samsung SM951 don't allow formatting of 3205 * all namespaces in one command and we'd prefer to handle that 3206 * gracefully. 3207 */ 3208 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 3209 3210 nvme_admin_cmd(cmd, nvme_format_cmd_timeout); 3211
3212 if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nif_common)) { 3213 dev_err(nvme->n_dip, CE_WARN, 3214 "!FORMAT failed with sct = %x, sc = %x", 3215 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 3216 ret = B_FALSE; 3217 goto fail; 3218 } 3219 3220 ret = B_TRUE; 3221 fail: 3222 nvme_free_cmd(cmd); 3223 return (ret); 3224 }
3225 3226 /* 3227 * Retrieve a specific log page. The contents of the log page request should 3228 * have already been validated by the system.
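 *
 * The transfer length is carried as a zero-based count of dwords split across
 * two fields. As a worked example (size chosen for illustration only): a 4 KiB
 * transfer is 1024 dwords, stored as 1023 (0x3ff), giving lp_lnumdl = 0x3ff
 * and lp_numdu = 0 below.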
3229 */ 3230 static boolean_t 3231 nvme_get_logpage(nvme_t *nvme, boolean_t user, nvme_ioctl_get_logpage_t *log, 3232 void **buf) 3233 { 3234 nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3235 nvme_getlogpage_dw10_t dw10; 3236 uint32_t offlo, offhi; 3237 nvme_getlogpage_dw11_t dw11; 3238 nvme_getlogpage_dw14_t dw14; 3239 uint32_t ndw; 3240 boolean_t ret = B_FALSE; 3241 3242 bzero(&dw10, sizeof (dw10)); 3243 bzero(&dw11, sizeof (dw11)); 3244 bzero(&dw14, sizeof (dw14)); 3245 3246 cmd->nc_sqid = 0; 3247 cmd->nc_callback = nvme_wakeup_cmd; 3248 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; 3249 cmd->nc_sqe.sqe_nsid = log->nigl_common.nioc_nsid; 3250 3251 if (user) 3252 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 3253 3254 /* 3255 * The size field is the number of double words, but is a zeros based 3256 * value. We need to store our actual value minus one. 3257 */ 3258 ndw = (uint32_t)(log->nigl_len / 4); 3259 ASSERT3U(ndw, >, 0); 3260 ndw--; 3261 3262 dw10.b.lp_lid = bitx32(log->nigl_lid, 7, 0); 3263 dw10.b.lp_lsp = bitx32(log->nigl_lsp, 6, 0); 3264 dw10.b.lp_rae = bitx32(log->nigl_lsp, 0, 0); 3265 dw10.b.lp_lnumdl = bitx32(ndw, 15, 0); 3266 3267 dw11.b.lp_numdu = bitx32(ndw, 31, 16); 3268 dw11.b.lp_lsi = bitx32(log->nigl_lsi, 15, 0); 3269 3270 offlo = bitx64(log->nigl_offset, 31, 0); 3271 offhi = bitx64(log->nigl_offset, 63, 32); 3272 3273 dw14.b.lp_csi = bitx32(log->nigl_csi, 7, 0); 3274 3275 cmd->nc_sqe.sqe_cdw10 = dw10.r; 3276 cmd->nc_sqe.sqe_cdw11 = dw11.r; 3277 cmd->nc_sqe.sqe_cdw12 = offlo; 3278 cmd->nc_sqe.sqe_cdw13 = offhi; 3279 cmd->nc_sqe.sqe_cdw14 = dw14.r; 3280 3281 if (nvme_zalloc_dma(nvme, log->nigl_len, DDI_DMA_READ, 3282 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 3283 dev_err(nvme->n_dip, CE_WARN, 3284 "!nvme_zalloc_dma failed for GET LOG PAGE"); 3285 ret = nvme_ioctl_error(&log->nigl_common, 3286 NVME_IOCTL_E_NO_DMA_MEM, 0, 0); 3287 goto fail; 3288 } 3289 3290 if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) { 3291 ret = nvme_ioctl_error(&log->nigl_common, 3292 NVME_IOCTL_E_NO_DMA_MEM, 0, 0); 3293 goto fail; 3294 } 3295 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 3296 3297 if (!nvme_check_cmd_status_ioctl(cmd, &log->nigl_common)) { 3298 if (!user) { 3299 dev_err(nvme->n_dip, CE_WARN, 3300 "!GET LOG PAGE failed with sct = %x, sc = %x", 3301 cmd->nc_cqe.cqe_sf.sf_sct, 3302 cmd->nc_cqe.cqe_sf.sf_sc); 3303 } 3304 ret = B_FALSE; 3305 goto fail; 3306 } 3307 3308 *buf = kmem_alloc(log->nigl_len, KM_SLEEP); 3309 bcopy(cmd->nc_dma->nd_memp, *buf, log->nigl_len); 3310 3311 ret = B_TRUE; 3312 fail: 3313 nvme_free_cmd(cmd); 3314 3315 return (ret); 3316 } 3317 3318 /* 3319 * This is an internal wrapper for when the kernel wants to get a log page. 3320 * Currently this assumes that the only thing that is required is the log page 3321 * ID. If more information is required, we'll be better served to just use the 3322 * general ioctl interface. 
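 *
 * A minimal sketch of in-kernel usage (mirroring nvme_async_event_task()
 * above): fetch the page, consume it, and free it using the returned size:
 *
 *	if (nvme_get_logpage_int(nvme, B_FALSE, (void **)&health_log,
 *	    &logsize, NVME_LOGPAGE_HEALTH)) {
 *		... inspect health_log ...
 *		kmem_free(health_log, logsize);
 *	}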
3323 */ 3324 static boolean_t 3325 nvme_get_logpage_int(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize, 3326 uint8_t lid) 3327 { 3328 const nvme_log_page_info_t *info = NULL; 3329 nvme_ioctl_get_logpage_t log; 3330 nvme_valid_ctrl_data_t data; 3331 boolean_t bret; 3332 bool var; 3333 3334 for (size_t i = 0; i < nvme_std_log_npages; i++) { 3335 if (nvme_std_log_pages[i].nlpi_lid == lid && 3336 nvme_std_log_pages[i].nlpi_csi == NVME_CSI_NVM) { 3337 info = &nvme_std_log_pages[i]; 3338 break; 3339 } 3340 } 3341 3342 if (info == NULL) { 3343 return (B_FALSE); 3344 } 3345 3346 data.vcd_vers = &nvme->n_version; 3347 data.vcd_id = nvme->n_idctl; 3348 bzero(&log, sizeof (log)); 3349 log.nigl_common.nioc_nsid = NVME_NSID_BCAST; 3350 log.nigl_csi = info->nlpi_csi; 3351 log.nigl_lid = info->nlpi_lid; 3352 log.nigl_len = nvme_log_page_info_size(info, &data, &var); 3353 3354 /* 3355 * We only support getting standard fixed-length log pages through the 3356 * kernel interface at this time. If a log page either has an unknown 3357 * size or has a variable length, then we cannot get it. 3358 */ 3359 if (log.nigl_len == 0 || var) { 3360 return (B_FALSE); 3361 } 3362 3363 bret = nvme_get_logpage(nvme, user, &log, buf); 3364 if (!bret) { 3365 return (B_FALSE); 3366 } 3367 3368 *bufsize = log.nigl_len; 3369 return (B_TRUE); 3370 } 3371 3372 static boolean_t 3373 nvme_identify(nvme_t *nvme, boolean_t user, nvme_ioctl_identify_t *ioc, 3374 void **buf) 3375 { 3376 nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3377 boolean_t ret = B_FALSE; 3378 nvme_identify_dw10_t dw10; 3379 3380 ASSERT3P(buf, !=, NULL); 3381 3382 bzero(&dw10, sizeof (dw10)); 3383 3384 cmd->nc_sqid = 0; 3385 cmd->nc_callback = nvme_wakeup_cmd; 3386 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; 3387 cmd->nc_sqe.sqe_nsid = ioc->nid_common.nioc_nsid; 3388 3389 dw10.b.id_cns = bitx32(ioc->nid_cns, 7, 0); 3390 dw10.b.id_cntid = bitx32(ioc->nid_ctrlid, 15, 0); 3391 3392 cmd->nc_sqe.sqe_cdw10 = dw10.r; 3393 3394 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, 3395 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 3396 dev_err(nvme->n_dip, CE_WARN, 3397 "!nvme_zalloc_dma failed for IDENTIFY"); 3398 ret = nvme_ioctl_error(&ioc->nid_common, 3399 NVME_IOCTL_E_NO_DMA_MEM, 0, 0); 3400 goto fail; 3401 } 3402 3403 if (cmd->nc_dma->nd_ncookie > 2) { 3404 dev_err(nvme->n_dip, CE_WARN, 3405 "!too many DMA cookies for IDENTIFY"); 3406 NVME_BUMP_STAT(nvme, too_many_cookies); 3407 ret = nvme_ioctl_error(&ioc->nid_common, 3408 NVME_IOCTL_E_BAD_PRP, 0, 0); 3409 goto fail; 3410 } 3411 3412 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; 3413 if (cmd->nc_dma->nd_ncookie > 1) { 3414 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 3415 &cmd->nc_dma->nd_cookie); 3416 cmd->nc_sqe.sqe_dptr.d_prp[1] = 3417 cmd->nc_dma->nd_cookie.dmac_laddress; 3418 } 3419 3420 if (user) 3421 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 3422 3423 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 3424 3425 if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nid_common)) { 3426 dev_err(nvme->n_dip, CE_WARN, 3427 "!IDENTIFY failed with sct = %x, sc = %x", 3428 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 3429 ret = B_FALSE; 3430 goto fail; 3431 } 3432 3433 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); 3434 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE); 3435 ret = B_TRUE; 3436 3437 fail: 3438 nvme_free_cmd(cmd); 3439 3440 return (ret); 3441 } 3442 3443 static boolean_t 3444 nvme_identify_int(nvme_t *nvme, uint32_t nsid, uint8_t cns, void 
**buf) 3445 { 3446 nvme_ioctl_identify_t id; 3447 3448 bzero(&id, sizeof (nvme_ioctl_identify_t)); 3449 id.nid_common.nioc_nsid = nsid; 3450 id.nid_cns = cns; 3451 3452 return (nvme_identify(nvme, B_FALSE, &id, buf)); 3453 } 3454 3455 static int 3456 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature, 3457 uint32_t val, uint32_t *res) 3458 { 3459 _NOTE(ARGUNUSED(nsid)); 3460 nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3461 int ret = EINVAL; 3462 3463 ASSERT(res != NULL); 3464 3465 cmd->nc_sqid = 0; 3466 cmd->nc_callback = nvme_wakeup_cmd; 3467 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; 3468 cmd->nc_sqe.sqe_cdw10 = feature; 3469 cmd->nc_sqe.sqe_cdw11 = val; 3470 3471 if (user) 3472 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 3473 3474 switch (feature) { 3475 case NVME_FEAT_WRITE_CACHE: 3476 if (!nvme->n_write_cache_present) 3477 goto fail; 3478 break; 3479 3480 case NVME_FEAT_NQUEUES: 3481 break; 3482 3483 default: 3484 goto fail; 3485 } 3486 3487 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 3488 3489 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 3490 dev_err(nvme->n_dip, CE_WARN, 3491 "!SET FEATURES %d failed with sct = %x, sc = %x", 3492 feature, cmd->nc_cqe.cqe_sf.sf_sct, 3493 cmd->nc_cqe.cqe_sf.sf_sc); 3494 goto fail; 3495 } 3496 3497 *res = cmd->nc_cqe.cqe_dw0; 3498 3499 fail: 3500 nvme_free_cmd(cmd); 3501 return (ret); 3502 } 3503 3504 static int 3505 nvme_write_cache_set(nvme_t *nvme, boolean_t enable) 3506 { 3507 nvme_write_cache_t nwc = { 0 }; 3508 3509 if (enable) 3510 nwc.b.wc_wce = 1; 3511 3512 /* 3513 * We've seen some cases where this fails due to us being told we've 3514 * specified an invalid namespace when operating against the Xen xcp-ng 3515 * qemu NVMe virtual device. As such, we generally ensure that trying to 3516 * enable this doesn't lead us to panic. It's not completely clear why 3517 * specifying namespace zero here fails, but not when we're setting the 3518 * number of queues below. 3519 */ 3520 return (nvme_set_features(nvme, B_TRUE, 0, NVME_FEAT_WRITE_CACHE, 3521 nwc.r, &nwc.r)); 3522 } 3523 3524 static int 3525 nvme_set_nqueues(nvme_t *nvme) 3526 { 3527 nvme_nqueues_t nq = { 0 }; 3528 int ret; 3529 3530 /* 3531 * The default is to allocate one completion queue per vector. 3532 */ 3533 if (nvme->n_completion_queues == -1) 3534 nvme->n_completion_queues = nvme->n_intr_cnt; 3535 3536 /* 3537 * There is no point in having more completion queues than 3538 * interrupt vectors. 3539 */ 3540 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 3541 nvme->n_intr_cnt); 3542 3543 /* 3544 * The default is to use one submission queue per completion queue. 3545 */ 3546 if (nvme->n_submission_queues == -1) 3547 nvme->n_submission_queues = nvme->n_completion_queues; 3548 3549 /* 3550 * There is no point in having more completion queues than 3551 * submission queues. 3552 */ 3553 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 3554 nvme->n_submission_queues); 3555 3556 ASSERT(nvme->n_submission_queues > 0); 3557 ASSERT(nvme->n_completion_queues > 0); 3558 3559 nq.b.nq_nsq = nvme->n_submission_queues - 1; 3560 nq.b.nq_ncq = nvme->n_completion_queues - 1; 3561 3562 ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r, 3563 &nq.r); 3564 3565 if (ret == 0) { 3566 /* 3567 * Never use more than the requested number of queues. 
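 * The feature uses zero-based counts in both directions. For example
 * (numbers purely illustrative): if we asked for 8 queues of each kind
 * (nq_nsq = nq_ncq = 7) and the controller only grants 4 of each, it
 * reports 3 in each field and the MIN() calls below clamp our counts to 4.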
3568 */ 3569 nvme->n_submission_queues = MIN(nvme->n_submission_queues, 3570 nq.b.nq_nsq + 1); 3571 nvme->n_completion_queues = MIN(nvme->n_completion_queues, 3572 nq.b.nq_ncq + 1); 3573 } 3574 3575 return (ret); 3576 } 3577 3578 static int 3579 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq) 3580 { 3581 nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3582 nvme_create_queue_dw10_t dw10 = { 0 }; 3583 nvme_create_cq_dw11_t c_dw11 = { 0 }; 3584 int ret; 3585 3586 dw10.b.q_qid = cq->ncq_id; 3587 dw10.b.q_qsize = cq->ncq_nentry - 1; 3588 3589 c_dw11.b.cq_pc = 1; 3590 c_dw11.b.cq_ien = 1; 3591 c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt; 3592 3593 cmd->nc_sqid = 0; 3594 cmd->nc_callback = nvme_wakeup_cmd; 3595 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 3596 cmd->nc_sqe.sqe_cdw10 = dw10.r; 3597 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 3598 cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress; 3599 3600 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 3601 3602 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 3603 dev_err(nvme->n_dip, CE_WARN, 3604 "!CREATE CQUEUE failed with sct = %x, sc = %x", 3605 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 3606 } 3607 3608 nvme_free_cmd(cmd); 3609 3610 return (ret); 3611 } 3612 3613 static int 3614 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 3615 { 3616 nvme_cq_t *cq = qp->nq_cq; 3617 nvme_cmd_t *cmd; 3618 nvme_create_queue_dw10_t dw10 = { 0 }; 3619 nvme_create_sq_dw11_t s_dw11 = { 0 }; 3620 int ret; 3621 3622 /* 3623 * It is possible to have more qpairs than completion queues, 3624 * and when the idx > ncq_id, that completion queue is shared 3625 * and has already been created. 3626 */ 3627 if (idx <= cq->ncq_id && 3628 nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS) 3629 return (DDI_FAILURE); 3630 3631 dw10.b.q_qid = idx; 3632 dw10.b.q_qsize = qp->nq_nentry - 1; 3633 3634 s_dw11.b.sq_pc = 1; 3635 s_dw11.b.sq_cqid = cq->ncq_id; 3636 3637 cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 3638 cmd->nc_sqid = 0; 3639 cmd->nc_callback = nvme_wakeup_cmd; 3640 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 3641 cmd->nc_sqe.sqe_cdw10 = dw10.r; 3642 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 3643 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 3644 3645 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout); 3646 3647 if ((ret = nvme_check_cmd_status(cmd)) != 0) { 3648 dev_err(nvme->n_dip, CE_WARN, 3649 "!CREATE SQUEUE failed with sct = %x, sc = %x", 3650 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 3651 } 3652 3653 nvme_free_cmd(cmd); 3654 3655 return (ret); 3656 } 3657 3658 static boolean_t 3659 nvme_reset(nvme_t *nvme, boolean_t quiesce) 3660 { 3661 nvme_reg_csts_t csts; 3662 int i; 3663 3664 /* 3665 * If the device is gone, do not try to interact with it. We define 3666 * that resetting such a device is impossible, and always fails. 3667 */ 3668 if (nvme_ctrl_is_gone(nvme)) { 3669 return (B_FALSE); 3670 } 3671 3672 nvme_put32(nvme, NVME_REG_CC, 0); 3673 3674 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3675 if (csts.b.csts_rdy == 1) { 3676 nvme_put32(nvme, NVME_REG_CC, 0); 3677 3678 /* 3679 * The timeout value is from the Controller Capabilities 3680 * register (CAP.TO, section 3.1.1). This is the worst case 3681 * time to wait for CSTS.RDY to transition from 1 to 0 after 3682 * CC.EN transitions from 1 to 0. 3683 * 3684 * The timeout units are in 500 ms units, and we are delaying 3685 * in 50ms chunks, hence counting to n_timeout * 10. 
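 * For example, a CAP.TO value of 30 corresponds to a worst case of
 * 15 seconds, i.e. 300 passes through the 50ms loop below.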
3686 */ 3687 for (i = 0; i < nvme->n_timeout * 10; i++) { 3688 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3689 if (csts.b.csts_rdy == 0) 3690 break; 3691 3692 /* 3693 * Quiescing drivers should not use locks or timeouts, 3694 * so if this is the quiesce path, use a quiesce-safe 3695 * delay. 3696 */ 3697 if (quiesce) { 3698 drv_usecwait(50000); 3699 } else { 3700 delay(drv_usectohz(50000)); 3701 } 3702 } 3703 } 3704 3705 nvme_put32(nvme, NVME_REG_AQA, 0); 3706 nvme_put32(nvme, NVME_REG_ASQ, 0); 3707 nvme_put32(nvme, NVME_REG_ACQ, 0); 3708 3709 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3710 return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE); 3711 } 3712 3713 static void 3714 nvme_shutdown(nvme_t *nvme, boolean_t quiesce) 3715 { 3716 nvme_reg_cc_t cc; 3717 nvme_reg_csts_t csts; 3718 int i; 3719 3720 /* 3721 * Do not try to interact with the device if it is gone. Since it is 3722 * not there, in some sense it must already be shut down anyway. 3723 */ 3724 if (nvme_ctrl_is_gone(nvme)) { 3725 return; 3726 } 3727 3728 cc.r = nvme_get32(nvme, NVME_REG_CC); 3729 cc.b.cc_shn = NVME_CC_SHN_NORMAL; 3730 nvme_put32(nvme, NVME_REG_CC, cc.r); 3731 3732 for (i = 0; i < 10; i++) { 3733 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 3734 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 3735 break; 3736 3737 if (quiesce) { 3738 drv_usecwait(100000); 3739 } else { 3740 delay(drv_usectohz(100000)); 3741 } 3742 } 3743 } 3744 3745 /* 3746 * Return length of string without trailing spaces. 3747 */ 3748 static size_t 3749 nvme_strlen(const char *str, size_t len) 3750 { 3751 if (len <= 0) 3752 return (0); 3753 3754 while (str[--len] == ' ') 3755 ; 3756 3757 return (++len); 3758 } 3759 3760 static void 3761 nvme_config_min_block_size(nvme_t *nvme, char *model, char *val) 3762 { 3763 ulong_t bsize = 0; 3764 char *msg = ""; 3765 3766 if (ddi_strtoul(val, NULL, 0, &bsize) != 0) 3767 goto err; 3768 3769 if (!ISP2(bsize)) { 3770 msg = ": not a power of 2"; 3771 goto err; 3772 } 3773 3774 if (bsize < NVME_DEFAULT_MIN_BLOCK_SIZE) { 3775 msg = ": too low"; 3776 goto err; 3777 } 3778 3779 nvme->n_min_block_size = bsize; 3780 return; 3781 3782 err: 3783 dev_err(nvme->n_dip, CE_WARN, 3784 "!nvme-config-list: ignoring invalid min-phys-block-size '%s' " 3785 "for model '%s'%s", val, model, msg); 3786 3787 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 3788 } 3789 3790 static void 3791 nvme_config_boolean(nvme_t *nvme, char *model, char *name, char *val, 3792 boolean_t *b) 3793 { 3794 if (strcmp(val, "on") == 0 || 3795 strcmp(val, "true") == 0) 3796 *b = B_TRUE; 3797 else if (strcmp(val, "off") == 0 || 3798 strcmp(val, "false") == 0) 3799 *b = B_FALSE; 3800 else 3801 dev_err(nvme->n_dip, CE_WARN, 3802 "!nvme-config-list: invalid value for %s '%s'" 3803 " for model '%s', ignoring", name, val, model); 3804 } 3805 3806 static void 3807 nvme_config_list(nvme_t *nvme) 3808 { 3809 char **config_list; 3810 uint_t nelem; 3811 int rv; 3812 3813 /* 3814 * We're following the pattern of 'sd-config-list' here, but extend it. 3815 * Instead of two we have three separate strings for "model", "fwrev", 3816 * and "name-value-list". 
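 *
 * As a purely illustrative sketch (the model and firmware strings below are
 * made up), an nvme.conf entry would look something like:
 *
 *	nvme-config-list =
 *	    "SAMPLE MODEL",	"FW1.0,FW1.1",
 *		"min-phys-block-size:4096,volatile-write-cache:off";
 *
 * An empty "fwrev" string matches any firmware revision.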
3817 */ 3818 rv = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, nvme->n_dip, 3819 DDI_PROP_DONTPASS, "nvme-config-list", &config_list, &nelem); 3820 3821 if (rv != DDI_PROP_SUCCESS) { 3822 if (rv == DDI_PROP_CANNOT_DECODE) { 3823 dev_err(nvme->n_dip, CE_WARN, 3824 "!nvme-config-list: cannot be decoded"); 3825 } 3826 3827 return; 3828 } 3829 3830 if ((nelem % 3) != 0) { 3831 dev_err(nvme->n_dip, CE_WARN, "!nvme-config-list: must be " 3832 "triplets of <model>/<fwrev>/<name-value-list> strings "); 3833 goto out; 3834 } 3835 3836 for (uint_t i = 0; i < nelem; i += 3) { 3837 char *model = config_list[i]; 3838 char *fwrev = config_list[i + 1]; 3839 char *nvp, *save_nv; 3840 size_t id_model_len, id_fwrev_len; 3841 3842 id_model_len = nvme_strlen(nvme->n_idctl->id_model, 3843 sizeof (nvme->n_idctl->id_model)); 3844 3845 if (strlen(model) != id_model_len) 3846 continue; 3847 3848 if (strncmp(model, nvme->n_idctl->id_model, id_model_len) != 0) 3849 continue; 3850 3851 id_fwrev_len = nvme_strlen(nvme->n_idctl->id_fwrev, 3852 sizeof (nvme->n_idctl->id_fwrev)); 3853 3854 if (strlen(fwrev) != 0) { 3855 boolean_t match = B_FALSE; 3856 char *fwr, *last_fw; 3857 3858 for (fwr = strtok_r(fwrev, ",", &last_fw); 3859 fwr != NULL; 3860 fwr = strtok_r(NULL, ",", &last_fw)) { 3861 if (strlen(fwr) != id_fwrev_len) 3862 continue; 3863 3864 if (strncmp(fwr, nvme->n_idctl->id_fwrev, 3865 id_fwrev_len) == 0) 3866 match = B_TRUE; 3867 } 3868 3869 if (!match) 3870 continue; 3871 } 3872 3873 /* 3874 * We should now have a comma-separated list of name:value 3875 * pairs. 3876 */ 3877 for (nvp = strtok_r(config_list[i + 2], ",", &save_nv); 3878 nvp != NULL; nvp = strtok_r(NULL, ",", &save_nv)) { 3879 char *name = nvp; 3880 char *val = strchr(nvp, ':'); 3881 3882 if (val == NULL || name == val) { 3883 dev_err(nvme->n_dip, CE_WARN, 3884 "!nvme-config-list: <name-value-list> " 3885 "for model '%s' is malformed", model); 3886 goto out; 3887 } 3888 3889 /* 3890 * Null-terminate 'name', move 'val' past ':' sep. 3891 */ 3892 *val++ = '\0'; 3893 3894 /* 3895 * Process the name:val pairs that we know about. 3896 */ 3897 if (strcmp(name, "ignore-unknown-vendor-status") == 0) { 3898 nvme_config_boolean(nvme, model, name, val, 3899 &nvme->n_ignore_unknown_vendor_status); 3900 } else if (strcmp(name, "min-phys-block-size") == 0) { 3901 nvme_config_min_block_size(nvme, model, val); 3902 } else if (strcmp(name, "volatile-write-cache") == 0) { 3903 nvme_config_boolean(nvme, model, name, val, 3904 &nvme->n_write_cache_enabled); 3905 } else { 3906 /* 3907 * Unknown 'name'. 3908 */ 3909 dev_err(nvme->n_dip, CE_WARN, 3910 "!nvme-config-list: unknown config '%s' " 3911 "for model '%s', ignoring", name, model); 3912 } 3913 } 3914 } 3915 3916 out: 3917 ddi_prop_free(config_list); 3918 } 3919 3920 static void 3921 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 3922 { 3923 /* 3924 * Section 7.7 of the spec describes how to get a unique ID for 3925 * the controller: the vendor ID, the model name and the serial 3926 * number shall be unique when combined. 3927 * 3928 * If a namespace has no EUI64 we use the above and add the hex 3929 * namespace ID to get a unique ID for the namespace. 
3930 */ 3931 char model[sizeof (nvme->n_idctl->id_model) + 1]; 3932 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 3933 3934 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 3935 bcopy(nvme->n_idctl->id_serial, serial, 3936 sizeof (nvme->n_idctl->id_serial)); 3937 3938 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 3939 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 3940 3941 nvme_nsid2ns(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X", 3942 nvme->n_idctl->id_vid, model, serial, nsid); 3943 } 3944 3945 static nvme_identify_nsid_list_t * 3946 nvme_update_nsid_list(nvme_t *nvme, int cns) 3947 { 3948 nvme_identify_nsid_list_t *nslist; 3949 3950 /* 3951 * We currently don't handle cases where there are more than 3952 * 1024 active namespaces, requiring several IDENTIFY commands. 3953 */ 3954 if (nvme_identify_int(nvme, 0, cns, (void **)&nslist)) 3955 return (nslist); 3956 3957 return (NULL); 3958 } 3959 3960 nvme_namespace_t * 3961 nvme_nsid2ns(nvme_t *nvme, uint32_t nsid) 3962 { 3963 ASSERT3U(nsid, !=, 0); 3964 ASSERT3U(nsid, <=, nvme->n_namespace_count); 3965 return (&nvme->n_ns[nsid - 1]); 3966 } 3967 3968 static boolean_t 3969 nvme_allocated_ns(nvme_namespace_t *ns) 3970 { 3971 nvme_t *nvme = ns->ns_nvme; 3972 uint32_t i; 3973 3974 ASSERT(nvme_mgmt_lock_held(nvme)); 3975 3976 /* 3977 * If supported, update the list of allocated namespace IDs. 3978 */ 3979 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2) && 3980 nvme->n_idctl->id_oacs.oa_nsmgmt != 0) { 3981 nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme, 3982 NVME_IDENTIFY_NSID_ALLOC_LIST); 3983 boolean_t found = B_FALSE; 3984 3985 /* 3986 * When namespace management is supported, this really shouldn't 3987 * be NULL. Treat all namespaces as allocated if it is. 3988 */ 3989 if (nslist == NULL) 3990 return (B_TRUE); 3991 3992 for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) { 3993 if (ns->ns_id == 0) 3994 break; 3995 3996 if (ns->ns_id == nslist->nl_nsid[i]) 3997 found = B_TRUE; 3998 } 3999 4000 kmem_free(nslist, NVME_IDENTIFY_BUFSIZE); 4001 return (found); 4002 } else { 4003 /* 4004 * If namespace management isn't supported, report all 4005 * namespaces as allocated. 4006 */ 4007 return (B_TRUE); 4008 } 4009 } 4010 4011 static boolean_t 4012 nvme_active_ns(nvme_namespace_t *ns) 4013 { 4014 nvme_t *nvme = ns->ns_nvme; 4015 uint64_t *ptr; 4016 uint32_t i; 4017 4018 ASSERT(nvme_mgmt_lock_held(nvme)); 4019 4020 /* 4021 * If supported, update the list of active namespace IDs. 4022 */ 4023 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) { 4024 nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme, 4025 NVME_IDENTIFY_NSID_LIST); 4026 boolean_t found = B_FALSE; 4027 4028 /* 4029 * When namespace management is supported, this really shouldn't 4030 * be NULL. Treat all namespaces as allocated if it is. 4031 */ 4032 if (nslist == NULL) 4033 return (B_TRUE); 4034 4035 for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) { 4036 if (ns->ns_id == 0) 4037 break; 4038 4039 if (ns->ns_id == nslist->nl_nsid[i]) 4040 found = B_TRUE; 4041 } 4042 4043 kmem_free(nslist, NVME_IDENTIFY_BUFSIZE); 4044 return (found); 4045 } 4046 4047 /* 4048 * Workaround for revision 1.0: 4049 * Check whether the IDENTIFY NAMESPACE data is zero-filled. 
4050 */ 4051 for (ptr = (uint64_t *)ns->ns_idns; 4052 ptr != (uint64_t *)(ns->ns_idns + 1); 4053 ptr++) { 4054 if (*ptr != 0) { 4055 return (B_TRUE); 4056 } 4057 } 4058 4059 return (B_FALSE); 4060 } 4061 4062 static int 4063 nvme_init_ns(nvme_t *nvme, uint32_t nsid) 4064 { 4065 nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid); 4066 nvme_identify_nsid_t *idns; 4067 nvme_ns_state_t orig_state; 4068 4069 ns->ns_nvme = nvme; 4070 4071 ASSERT(nvme_mgmt_lock_held(nvme)); 4072 4073 /* 4074 * Because we might rescan a namespace and this will fail after boot 4075 * that'd leave us in a bad spot. We need to do something about this 4076 * longer term, but it's not clear how exactly we would recover right 4077 * now. 4078 */ 4079 if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID, 4080 (void **)&idns)) { 4081 dev_err(nvme->n_dip, CE_WARN, 4082 "!failed to identify namespace %d", nsid); 4083 return (DDI_FAILURE); 4084 } 4085 4086 if (ns->ns_idns != NULL) 4087 kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t)); 4088 4089 ns->ns_idns = idns; 4090 ns->ns_id = nsid; 4091 4092 /* 4093 * Save the current state so we can tell what changed. Look at the 4094 * current state of the device. We will flag active devices that should 4095 * be ignored after this. 4096 */ 4097 orig_state = ns->ns_state; 4098 if (nvme_active_ns(ns)) { 4099 /* 4100 * If the device previously had blkdev active, then that is its 4101 * current state. Otherwise, we consider this an upgrade and 4102 * just set it to not ignored. 4103 */ 4104 if (orig_state == NVME_NS_STATE_ATTACHED) { 4105 ns->ns_state = NVME_NS_STATE_ATTACHED; 4106 } else { 4107 ns->ns_state = NVME_NS_STATE_NOT_IGNORED; 4108 } 4109 } else if (nvme_allocated_ns(ns)) { 4110 ns->ns_state = NVME_NS_STATE_ALLOCATED; 4111 } else { 4112 ns->ns_state = NVME_NS_STATE_UNALLOCATED; 4113 } 4114 4115 ns->ns_block_count = idns->id_nsize; 4116 ns->ns_block_size = 4117 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 4118 ns->ns_best_block_size = ns->ns_block_size; 4119 4120 /* 4121 * Get the EUI64 if present. 4122 */ 4123 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 4124 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 4125 4126 /* 4127 * Get the NGUID if present. 4128 */ 4129 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) 4130 bcopy(idns->id_nguid, ns->ns_nguid, sizeof (ns->ns_nguid)); 4131 4132 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 4133 if (*(uint64_t *)ns->ns_eui64 == 0) 4134 nvme_prepare_devid(nvme, ns->ns_id); 4135 4136 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%u", ns->ns_id); 4137 4138 /* 4139 * Find the LBA format with no metadata and the best relative 4140 * performance. A value of 3 means "degraded", 0 is best. 
4141 */ 4142 for (uint32_t j = 0, last_rp = 3; j <= idns->id_nlbaf; j++) { 4143 if (idns->id_lbaf[j].lbaf_lbads == 0) 4144 break; 4145 if (idns->id_lbaf[j].lbaf_ms != 0) 4146 continue; 4147 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 4148 continue; 4149 last_rp = idns->id_lbaf[j].lbaf_rp; 4150 ns->ns_best_block_size = 4151 1 << idns->id_lbaf[j].lbaf_lbads; 4152 } 4153 4154 if (ns->ns_best_block_size < nvme->n_min_block_size) 4155 ns->ns_best_block_size = nvme->n_min_block_size; 4156 4157 /* 4158 * We currently don't support namespaces that are inactive, or use 4159 * either: 4160 * - protection information 4161 * - illegal block size (< 512) 4162 */ 4163 if (ns->ns_state >= NVME_NS_STATE_NOT_IGNORED) { 4164 if (idns->id_dps.dp_pinfo) { 4165 dev_err(nvme->n_dip, CE_WARN, 4166 "!ignoring namespace %d, unsupported feature: " 4167 "pinfo = %d", nsid, idns->id_dps.dp_pinfo); 4168 ns->ns_state = NVME_NS_STATE_ACTIVE; 4169 } 4170 4171 if (ns->ns_block_size < 512) { 4172 dev_err(nvme->n_dip, CE_WARN, 4173 "!ignoring namespace %d, unsupported block size " 4174 "%"PRIu64, nsid, (uint64_t)ns->ns_block_size); 4175 ns->ns_state = NVME_NS_STATE_ACTIVE; 4176 } 4177 } 4178 4179 /* 4180 * If we were previously in a state where blkdev was active and suddenly 4181 * we think it should not be because ignore is set, then something has 4182 * gone behind our backs and this is not going to be recoverable. 4183 */ 4184 if (orig_state == NVME_NS_STATE_ATTACHED && 4185 ns->ns_state != NVME_NS_STATE_ATTACHED) { 4186 dev_err(nvme->n_dip, CE_PANIC, "namespace %u state " 4187 "unexpectedly changed and removed blkdev support!", nsid); 4188 } 4189 4190 /* 4191 * Keep a count of namespaces which are attachable. 4192 * See comments in nvme_bd_driveinfo() to understand its effect. 4193 */ 4194 if (orig_state > NVME_NS_STATE_ACTIVE) { 4195 /* 4196 * Wasn't attachable previously, but now needs to be. 4197 * Discount it. 4198 */ 4199 if (ns->ns_state < NVME_NS_STATE_NOT_IGNORED) 4200 nvme->n_namespaces_attachable--; 4201 } else if (ns->ns_state >= NVME_NS_STATE_NOT_IGNORED) { 4202 /* 4203 * Previously ignored, but now not. Count it. 
4204 */ 4205 nvme->n_namespaces_attachable++; 4206 } 4207 4208 return (DDI_SUCCESS); 4209 } 4210 4211 static boolean_t 4212 nvme_bd_attach_ns(nvme_t *nvme, nvme_ioctl_common_t *com) 4213 { 4214 nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid); 4215 int ret; 4216 4217 ASSERT(nvme_mgmt_lock_held(nvme)); 4218 4219 if (!nvme_ns_state_check(ns, com, nvme_bd_attach_states)) { 4220 return (B_FALSE); 4221 } 4222 4223 if (ns->ns_bd_hdl == NULL) { 4224 bd_ops_t ops = nvme_bd_ops; 4225 4226 if (!nvme->n_idctl->id_oncs.on_dset_mgmt) 4227 ops.o_free_space = NULL; 4228 4229 ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr, 4230 KM_SLEEP); 4231 4232 if (ns->ns_bd_hdl == NULL) { 4233 dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev " 4234 "handle for namespace id %u", com->nioc_nsid); 4235 return (nvme_ioctl_error(com, 4236 NVME_IOCTL_E_BLKDEV_ATTACH, 0, 0)); 4237 } 4238 } 4239 4240 nvme_mgmt_bd_start(nvme); 4241 ret = bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl); 4242 nvme_mgmt_bd_end(nvme); 4243 if (ret != DDI_SUCCESS) { 4244 return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_ATTACH, 4245 0, 0)); 4246 } 4247 4248 ns->ns_state = NVME_NS_STATE_ATTACHED; 4249 4250 return (B_TRUE); 4251 } 4252 4253 static boolean_t 4254 nvme_bd_detach_ns(nvme_t *nvme, nvme_ioctl_common_t *com) 4255 { 4256 nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid); 4257 int ret; 4258 4259 ASSERT(nvme_mgmt_lock_held(nvme)); 4260 4261 if (!nvme_ns_state_check(ns, com, nvme_bd_detach_states)) { 4262 return (B_FALSE); 4263 } 4264 4265 nvme_mgmt_bd_start(nvme); 4266 ASSERT3P(ns->ns_bd_hdl, !=, NULL); 4267 ret = bd_detach_handle(ns->ns_bd_hdl); 4268 nvme_mgmt_bd_end(nvme); 4269 4270 if (ret != DDI_SUCCESS) { 4271 return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_DETACH, 0, 4272 0)); 4273 } 4274 4275 ns->ns_state = NVME_NS_STATE_NOT_IGNORED; 4276 return (B_TRUE); 4277 4278 } 4279 4280 /* 4281 * Rescan the namespace information associated with the namespaces indicated by 4282 * ioc. They should not be attached to blkdev right now. 4283 */ 4284 static void 4285 nvme_rescan_ns(nvme_t *nvme, uint32_t nsid) 4286 { 4287 ASSERT(nvme_mgmt_lock_held(nvme)); 4288 ASSERT3U(nsid, !=, 0); 4289 4290 if (nsid != NVME_NSID_BCAST) { 4291 nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid); 4292 4293 ASSERT3U(ns->ns_state, <, NVME_NS_STATE_ATTACHED); 4294 (void) nvme_init_ns(nvme, nsid); 4295 return; 4296 } 4297 4298 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) { 4299 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i); 4300 4301 ASSERT3U(ns->ns_state, <, NVME_NS_STATE_ATTACHED); 4302 (void) nvme_init_ns(nvme, i); 4303 } 4304 } 4305 4306 typedef struct nvme_quirk_table { 4307 uint16_t nq_vendor_id; 4308 uint16_t nq_device_id; 4309 nvme_quirk_t nq_quirks; 4310 } nvme_quirk_table_t; 4311 4312 static const nvme_quirk_table_t nvme_quirks[] = { 4313 { 0x1987, 0x5018, NVME_QUIRK_START_CID }, /* Phison E18 */ 4314 }; 4315 4316 static void 4317 nvme_detect_quirks(nvme_t *nvme) 4318 { 4319 for (uint_t i = 0; i < ARRAY_SIZE(nvme_quirks); i++) { 4320 const nvme_quirk_table_t *nqt = &nvme_quirks[i]; 4321 4322 if (nqt->nq_vendor_id == nvme->n_vendor_id && 4323 nqt->nq_device_id == nvme->n_device_id) { 4324 nvme->n_quirks = nqt->nq_quirks; 4325 return; 4326 } 4327 } 4328 } 4329 4330 /* 4331 * Indicate to the controller that we support various behaviors. These are 4332 * things the controller needs to be proactively told. 
We only will do this if 4333 * the controller indicates support for something that we care about, otherwise 4334 * there is no need to talk to the controller and there is no separate way to 4335 * know that this feature is otherwise supported. Support for most features is 4336 * indicated by setting it to 1. 4337 * 4338 * The current behaviors we enable are: 4339 * 4340 * - Extended Telemetry Data Area 4: This enables additional telemetry to be 4341 * possibly generated and depends on the DA4S bit in the log page attributes. 4342 */ 4343 static void 4344 nvme_enable_host_behavior(nvme_t *nvme) 4345 { 4346 nvme_host_behavior_t *hb; 4347 nvme_ioc_cmd_args_t args = { NULL }; 4348 nvme_sqe_t sqe = { 4349 .sqe_opc = NVME_OPC_SET_FEATURES, 4350 .sqe_cdw10 = NVME_FEAT_HOST_BEHAVE, 4351 .sqe_nsid = 0 4352 }; 4353 nvme_ioctl_common_t err; 4354 4355 if (nvme->n_idctl->id_lpa.lp_da4s == 0) 4356 return; 4357 4358 hb = kmem_zalloc(sizeof (nvme_host_behavior_t), KM_SLEEP); 4359 hb->nhb_etdas = 1; 4360 4361 args.ica_sqe = &sqe; 4362 args.ica_data = hb; 4363 args.ica_data_len = sizeof (nvme_host_behavior_t); 4364 args.ica_dma_flags = DDI_DMA_WRITE; 4365 args.ica_copy_flags = FKIOCTL; 4366 args.ica_timeout = nvme_admin_cmd_timeout; 4367 4368 if (!nvme_ioc_cmd(nvme, &err, &args)) { 4369 dev_err(nvme->n_dip, CE_WARN, "failed to enable host behavior " 4370 "feature: 0x%x/0x%x/0x%x", err.nioc_drv_err, 4371 err.nioc_ctrl_sct, err.nioc_ctrl_sc); 4372 } 4373 4374 kmem_free(hb, sizeof (nvme_host_behavior_t)); 4375 } 4376 4377 static int 4378 nvme_init(nvme_t *nvme) 4379 { 4380 nvme_reg_cc_t cc = { 0 }; 4381 nvme_reg_aqa_t aqa = { 0 }; 4382 nvme_reg_asq_t asq = { 0 }; 4383 nvme_reg_acq_t acq = { 0 }; 4384 nvme_reg_cap_t cap; 4385 nvme_reg_vs_t vs; 4386 nvme_reg_csts_t csts; 4387 int i = 0; 4388 uint16_t nqueues; 4389 uint_t tq_threads; 4390 char model[sizeof (nvme->n_idctl->id_model) + 1]; 4391 char *vendor, *product; 4392 uint32_t nsid; 4393 4394 /* Check controller version */ 4395 vs.r = nvme_get32(nvme, NVME_REG_VS); 4396 nvme->n_version.v_major = vs.b.vs_mjr; 4397 nvme->n_version.v_minor = vs.b.vs_mnr; 4398 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d\n", 4399 nvme->n_version.v_major, nvme->n_version.v_minor); 4400 4401 if (nvme->n_version.v_major > nvme_version_major) { 4402 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x", 4403 nvme_version_major); 4404 if (nvme->n_strict_version) 4405 goto fail; 4406 } 4407 4408 /* retrieve controller configuration */ 4409 cap.r = nvme_get64(nvme, NVME_REG_CAP); 4410 4411 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 4412 dev_err(nvme->n_dip, CE_WARN, 4413 "!NVM command set not supported by hardware"); 4414 goto fail; 4415 } 4416 4417 nvme->n_nssr_supported = cap.b.cap_nssrs; 4418 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 4419 nvme->n_timeout = cap.b.cap_to; 4420 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 4421 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 4422 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 4423 4424 /* 4425 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 4426 * the base page size of 4k (1<<12), so add 12 here to get the real 4427 * page size value. 4428 */ 4429 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 4430 cap.b.cap_mpsmax + 12); 4431 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 4432 4433 /* 4434 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 
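 * With the usual 4k controller page size this sets both dma_attr_align and
 * dma_attr_minxfer of the queue DMA attributes to 4096 bytes.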
4435 */ 4436 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 4437 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 4438 4439 /* 4440 * Set up PRP DMA to transfer 1 page-aligned page at a time. 4441 * Maxxfer may be increased after we identified the controller limits. 4442 */ 4443 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 4444 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 4445 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 4446 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 4447 4448 /* 4449 * Reset controller if it's still in ready state. 4450 */ 4451 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 4452 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 4453 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 4454 nvme->n_dead = B_TRUE; 4455 goto fail; 4456 } 4457 4458 /* 4459 * Create the cq array with one completion queue to be assigned 4460 * to the admin queue pair and a limited number of taskqs (4). 4461 */ 4462 if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) != 4463 DDI_SUCCESS) { 4464 dev_err(nvme->n_dip, CE_WARN, 4465 "!failed to pre-allocate admin completion queue"); 4466 goto fail; 4467 } 4468 /* 4469 * Create the admin queue pair. 4470 */ 4471 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 4472 != DDI_SUCCESS) { 4473 dev_err(nvme->n_dip, CE_WARN, 4474 "!unable to allocate admin qpair"); 4475 goto fail; 4476 } 4477 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 4478 nvme->n_ioq[0] = nvme->n_adminq; 4479 4480 if (nvme->n_quirks & NVME_QUIRK_START_CID) 4481 nvme->n_adminq->nq_next_cmd++; 4482 4483 nvme->n_progress |= NVME_ADMIN_QUEUE; 4484 4485 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 4486 "admin-queue-len", nvme->n_admin_queue_len); 4487 4488 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 4489 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 4490 acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress; 4491 4492 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 4493 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 4494 4495 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 4496 nvme_put64(nvme, NVME_REG_ASQ, asq); 4497 nvme_put64(nvme, NVME_REG_ACQ, acq); 4498 4499 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 4500 cc.b.cc_css = 0; /* use NVM command set */ 4501 cc.b.cc_mps = nvme->n_pageshift - 12; 4502 cc.b.cc_shn = 0; /* no shutdown in progress */ 4503 cc.b.cc_en = 1; /* enable controller */ 4504 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 4505 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 4506 4507 nvme_put32(nvme, NVME_REG_CC, cc.r); 4508 4509 /* 4510 * Wait for the controller to become ready. 4511 */ 4512 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 4513 if (csts.b.csts_rdy == 0) { 4514 for (i = 0; i != nvme->n_timeout * 10; i++) { 4515 delay(drv_usectohz(50000)); 4516 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 4517 4518 if (csts.b.csts_cfs == 1) { 4519 dev_err(nvme->n_dip, CE_WARN, 4520 "!controller fatal status at init"); 4521 ddi_fm_service_impact(nvme->n_dip, 4522 DDI_SERVICE_LOST); 4523 nvme->n_dead = B_TRUE; 4524 goto fail; 4525 } 4526 4527 if (csts.b.csts_rdy == 1) 4528 break; 4529 } 4530 } 4531 4532 if (csts.b.csts_rdy == 0) { 4533 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 4534 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 4535 nvme->n_dead = B_TRUE; 4536 goto fail; 4537 } 4538 4539 /* 4540 * Assume an abort command limit of 1. 
We'll destroy and re-init 4541 * that later when we know the true abort command limit. 4542 */ 4543 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 4544 4545 /* 4546 * Set up initial interrupt for admin queue. 4547 */ 4548 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 4549 != DDI_SUCCESS) && 4550 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 4551 != DDI_SUCCESS) && 4552 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 4553 != DDI_SUCCESS)) { 4554 dev_err(nvme->n_dip, CE_WARN, 4555 "!failed to set up initial interrupt"); 4556 goto fail; 4557 } 4558 4559 /* 4560 * Initialize the failure status we should use if we mark the controller 4561 * dead. Do this ahead of issuing any commands. 4562 */ 4563 nvme->n_dead_status = NVME_IOCTL_E_CTRL_DEAD; 4564 4565 /* 4566 * Identify Controller 4567 */ 4568 if (!nvme_identify_int(nvme, 0, NVME_IDENTIFY_CTRL, 4569 (void **)&nvme->n_idctl)) { 4570 dev_err(nvme->n_dip, CE_WARN, "!failed to identify controller"); 4571 goto fail; 4572 } 4573 4574 /* 4575 * Process nvme-config-list (if present) in nvme.conf. 4576 */ 4577 nvme_config_list(nvme); 4578 4579 /* 4580 * Get Vendor & Product ID 4581 */ 4582 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 4583 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 4584 sata_split_model(model, &vendor, &product); 4585 4586 if (vendor == NULL) 4587 nvme->n_vendor = strdup("NVMe"); 4588 else 4589 nvme->n_vendor = strdup(vendor); 4590 4591 nvme->n_product = strdup(product); 4592 4593 /* 4594 * Get controller limits. 4595 */ 4596 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 4597 MIN(nvme->n_admin_queue_len / 10, 4598 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 4599 4600 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 4601 "async-event-limit", nvme->n_async_event_limit); 4602 4603 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 4604 4605 /* 4606 * Reinitialize the semaphore with the true abort command limit 4607 * supported by the hardware. It's not necessary to disable interrupts 4608 * as only command aborts use the semaphore, and no commands are 4609 * executed or aborted while we're here. 4610 */ 4611 sema_destroy(&nvme->n_abort_sema); 4612 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 4613 SEMA_DRIVER, NULL); 4614 4615 nvme->n_progress |= NVME_CTRL_LIMITS; 4616 4617 if (nvme->n_idctl->id_mdts == 0) 4618 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 4619 else 4620 nvme->n_max_data_transfer_size = 4621 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 4622 4623 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 4624 4625 /* 4626 * Limit n_max_data_transfer_size to what we can handle in one PRP. 4627 * Chained PRPs are currently unsupported. 4628 * 4629 * This is a no-op on hardware which doesn't support a transfer size 4630 * big enough to require chained PRPs. 4631 */ 4632 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 4633 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 4634 4635 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 4636 4637 /* 4638 * Make sure the minimum/maximum queue entry sizes are not 4639 * larger/smaller than the default. 
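 * For example, a submission queue entry is 64 bytes (2^6), so a controller
 * reporting a minimum SQES above 6 or a maximum SQES below 6 cannot be
 * driven and nvme_init() fails.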
4640 */ 4641 4642 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 4643 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 4644 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 4645 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 4646 goto fail; 4647 4648 /* 4649 * Check for the presence of a Volatile Write Cache. If present, 4650 * enable or disable based on the value of the property 4651 * volatile-write-cache-enable (default is enabled). 4652 */ 4653 nvme->n_write_cache_present = 4654 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 4655 4656 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 4657 "volatile-write-cache-present", 4658 nvme->n_write_cache_present ? 1 : 0); 4659 4660 if (!nvme->n_write_cache_present) { 4661 nvme->n_write_cache_enabled = B_FALSE; 4662 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled) 4663 != 0) { 4664 dev_err(nvme->n_dip, CE_WARN, 4665 "!failed to %sable volatile write cache", 4666 nvme->n_write_cache_enabled ? "en" : "dis"); 4667 /* 4668 * Assume the cache is (still) enabled. 4669 */ 4670 nvme->n_write_cache_enabled = B_TRUE; 4671 } 4672 4673 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 4674 "volatile-write-cache-enable", 4675 nvme->n_write_cache_enabled ? 1 : 0); 4676 4677 /* 4678 * Get number of supported namespaces and allocate namespace array. 4679 */ 4680 nvme->n_namespace_count = nvme->n_idctl->id_nn; 4681 4682 if (nvme->n_namespace_count == 0) { 4683 dev_err(nvme->n_dip, CE_WARN, 4684 "!controllers without namespaces are not supported"); 4685 goto fail; 4686 } 4687 4688 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 4689 nvme->n_namespace_count, KM_SLEEP); 4690 4691 /* 4692 * Get the common namespace information if available. If not, we use the 4693 * information for nsid 1. 4694 */ 4695 if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2) && 4696 nvme->n_idctl->id_oacs.oa_nsmgmt != 0) { 4697 nsid = NVME_NSID_BCAST; 4698 } else { 4699 nsid = 1; 4700 } 4701 4702 if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID, 4703 (void **)&nvme->n_idcomns)) { 4704 dev_err(nvme->n_dip, CE_WARN, "!failed to identify common " 4705 "namespace information"); 4706 goto fail; 4707 } 4708 4709 /* 4710 * Try to set up MSI/MSI-X interrupts. 4711 */ 4712 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 4713 != 0) { 4714 nvme_release_interrupts(nvme); 4715 4716 nqueues = MIN(UINT16_MAX, ncpus); 4717 4718 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 4719 nqueues) != DDI_SUCCESS) && 4720 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 4721 nqueues) != DDI_SUCCESS)) { 4722 dev_err(nvme->n_dip, CE_WARN, 4723 "!failed to set up MSI/MSI-X interrupts"); 4724 goto fail; 4725 } 4726 } 4727 4728 /* 4729 * Create I/O queue pairs. 4730 */ 4731 4732 if (nvme_set_nqueues(nvme) != 0) { 4733 dev_err(nvme->n_dip, CE_WARN, 4734 "!failed to set number of I/O queues to %d", 4735 nvme->n_intr_cnt); 4736 goto fail; 4737 } 4738 4739 /* 4740 * Reallocate I/O queue array 4741 */ 4742 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 4743 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 4744 (nvme->n_submission_queues + 1), KM_SLEEP); 4745 nvme->n_ioq[0] = nvme->n_adminq; 4746 4747 /* 4748 * There should always be at least as many submission queues 4749 * as completion queues. 
4750 */ 4751 ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues); 4752 4753 nvme->n_ioq_count = nvme->n_submission_queues; 4754 4755 nvme->n_io_squeue_len = 4756 MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries); 4757 4758 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len", 4759 nvme->n_io_squeue_len); 4760 4761 /* 4762 * Pre-allocate completion queues. 4763 * When there are the same number of submission and completion 4764 * queues there is no value in having a larger completion 4765 * queue length. 4766 */ 4767 if (nvme->n_submission_queues == nvme->n_completion_queues) 4768 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 4769 nvme->n_io_squeue_len); 4770 4771 nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len, 4772 nvme->n_max_queue_entries); 4773 4774 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len", 4775 nvme->n_io_cqueue_len); 4776 4777 /* 4778 * Assign the equal quantity of taskq threads to each completion 4779 * queue, capping the total number of threads to the number 4780 * of CPUs. 4781 */ 4782 tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues; 4783 4784 /* 4785 * In case the calculation above is zero, we need at least one 4786 * thread per completion queue. 4787 */ 4788 tq_threads = MAX(1, tq_threads); 4789 4790 if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1, 4791 nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) { 4792 dev_err(nvme->n_dip, CE_WARN, 4793 "!failed to pre-allocate completion queues"); 4794 goto fail; 4795 } 4796 4797 /* 4798 * If we use less completion queues than interrupt vectors return 4799 * some of the interrupt vectors back to the system. 4800 */ 4801 if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) { 4802 nvme_release_interrupts(nvme); 4803 4804 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 4805 nvme->n_completion_queues + 1) != DDI_SUCCESS) { 4806 dev_err(nvme->n_dip, CE_WARN, 4807 "!failed to reduce number of interrupts"); 4808 goto fail; 4809 } 4810 } 4811 4812 /* 4813 * Alloc & register I/O queue pairs 4814 */ 4815 4816 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 4817 if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len, 4818 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 4819 dev_err(nvme->n_dip, CE_WARN, 4820 "!unable to allocate I/O qpair %d", i); 4821 goto fail; 4822 } 4823 4824 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) { 4825 dev_err(nvme->n_dip, CE_WARN, 4826 "!unable to create I/O qpair %d", i); 4827 goto fail; 4828 } 4829 } 4830 4831 /* 4832 * Enable any host behavior features that make sense for us. 4833 */ 4834 nvme_enable_host_behavior(nvme); 4835 4836 return (DDI_SUCCESS); 4837 4838 fail: 4839 (void) nvme_reset(nvme, B_FALSE); 4840 return (DDI_FAILURE); 4841 } 4842 4843 static uint_t 4844 nvme_intr(caddr_t arg1, caddr_t arg2) 4845 { 4846 nvme_t *nvme = (nvme_t *)arg1; 4847 int inum = (int)(uintptr_t)arg2; 4848 int ccnt = 0; 4849 int qnum; 4850 4851 if (inum >= nvme->n_intr_cnt) 4852 return (DDI_INTR_UNCLAIMED); 4853 4854 if (nvme->n_dead) { 4855 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ? 4856 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED); 4857 } 4858 4859 /* 4860 * The interrupt vector a queue uses is calculated as queue_idx % 4861 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 4862 * in steps of n_intr_cnt to process all queues using this vector. 
4863 */ 4864 for (qnum = inum; 4865 qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL; 4866 qnum += nvme->n_intr_cnt) { 4867 ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]); 4868 } 4869 4870 return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 4871 } 4872 4873 static void 4874 nvme_release_interrupts(nvme_t *nvme) 4875 { 4876 int i; 4877 4878 for (i = 0; i < nvme->n_intr_cnt; i++) { 4879 if (nvme->n_inth[i] == NULL) 4880 break; 4881 4882 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 4883 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 4884 else 4885 (void) ddi_intr_disable(nvme->n_inth[i]); 4886 4887 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 4888 (void) ddi_intr_free(nvme->n_inth[i]); 4889 } 4890 4891 kmem_free(nvme->n_inth, nvme->n_inth_sz); 4892 nvme->n_inth = NULL; 4893 nvme->n_inth_sz = 0; 4894 4895 nvme->n_progress &= ~NVME_INTERRUPTS; 4896 } 4897 4898 static int 4899 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 4900 { 4901 int nintrs, navail, count; 4902 int ret; 4903 int i; 4904 4905 if (nvme->n_intr_types == 0) { 4906 ret = ddi_intr_get_supported_types(nvme->n_dip, 4907 &nvme->n_intr_types); 4908 if (ret != DDI_SUCCESS) { 4909 dev_err(nvme->n_dip, CE_WARN, 4910 "!%s: ddi_intr_get_supported types failed", 4911 __func__); 4912 return (ret); 4913 } 4914 #ifdef __x86 4915 if (get_hwenv() == HW_VMWARE) 4916 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 4917 #endif 4918 } 4919 4920 if ((nvme->n_intr_types & intr_type) == 0) 4921 return (DDI_FAILURE); 4922 4923 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 4924 if (ret != DDI_SUCCESS) { 4925 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 4926 __func__); 4927 return (ret); 4928 } 4929 4930 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 4931 if (ret != DDI_SUCCESS) { 4932 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 4933 __func__); 4934 return (ret); 4935 } 4936 4937 /* We want at most one interrupt per queue pair. 
*/ 4938 if (navail > nqpairs) 4939 navail = nqpairs; 4940 4941 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 4942 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 4943 4944 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 4945 &count, 0); 4946 if (ret != DDI_SUCCESS) { 4947 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 4948 __func__); 4949 goto fail; 4950 } 4951 4952 nvme->n_intr_cnt = count; 4953 4954 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 4955 if (ret != DDI_SUCCESS) { 4956 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 4957 __func__); 4958 goto fail; 4959 } 4960 4961 for (i = 0; i < count; i++) { 4962 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 4963 (void *)nvme, (void *)(uintptr_t)i); 4964 if (ret != DDI_SUCCESS) { 4965 dev_err(nvme->n_dip, CE_WARN, 4966 "!%s: ddi_intr_add_handler failed", __func__); 4967 goto fail; 4968 } 4969 } 4970 4971 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 4972 4973 for (i = 0; i < count; i++) { 4974 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 4975 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 4976 else 4977 ret = ddi_intr_enable(nvme->n_inth[i]); 4978 4979 if (ret != DDI_SUCCESS) { 4980 dev_err(nvme->n_dip, CE_WARN, 4981 "!%s: enabling interrupt %d failed", __func__, i); 4982 goto fail; 4983 } 4984 } 4985 4986 nvme->n_intr_type = intr_type; 4987 4988 nvme->n_progress |= NVME_INTERRUPTS; 4989 4990 return (DDI_SUCCESS); 4991 4992 fail: 4993 nvme_release_interrupts(nvme); 4994 4995 return (ret); 4996 } 4997 4998 static int 4999 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 5000 { 5001 _NOTE(ARGUNUSED(arg)); 5002 5003 pci_ereport_post(dip, fm_error, NULL); 5004 return (fm_error->fme_status); 5005 } 5006 5007 static void 5008 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a, 5009 void *b) 5010 { 5011 nvme_t *nvme = a; 5012 5013 nvme_ctrl_mark_dead(nvme, B_TRUE); 5014 5015 /* 5016 * Fail all outstanding commands, including those in the admin queue 5017 * (queue 0). 5018 */ 5019 for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) { 5020 nvme_qpair_t *qp = nvme->n_ioq[i]; 5021 5022 mutex_enter(&qp->nq_mutex); 5023 for (size_t j = 0; j < qp->nq_nentry; j++) { 5024 nvme_cmd_t *cmd = qp->nq_cmd[j]; 5025 nvme_cmd_t *u_cmd; 5026 5027 if (cmd == NULL) { 5028 continue; 5029 } 5030 5031 /* 5032 * Since we have the queue lock held the entire time we 5033 * iterate over it, it's not possible for the queue to 5034 * change underneath us. Thus, we don't need to check 5035 * that the return value of nvme_unqueue_cmd matches the 5036 * requested cmd to unqueue. 
5037 */ 5038 u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid); 5039 taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, 5040 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 5041 5042 ASSERT3P(u_cmd, ==, cmd); 5043 } 5044 mutex_exit(&qp->nq_mutex); 5045 } 5046 } 5047 5048 /* 5049 * Open minor management 5050 */ 5051 static int 5052 nvme_minor_comparator(const void *l, const void *r) 5053 { 5054 const nvme_minor_t *lm = l; 5055 const nvme_minor_t *rm = r; 5056 5057 if (lm->nm_minor > rm->nm_minor) { 5058 return (1); 5059 } else if (lm->nm_minor < rm->nm_minor) { 5060 return (-1); 5061 } else { 5062 return (0); 5063 } 5064 } 5065 5066 static void 5067 nvme_minor_free(nvme_minor_t *minor) 5068 { 5069 if (minor->nm_minor > 0) { 5070 ASSERT3S(minor->nm_minor, >=, NVME_OPEN_MINOR_MIN); 5071 id_free(nvme_open_minors, minor->nm_minor); 5072 minor->nm_minor = 0; 5073 } 5074 VERIFY0(list_link_active(&minor->nm_ctrl_lock.nli_node)); 5075 VERIFY0(list_link_active(&minor->nm_ns_lock.nli_node)); 5076 cv_destroy(&minor->nm_cv); 5077 kmem_free(minor, sizeof (nvme_minor_t)); 5078 } 5079 5080 static nvme_minor_t * 5081 nvme_minor_find_by_dev(dev_t dev) 5082 { 5083 id_t id = (id_t)getminor(dev); 5084 nvme_minor_t search = { .nm_minor = id }; 5085 nvme_minor_t *ret; 5086 5087 mutex_enter(&nvme_open_minors_mutex); 5088 ret = avl_find(&nvme_open_minors_avl, &search, NULL); 5089 mutex_exit(&nvme_open_minors_mutex); 5090 5091 return (ret); 5092 } 5093 5094 static int 5095 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 5096 { 5097 nvme_t *nvme; 5098 int instance; 5099 int nregs; 5100 off_t regsize; 5101 char name[32]; 5102 5103 if (cmd != DDI_ATTACH) 5104 return (DDI_FAILURE); 5105 5106 instance = ddi_get_instance(dip); 5107 5108 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 5109 return (DDI_FAILURE); 5110 5111 nvme = ddi_get_soft_state(nvme_state, instance); 5112 ddi_set_driver_private(dip, nvme); 5113 nvme->n_dip = dip; 5114 5115 /* 5116 * Map PCI config space 5117 */ 5118 if (pci_config_setup(dip, &nvme->n_pcicfg_handle) != DDI_SUCCESS) { 5119 dev_err(dip, CE_WARN, "!failed to map PCI config space"); 5120 goto fail; 5121 } 5122 nvme->n_progress |= NVME_PCI_CONFIG; 5123 5124 /* 5125 * Get the various PCI IDs from config space 5126 */ 5127 nvme->n_vendor_id = 5128 pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_VENID); 5129 nvme->n_device_id = 5130 pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_DEVID); 5131 nvme->n_revision_id = 5132 pci_config_get8(nvme->n_pcicfg_handle, PCI_CONF_REVID); 5133 nvme->n_subsystem_device_id = 5134 pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBSYSID); 5135 nvme->n_subsystem_vendor_id = 5136 pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBVENID); 5137 5138 nvme_detect_quirks(nvme); 5139 5140 /* 5141 * Set up event handlers for hot removal. While npe(4D) supports the hot 5142 * removal event being injected for devices, the same is not true of all 5143 * of our possible parents (i.e. pci(4D) as of this writing). The most 5144 * common case this shows up is in some virtualization environments. We 5145 * should treat this as non-fatal so that way devices work but leave 5146 * this set up in such a way that if a nexus does grow support for this 5147 * we're good to go. 
5148 */ 5149 if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT, 5150 &nvme->n_rm_cookie) == DDI_SUCCESS) { 5151 if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie, 5152 nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) != 5153 DDI_SUCCESS) { 5154 goto fail; 5155 } 5156 } else { 5157 nvme->n_ev_rm_cb_id = NULL; 5158 } 5159 5160 mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL); 5161 nvme->n_progress |= NVME_MUTEX_INIT; 5162 5163 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5164 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 5165 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 5166 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 5167 B_TRUE : B_FALSE; 5168 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5169 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 5170 nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5171 DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN); 5172 /* 5173 * Double up the default for completion queues in case of 5174 * queue sharing. 5175 */ 5176 nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5177 DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN); 5178 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5179 DDI_PROP_DONTPASS, "async-event-limit", 5180 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 5181 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5182 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 5183 B_TRUE : B_FALSE; 5184 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5185 DDI_PROP_DONTPASS, "min-phys-block-size", 5186 NVME_DEFAULT_MIN_BLOCK_SIZE); 5187 nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5188 DDI_PROP_DONTPASS, "max-submission-queues", -1); 5189 nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 5190 DDI_PROP_DONTPASS, "max-completion-queues", -1); 5191 5192 if (!ISP2(nvme->n_min_block_size) || 5193 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 5194 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 5195 "using default %d", ISP2(nvme->n_min_block_size) ? 5196 "too low" : "not a power of 2", 5197 NVME_DEFAULT_MIN_BLOCK_SIZE); 5198 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 5199 } 5200 5201 if (nvme->n_submission_queues != -1 && 5202 (nvme->n_submission_queues < 1 || 5203 nvme->n_submission_queues > UINT16_MAX)) { 5204 dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not " 5205 "valid. Must be [1..%d]", nvme->n_submission_queues, 5206 UINT16_MAX); 5207 nvme->n_submission_queues = -1; 5208 } 5209 5210 if (nvme->n_completion_queues != -1 && 5211 (nvme->n_completion_queues < 1 || 5212 nvme->n_completion_queues > UINT16_MAX)) { 5213 dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not " 5214 "valid. 
Must be [1..%d]", nvme->n_completion_queues, 5215 UINT16_MAX); 5216 nvme->n_completion_queues = -1; 5217 } 5218 5219 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 5220 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 5221 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 5222 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 5223 5224 if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN) 5225 nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN; 5226 if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN) 5227 nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN; 5228 5229 if (nvme->n_async_event_limit < 1) 5230 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 5231 5232 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 5233 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 5234 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 5235 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 5236 5237 /* 5238 * Set up FMA support. 5239 */ 5240 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 5241 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 5242 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 5243 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 5244 5245 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 5246 5247 if (nvme->n_fm_cap) { 5248 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 5249 nvme->n_reg_acc_attr.devacc_attr_access = 5250 DDI_FLAGERR_ACC; 5251 5252 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 5253 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 5254 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 5255 } 5256 5257 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 5258 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 5259 pci_ereport_setup(dip); 5260 5261 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 5262 ddi_fm_handler_register(dip, nvme_fm_errcb, 5263 (void *)nvme); 5264 } 5265 5266 nvme->n_progress |= NVME_FMA_INIT; 5267 5268 /* 5269 * The spec defines several register sets. Only the controller 5270 * registers (set 1) are currently used. 5271 */ 5272 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 5273 nregs < 2 || 5274 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 5275 goto fail; 5276 5277 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 5278 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 5279 dev_err(dip, CE_WARN, "!failed to map regset 1"); 5280 goto fail; 5281 } 5282 5283 nvme->n_progress |= NVME_REGS_MAPPED; 5284 5285 /* 5286 * Set up kstats 5287 */ 5288 if (!nvme_stat_init(nvme)) { 5289 dev_err(dip, CE_WARN, "!failed to create device kstats"); 5290 goto fail; 5291 } 5292 nvme->n_progress |= NVME_STAT_INIT; 5293 5294 /* 5295 * Create PRP DMA cache 5296 */ 5297 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 5298 ddi_driver_name(dip), ddi_get_instance(dip)); 5299 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 5300 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 5301 NULL, (void *)nvme, NULL, 0); 5302 5303 if (nvme_init(nvme) != DDI_SUCCESS) 5304 goto fail; 5305 5306 /* 5307 * Initialize the driver with the UFM subsystem 5308 */ 5309 if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops, 5310 &nvme->n_ufmh, nvme) != 0) { 5311 dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem"); 5312 goto fail; 5313 } 5314 mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL); 5315 ddi_ufm_update(nvme->n_ufmh); 5316 nvme->n_progress |= NVME_UFM_INIT; 5317 5318 nvme_mgmt_lock_init(&nvme->n_mgmt); 5319 nvme_lock_init(&nvme->n_lock); 5320 nvme->n_progress |= NVME_MGMT_INIT; 5321 5322 /* 5323 * Identify namespaces. 
5324 */ 5325 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 5326 5327 boolean_t minor_logged = B_FALSE; 5328 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) { 5329 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i); 5330 5331 nvme_lock_init(&ns->ns_lock); 5332 ns->ns_progress |= NVME_NS_LOCK; 5333 5334 /* 5335 * Namespaces start out in the active state. This is the 5336 * default state until we find out information about the 5337 * namespaces in more detail. nvme_init_ns() will go through and 5338 * determine what the proper state should be. It will also use 5339 * this state change to keep an accurate count of attachable 5340 * namespaces. 5341 */ 5342 ns->ns_state = NVME_NS_STATE_ACTIVE; 5343 if (nvme_init_ns(nvme, i) != 0) { 5344 nvme_mgmt_unlock(nvme); 5345 goto fail; 5346 } 5347 5348 /* 5349 * We only create compat minor nodes for the namespace for the 5350 * first NVME_MINOR_MAX namespaces. Those that are beyond this 5351 * can only be accessed through the primary controller node, 5352 * which is generally fine as that's what libnvme uses and is 5353 * our preferred path. Not having a minor is better than not 5354 * having the namespace! 5355 */ 5356 if (i > NVME_MINOR_MAX) { 5357 if (!minor_logged) { 5358 dev_err(dip, CE_WARN, "namespace minor " 5359 "creation limited to the first %u " 5360 "namespaces, device has %u", 5361 NVME_MINOR_MAX, nvme->n_namespace_count); 5362 minor_logged = B_TRUE; 5363 } 5364 continue; 5365 } 5366 5367 if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR, 5368 NVME_MINOR(ddi_get_instance(nvme->n_dip), i), 5369 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 5370 nvme_mgmt_unlock(nvme); 5371 dev_err(dip, CE_WARN, 5372 "!failed to create minor node for namespace %d", i); 5373 goto fail; 5374 } 5375 ns->ns_progress |= NVME_NS_MINOR; 5376 } 5377 5378 /* 5379 * Indicate that namespace initialization is complete and therefore 5380 * marking the controller dead can evaluate every namespace lock. 5381 */ 5382 nvme->n_progress |= NVME_NS_INIT; 5383 5384 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 5385 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) != 5386 DDI_SUCCESS) { 5387 nvme_mgmt_unlock(nvme); 5388 dev_err(dip, CE_WARN, "nvme_attach: " 5389 "cannot create devctl minor node"); 5390 goto fail; 5391 } 5392 5393 /* 5394 * Attempt to attach all namespaces that are in a reasonable state. This 5395 * should not fail attach. 5396 */ 5397 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) { 5398 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i); 5399 nvme_ioctl_common_t com = { .nioc_nsid = i }; 5400 5401 if (ns->ns_state < NVME_NS_STATE_NOT_IGNORED) 5402 continue; 5403 5404 if (!nvme_bd_attach_ns(nvme, &com) && com.nioc_drv_err != 5405 NVME_IOCTL_E_UNSUP_ATTACH_NS) { 5406 dev_err(nvme->n_dip, CE_WARN, "!failed to attach " 5407 "namespace %d due to blkdev error (0x%x)", i, 5408 com.nioc_drv_err); 5409 } 5410 } 5411 5412 nvme_mgmt_unlock(nvme); 5413 5414 /* 5415 * As the last thing that we do, we finally go ahead and enable 5416 * asynchronous event notifications. Currently we rely upon whatever 5417 * defaults the device has for the events that we will receive. If we 5418 * enable this earlier, it's possible that we'll get events that we 5419 * cannot handle yet because all of our data structures are not valid. 5420 * The device will queue all asynchronous events on a per-log page basis 5421 * until we submit this. If the device is totally broken, it will have 5422 * likely failed our commands already. 
If we add support for configuring 5423 * which asynchronous events we would like to receive via the SET 5424 * FEATURES command, then we should do that as one of the first commands 5425 * we send in nvme_init(). 5426 * 5427 * We start by assuming asynchronous events are supported. However, not 5428 * all devices (e.g. some versions of QEMU) support this, so we end up 5429 * tracking whether or not we think these actually work. 5430 */ 5431 nvme->n_async_event_supported = B_TRUE; 5432 for (uint16_t i = 0; i < nvme->n_async_event_limit; i++) { 5433 nvme_async_event(nvme); 5434 } 5435 5436 5437 return (DDI_SUCCESS); 5438 5439 fail: 5440 /* attach successful anyway so that FMA can retire the device */ 5441 if (nvme->n_dead) 5442 return (DDI_SUCCESS); 5443 5444 (void) nvme_detach(dip, DDI_DETACH); 5445 5446 return (DDI_FAILURE); 5447 } 5448 5449 static int 5450 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 5451 { 5452 int instance; 5453 nvme_t *nvme; 5454 5455 if (cmd != DDI_DETACH) 5456 return (DDI_FAILURE); 5457 5458 instance = ddi_get_instance(dip); 5459 5460 nvme = ddi_get_soft_state(nvme_state, instance); 5461 5462 if (nvme == NULL) 5463 return (DDI_FAILURE); 5464 5465 /* 5466 * Remove all minor nodes from the device regardless of the source in 5467 * one swoop. 5468 */ 5469 ddi_remove_minor_node(dip, NULL); 5470 5471 /* 5472 * We need to remove the event handler as one of the first things that 5473 * we do. If we proceed with other teardown without removing the event 5474 * handler, we could end up in a very unfortunate race with ourselves. 5475 * The DDI does not serialize these with detach (just like timeout(9F) 5476 * and others). 5477 */ 5478 if (nvme->n_ev_rm_cb_id != NULL) { 5479 (void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id); 5480 } 5481 nvme->n_ev_rm_cb_id = NULL; 5482 5483 /* 5484 * If the controller was marked dead, there is a slight chance that we 5485 * are asynchronously processing the removal taskq. Because we have 5486 * removed the callback handler above and all minor nodes and commands 5487 * are closed, there is no other way to get in here. As such, we wait on 5488 * the nvme_dead_taskq to complete so we can avoid tracking if it's 5489 * running or not.
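 * taskq_wait(9F) returns only once every task dispatched to the taskq
 * has finished executing, so no separate flag is needed to tell whether
 * the removal work is still in flight.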
5490 */ 5491 taskq_wait(nvme_dead_taskq); 5492 5493 if (nvme->n_ns) { 5494 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) { 5495 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i); 5496 5497 if (ns->ns_bd_hdl) { 5498 (void) bd_detach_handle(ns->ns_bd_hdl); 5499 bd_free_handle(ns->ns_bd_hdl); 5500 } 5501 5502 if (ns->ns_idns) 5503 kmem_free(ns->ns_idns, 5504 sizeof (nvme_identify_nsid_t)); 5505 if (ns->ns_devid) 5506 strfree(ns->ns_devid); 5507 5508 if ((ns->ns_progress & NVME_NS_LOCK) != 0) 5509 nvme_lock_fini(&ns->ns_lock); 5510 } 5511 5512 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 5513 nvme->n_namespace_count); 5514 } 5515 5516 if (nvme->n_progress & NVME_MGMT_INIT) { 5517 nvme_lock_fini(&nvme->n_lock); 5518 nvme_mgmt_lock_fini(&nvme->n_mgmt); 5519 } 5520 5521 if (nvme->n_progress & NVME_UFM_INIT) { 5522 ddi_ufm_fini(nvme->n_ufmh); 5523 mutex_destroy(&nvme->n_fwslot_mutex); 5524 } 5525 5526 if (nvme->n_progress & NVME_INTERRUPTS) 5527 nvme_release_interrupts(nvme); 5528 5529 for (uint_t i = 0; i < nvme->n_cq_count; i++) { 5530 if (nvme->n_cq[i]->ncq_cmd_taskq != NULL) 5531 taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); 5532 } 5533 5534 if (nvme->n_progress & NVME_MUTEX_INIT) { 5535 mutex_destroy(&nvme->n_minor_mutex); 5536 } 5537 5538 if (nvme->n_ioq_count > 0) { 5539 for (uint_t i = 1; i != nvme->n_ioq_count + 1; i++) { 5540 if (nvme->n_ioq[i] != NULL) { 5541 /* TODO: send destroy queue commands */ 5542 nvme_free_qpair(nvme->n_ioq[i]); 5543 } 5544 } 5545 5546 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 5547 (nvme->n_ioq_count + 1)); 5548 } 5549 5550 if (nvme->n_prp_cache != NULL) { 5551 kmem_cache_destroy(nvme->n_prp_cache); 5552 } 5553 5554 if (nvme->n_progress & NVME_REGS_MAPPED) { 5555 nvme_shutdown(nvme, B_FALSE); 5556 (void) nvme_reset(nvme, B_FALSE); 5557 } 5558 5559 if (nvme->n_progress & NVME_CTRL_LIMITS) 5560 sema_destroy(&nvme->n_abort_sema); 5561 5562 if (nvme->n_progress & NVME_ADMIN_QUEUE) 5563 nvme_free_qpair(nvme->n_adminq); 5564 5565 if (nvme->n_cq_count > 0) { 5566 nvme_destroy_cq_array(nvme, 0); 5567 nvme->n_cq = NULL; 5568 nvme->n_cq_count = 0; 5569 } 5570 5571 if (nvme->n_idcomns) 5572 kmem_free(nvme->n_idcomns, NVME_IDENTIFY_BUFSIZE); 5573 5574 if (nvme->n_idctl) 5575 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); 5576 5577 if (nvme->n_progress & NVME_REGS_MAPPED) 5578 ddi_regs_map_free(&nvme->n_regh); 5579 5580 if (nvme->n_progress & NVME_STAT_INIT) 5581 nvme_stat_cleanup(nvme); 5582 5583 if (nvme->n_progress & NVME_FMA_INIT) { 5584 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 5585 ddi_fm_handler_unregister(nvme->n_dip); 5586 5587 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 5588 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 5589 pci_ereport_teardown(nvme->n_dip); 5590 5591 ddi_fm_fini(nvme->n_dip); 5592 } 5593 5594 if (nvme->n_progress & NVME_PCI_CONFIG) 5595 pci_config_teardown(&nvme->n_pcicfg_handle); 5596 5597 if (nvme->n_vendor != NULL) 5598 strfree(nvme->n_vendor); 5599 5600 if (nvme->n_product != NULL) 5601 strfree(nvme->n_product); 5602 5603 ddi_soft_state_free(nvme_state, instance); 5604 5605 return (DDI_SUCCESS); 5606 } 5607 5608 static int 5609 nvme_quiesce(dev_info_t *dip) 5610 { 5611 int instance; 5612 nvme_t *nvme; 5613 5614 instance = ddi_get_instance(dip); 5615 5616 nvme = ddi_get_soft_state(nvme_state, instance); 5617 5618 if (nvme == NULL) 5619 return (DDI_FAILURE); 5620 5621 nvme_shutdown(nvme, B_TRUE); 5622 5623 (void) nvme_reset(nvme, B_TRUE); 5624 5625 return (DDI_SUCCESS); 5626 } 5627 5628 static int 5629 nvme_fill_prp(nvme_cmd_t *cmd, 
ddi_dma_handle_t dma) 5630 { 5631 nvme_t *nvme = cmd->nc_nvme; 5632 uint_t nprp_per_page, nprp; 5633 uint64_t *prp; 5634 const ddi_dma_cookie_t *cookie; 5635 uint_t idx; 5636 uint_t ncookies = ddi_dma_ncookies(dma); 5637 5638 if (ncookies == 0) 5639 return (DDI_FAILURE); 5640 5641 if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL) 5642 return (DDI_FAILURE); 5643 cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress; 5644 5645 if (ncookies == 1) { 5646 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 5647 return (DDI_SUCCESS); 5648 } else if (ncookies == 2) { 5649 if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL) 5650 return (DDI_FAILURE); 5651 cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress; 5652 return (DDI_SUCCESS); 5653 } 5654 5655 /* 5656 * At this point, we're always operating on cookies at 5657 * index >= 1 and writing the addresses of those cookies 5658 * into a new page. The address of that page is stored 5659 * as the second PRP entry. 5660 */ 5661 nprp_per_page = nvme->n_pagesize / sizeof (uint64_t); 5662 ASSERT(nprp_per_page > 0); 5663 5664 /* 5665 * We currently don't support chained PRPs and set up our DMA 5666 * attributes to reflect that. If we still get an I/O request 5667 * that needs a chained PRP something is very wrong. Account 5668 * for the first cookie here, which we've placed in d_prp[0]. 5669 */ 5670 nprp = howmany(ncookies - 1, nprp_per_page); 5671 VERIFY(nprp == 1); 5672 5673 /* 5674 * Allocate a page of pointers, in which we'll write the 5675 * addresses of cookies 1 to `ncookies`. 5676 */ 5677 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 5678 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 5679 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress; 5680 5681 prp = (uint64_t *)cmd->nc_prp->nd_memp; 5682 for (idx = 1; idx < ncookies; idx++) { 5683 if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL) 5684 return (DDI_FAILURE); 5685 *prp++ = cookie->dmac_laddress; 5686 } 5687 5688 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 5689 DDI_DMA_SYNC_FORDEV); 5690 return (DDI_SUCCESS); 5691 } 5692 5693 /* 5694 * The maximum number of requests supported for a deallocate request is 5695 * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and 5696 * unchanged through at least 1.4a). The definition of nvme_range_t is also 5697 * from the NVMe 1.1 spec. Together, the result is that all of the ranges for 5698 * a deallocate request will fit into the smallest supported namespace page 5699 * (4k). 5700 */ 5701 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096); 5702 5703 static int 5704 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize, 5705 int allocflag) 5706 { 5707 const dkioc_free_list_t *dfl = xfer->x_dfl; 5708 const dkioc_free_list_ext_t *exts = dfl->dfl_exts; 5709 nvme_t *nvme = cmd->nc_nvme; 5710 nvme_range_t *ranges = NULL; 5711 uint_t i; 5712 5713 /* 5714 * The number of ranges in the request is 0s based (that is 5715 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ..., 5716 * word10 == 255 -> 256 ranges). Therefore the allowed values are 5717 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request, 5718 * we either provided bad info in nvme_bd_driveinfo() or there is a bug 5719 * in blkdev. 
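 *
 * As a purely illustrative example (numbers made up, not taken from any
 * particular device): a request with dfl_num_exts == 4 is encoded as
 * sqe_cdw10 == 3 below, and a 1 MiB extent starting at byte offset 8 MiB
 * on a namespace with 4096-byte blocks becomes a range entry with
 * nr_lba == 2048 and nr_len == 256.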
5720 */ 5721 VERIFY3U(dfl->dfl_num_exts, >, 0); 5722 VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES); 5723 cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff; 5724 5725 cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE; 5726 5727 cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag); 5728 if (cmd->nc_prp == NULL) 5729 return (DDI_FAILURE); 5730 5731 bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len); 5732 ranges = (nvme_range_t *)cmd->nc_prp->nd_memp; 5733 5734 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress; 5735 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 5736 5737 for (i = 0; i < dfl->dfl_num_exts; i++) { 5738 uint64_t lba, len; 5739 5740 lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize; 5741 len = exts[i].dfle_length / blocksize; 5742 5743 VERIFY3U(len, <=, UINT32_MAX); 5744 5745 /* No context attributes for a deallocate request */ 5746 ranges[i].nr_ctxattr = 0; 5747 ranges[i].nr_len = len; 5748 ranges[i].nr_lba = lba; 5749 } 5750 5751 (void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len, 5752 DDI_DMA_SYNC_FORDEV); 5753 5754 return (DDI_SUCCESS); 5755 } 5756 5757 static nvme_cmd_t * 5758 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 5759 { 5760 nvme_t *nvme = ns->ns_nvme; 5761 nvme_cmd_t *cmd; 5762 int allocflag; 5763 5764 /* 5765 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 5766 */ 5767 allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP; 5768 cmd = nvme_alloc_cmd(nvme, allocflag); 5769 5770 if (cmd == NULL) 5771 return (NULL); 5772 5773 cmd->nc_sqe.sqe_opc = opc; 5774 cmd->nc_callback = nvme_bd_xfer_done; 5775 cmd->nc_xfer = xfer; 5776 5777 switch (opc) { 5778 case NVME_OPC_NVM_WRITE: 5779 case NVME_OPC_NVM_READ: 5780 VERIFY(xfer->x_nblks <= 0x10000); 5781 5782 cmd->nc_sqe.sqe_nsid = ns->ns_id; 5783 5784 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 5785 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 5786 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 5787 5788 if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS) 5789 goto fail; 5790 break; 5791 5792 case NVME_OPC_NVM_FLUSH: 5793 cmd->nc_sqe.sqe_nsid = ns->ns_id; 5794 break; 5795 5796 case NVME_OPC_NVM_DSET_MGMT: 5797 cmd->nc_sqe.sqe_nsid = ns->ns_id; 5798 5799 if (nvme_fill_ranges(cmd, xfer, 5800 (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS) 5801 goto fail; 5802 break; 5803 5804 default: 5805 goto fail; 5806 } 5807 5808 return (cmd); 5809 5810 fail: 5811 nvme_free_cmd(cmd); 5812 return (NULL); 5813 } 5814 5815 static void 5816 nvme_bd_xfer_done(void *arg) 5817 { 5818 nvme_cmd_t *cmd = arg; 5819 bd_xfer_t *xfer = cmd->nc_xfer; 5820 int error = 0; 5821 5822 error = nvme_check_cmd_status(cmd); 5823 nvme_free_cmd(cmd); 5824 5825 bd_xfer_done(xfer, error); 5826 } 5827 5828 static void 5829 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 5830 { 5831 nvme_namespace_t *ns = arg; 5832 nvme_t *nvme = ns->ns_nvme; 5833 uint_t ns_count = MAX(1, nvme->n_namespaces_attachable); 5834 5835 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_BDRO); 5836 5837 /* 5838 * Set the blkdev qcount to the number of submission queues. 5839 * It will then create one waitq/runq pair for each submission 5840 * queue and spread I/O requests across the queues. 5841 */ 5842 drive->d_qcount = nvme->n_ioq_count; 5843 5844 /* 5845 * I/O activity to individual namespaces is distributed across 5846 * each of the d_qcount blkdev queues (which has been set to 5847 * the number of nvme submission queues). 
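 * (Illustrative example only: a controller that was granted eight
 * interrupt vectors ends up with n_ioq_count == 8, so blkdev creates
 * eight waitq/runq pairs for each attached namespace and spreads I/O
 * across them.)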
d_qsize is the number 5848 * of submitted and not completed I/Os within each queue that blkdev 5849 * will allow before it starts holding them in the waitq. 5850 * 5851 * Each namespace will create a child blkdev instance, for each one 5852 * we try and set the d_qsize so that each namespace gets an 5853 * equal portion of the submission queue. 5854 * 5855 * If post instantiation of the nvme drive, n_namespaces_attachable 5856 * changes and a namespace is attached it could calculate a 5857 * different d_qsize. It may even be that the sum of the d_qsizes is 5858 * now beyond the submission queue size. Should that be the case 5859 * and the I/O rate is such that blkdev attempts to submit more 5860 * I/Os than the size of the submission queue, the excess I/Os 5861 * will be held behind the semaphore nq_sema. 5862 */ 5863 drive->d_qsize = nvme->n_io_squeue_len / ns_count; 5864 5865 /* 5866 * Don't let the queue size drop below the minimum, though. 5867 */ 5868 drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN); 5869 5870 /* 5871 * d_maxxfer is not set, which means the value is taken from the DMA 5872 * attributes specified to bd_alloc_handle. 5873 */ 5874 5875 drive->d_removable = B_FALSE; 5876 drive->d_hotpluggable = B_FALSE; 5877 5878 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 5879 drive->d_target = ns->ns_id; 5880 drive->d_lun = 0; 5881 5882 drive->d_model = nvme->n_idctl->id_model; 5883 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 5884 drive->d_vendor = nvme->n_vendor; 5885 drive->d_vendor_len = strlen(nvme->n_vendor); 5886 drive->d_product = nvme->n_product; 5887 drive->d_product_len = strlen(nvme->n_product); 5888 drive->d_serial = nvme->n_idctl->id_serial; 5889 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 5890 drive->d_revision = nvme->n_idctl->id_fwrev; 5891 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 5892 5893 /* 5894 * If we support the dataset management command, the only restrictions 5895 * on a discard request are the maximum number of ranges (segments) 5896 * per single request. 5897 */ 5898 if (nvme->n_idctl->id_oncs.on_dset_mgmt) 5899 drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES; 5900 5901 nvme_mgmt_unlock(nvme); 5902 } 5903 5904 static int 5905 nvme_bd_mediainfo(void *arg, bd_media_t *media) 5906 { 5907 nvme_namespace_t *ns = arg; 5908 nvme_t *nvme = ns->ns_nvme; 5909 5910 if (nvme->n_dead) { 5911 return (EIO); 5912 } 5913 5914 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_BDRO); 5915 5916 media->m_nblks = ns->ns_block_count; 5917 media->m_blksize = ns->ns_block_size; 5918 media->m_readonly = B_FALSE; 5919 media->m_solidstate = B_TRUE; 5920 5921 media->m_pblksize = ns->ns_best_block_size; 5922 5923 nvme_mgmt_unlock(nvme); 5924 5925 return (0); 5926 } 5927 5928 static int 5929 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 5930 { 5931 nvme_t *nvme = ns->ns_nvme; 5932 nvme_cmd_t *cmd; 5933 nvme_qpair_t *ioq; 5934 boolean_t poll; 5935 int ret; 5936 5937 if (nvme->n_dead) { 5938 return (EIO); 5939 } 5940 5941 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 5942 if (cmd == NULL) 5943 return (ENOMEM); 5944 5945 cmd->nc_sqid = xfer->x_qnum + 1; 5946 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 5947 ioq = nvme->n_ioq[cmd->nc_sqid]; 5948 5949 /* 5950 * Get the polling flag before submitting the command. The command may 5951 * complete immediately after it was submitted, which means we must 5952 * treat both cmd and xfer as if they have been freed already. 
5953 */ 5954 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 5955 5956 ret = nvme_submit_io_cmd(ioq, cmd); 5957 5958 if (ret != 0) 5959 return (ret); 5960 5961 if (!poll) 5962 return (0); 5963 5964 do { 5965 cmd = nvme_retrieve_cmd(nvme, ioq); 5966 if (cmd != NULL) { 5967 ASSERT0(cmd->nc_flags & NVME_CMD_F_USELOCK); 5968 cmd->nc_callback(cmd); 5969 } else { 5970 drv_usecwait(10); 5971 } 5972 } while (ioq->nq_active_cmds != 0); 5973 5974 return (0); 5975 } 5976 5977 static int 5978 nvme_bd_read(void *arg, bd_xfer_t *xfer) 5979 { 5980 nvme_namespace_t *ns = arg; 5981 5982 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 5983 } 5984 5985 static int 5986 nvme_bd_write(void *arg, bd_xfer_t *xfer) 5987 { 5988 nvme_namespace_t *ns = arg; 5989 5990 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 5991 } 5992 5993 static int 5994 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 5995 { 5996 nvme_namespace_t *ns = arg; 5997 5998 if (ns->ns_nvme->n_dead) 5999 return (EIO); 6000 6001 /* 6002 * If the volatile write cache is not present or not enabled the FLUSH 6003 * command is a no-op, so we can take a shortcut here. 6004 */ 6005 if (!ns->ns_nvme->n_write_cache_present) { 6006 bd_xfer_done(xfer, ENOTSUP); 6007 return (0); 6008 } 6009 6010 if (!ns->ns_nvme->n_write_cache_enabled) { 6011 bd_xfer_done(xfer, 0); 6012 return (0); 6013 } 6014 6015 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 6016 } 6017 6018 static int 6019 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 6020 { 6021 nvme_namespace_t *ns = arg; 6022 nvme_t *nvme = ns->ns_nvme; 6023 6024 if (nvme->n_dead) { 6025 return (EIO); 6026 } 6027 6028 if (*(uint64_t *)ns->ns_nguid != 0 || 6029 *(uint64_t *)(ns->ns_nguid + 8) != 0) { 6030 return (ddi_devid_init(devinfo, DEVID_NVME_NGUID, 6031 sizeof (ns->ns_nguid), ns->ns_nguid, devid)); 6032 } else if (*(uint64_t *)ns->ns_eui64 != 0) { 6033 return (ddi_devid_init(devinfo, DEVID_NVME_EUI64, 6034 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 6035 } else { 6036 return (ddi_devid_init(devinfo, DEVID_NVME_NSID, 6037 strlen(ns->ns_devid), ns->ns_devid, devid)); 6038 } 6039 } 6040 6041 static int 6042 nvme_bd_free_space(void *arg, bd_xfer_t *xfer) 6043 { 6044 nvme_namespace_t *ns = arg; 6045 6046 if (xfer->x_dfl == NULL) 6047 return (EINVAL); 6048 6049 if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt) 6050 return (ENOTSUP); 6051 6052 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT)); 6053 } 6054 6055 static int 6056 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 6057 { 6058 #ifndef __lock_lint 6059 _NOTE(ARGUNUSED(cred_p)); 6060 #endif 6061 nvme_t *nvme; 6062 nvme_minor_t *minor = NULL; 6063 uint32_t nsid; 6064 minor_t m = getminor(*devp); 6065 int rv = 0; 6066 6067 if (otyp != OTYP_CHR) 6068 return (EINVAL); 6069 6070 if (m >= NVME_OPEN_MINOR_MIN) 6071 return (ENXIO); 6072 6073 nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(m)); 6074 nsid = NVME_MINOR_NSID(m); 6075 6076 if (nvme == NULL) 6077 return (ENXIO); 6078 6079 if (nsid > MIN(nvme->n_namespace_count, NVME_MINOR_MAX)) 6080 return (ENXIO); 6081 6082 if (nvme->n_dead) 6083 return (EIO); 6084 6085 /* 6086 * At this point, we're going to allow an open to proceed on this 6087 * device. We need to allocate a new instance for this (presuming one is 6088 * available). 
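 * Note that the minor number handed back through *devp at the end of
 * this function is not the (instance, namespace) minor that was opened;
 * a fresh id is allocated from the nvme_open_minors id space below so
 * that every open(2) gets its own nvme_minor_t.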
6089 */ 6090 minor = kmem_zalloc(sizeof (nvme_minor_t), KM_NOSLEEP_LAZY); 6091 if (minor == NULL) { 6092 return (ENOMEM); 6093 } 6094 6095 cv_init(&minor->nm_cv, NULL, CV_DRIVER, NULL); 6096 list_link_init(&minor->nm_ctrl_lock.nli_node); 6097 minor->nm_ctrl_lock.nli_nvme = nvme; 6098 minor->nm_ctrl_lock.nli_minor = minor; 6099 list_link_init(&minor->nm_ns_lock.nli_node); 6100 minor->nm_ns_lock.nli_nvme = nvme; 6101 minor->nm_ns_lock.nli_minor = minor; 6102 minor->nm_minor = id_alloc_nosleep(nvme_open_minors); 6103 if (minor->nm_minor == -1) { 6104 nvme_minor_free(minor); 6105 return (ENOSPC); 6106 } 6107 6108 minor->nm_ctrl = nvme; 6109 if (nsid != 0) { 6110 minor->nm_ns = nvme_nsid2ns(nvme, nsid); 6111 } 6112 6113 /* 6114 * Before we check for exclusive access and attempt a lock if requested, 6115 * ensure that this minor is persisted. 6116 */ 6117 mutex_enter(&nvme_open_minors_mutex); 6118 avl_add(&nvme_open_minors_avl, minor); 6119 mutex_exit(&nvme_open_minors_mutex); 6120 6121 /* 6122 * A request for opening this FEXCL, is translated into a non-blocking 6123 * write lock of the appropriate entity. This honors the original 6124 * semantics here. In the future, we should see if we can remove this 6125 * and turn a request for FEXCL at open into ENOTSUP. 6126 */ 6127 mutex_enter(&nvme->n_minor_mutex); 6128 if ((flag & FEXCL) != 0) { 6129 nvme_ioctl_lock_t lock = { 6130 .nil_level = NVME_LOCK_L_WRITE, 6131 .nil_flags = NVME_LOCK_F_DONT_BLOCK 6132 }; 6133 6134 if (minor->nm_ns != NULL) { 6135 lock.nil_ent = NVME_LOCK_E_NS; 6136 lock.nil_common.nioc_nsid = nsid; 6137 } else { 6138 lock.nil_ent = NVME_LOCK_E_CTRL; 6139 } 6140 nvme_rwlock(minor, &lock); 6141 if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) { 6142 mutex_exit(&nvme->n_minor_mutex); 6143 6144 mutex_enter(&nvme_open_minors_mutex); 6145 avl_remove(&nvme_open_minors_avl, minor); 6146 mutex_exit(&nvme_open_minors_mutex); 6147 6148 nvme_minor_free(minor); 6149 return (EBUSY); 6150 } 6151 } 6152 mutex_exit(&nvme->n_minor_mutex); 6153 6154 *devp = makedevice(getmajor(*devp), (minor_t)minor->nm_minor); 6155 return (rv); 6156 6157 } 6158 6159 static int 6160 nvme_close(dev_t dev, int flag __unused, int otyp, cred_t *cred_p __unused) 6161 { 6162 nvme_minor_t *minor; 6163 nvme_t *nvme; 6164 6165 if (otyp != OTYP_CHR) { 6166 return (ENXIO); 6167 } 6168 6169 minor = nvme_minor_find_by_dev(dev); 6170 if (minor == NULL) { 6171 return (ENXIO); 6172 } 6173 6174 mutex_enter(&nvme_open_minors_mutex); 6175 avl_remove(&nvme_open_minors_avl, minor); 6176 mutex_exit(&nvme_open_minors_mutex); 6177 6178 /* 6179 * When this device is being closed, we must ensure that any locks held 6180 * by this are dealt with. 
6181 */ 6182 nvme = minor->nm_ctrl; 6183 mutex_enter(&nvme->n_minor_mutex); 6184 ASSERT3U(minor->nm_ctrl_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED); 6185 ASSERT3U(minor->nm_ns_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED); 6186 6187 if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) { 6188 VERIFY3P(minor->nm_ctrl_lock.nli_lock, !=, NULL); 6189 nvme_rwunlock(&minor->nm_ctrl_lock, 6190 minor->nm_ctrl_lock.nli_lock); 6191 } 6192 6193 if (minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) { 6194 VERIFY3P(minor->nm_ns_lock.nli_lock, !=, NULL); 6195 nvme_rwunlock(&minor->nm_ns_lock, minor->nm_ns_lock.nli_lock); 6196 } 6197 mutex_exit(&nvme->n_minor_mutex); 6198 6199 nvme_minor_free(minor); 6200 6201 return (0); 6202 } 6203 6204 void 6205 nvme_ioctl_success(nvme_ioctl_common_t *ioc) 6206 { 6207 ioc->nioc_drv_err = NVME_IOCTL_E_OK; 6208 ioc->nioc_ctrl_sc = NVME_CQE_SC_GEN_SUCCESS; 6209 ioc->nioc_ctrl_sct = NVME_CQE_SCT_GENERIC; 6210 } 6211 6212 boolean_t 6213 nvme_ioctl_error(nvme_ioctl_common_t *ioc, nvme_ioctl_errno_t err, uint32_t sct, 6214 uint32_t sc) 6215 { 6216 ioc->nioc_drv_err = err; 6217 ioc->nioc_ctrl_sct = sct; 6218 ioc->nioc_ctrl_sc = sc; 6219 6220 return (B_FALSE); 6221 } 6222 6223 static int 6224 nvme_ioctl_copyout_error(nvme_ioctl_errno_t err, intptr_t uaddr, int mode) 6225 { 6226 nvme_ioctl_common_t ioc; 6227 6228 ASSERT3U(err, !=, NVME_IOCTL_E_CTRL_ERROR); 6229 bzero(&ioc, sizeof (ioc)); 6230 if (ddi_copyout(&ioc, (void *)uaddr, sizeof (nvme_ioctl_common_t), 6231 mode & FKIOCTL) != 0) { 6232 return (EFAULT); 6233 } 6234 return (0); 6235 } 6236 6237 /* 6238 * The companion to the namespace checking. This occurs after any rewriting 6239 * occurs. This is the primary point that we attempt to enforce any operation's 6240 * exclusivity. Note, it is theoretically possible for an operation to be 6241 * ongoing and to have someone with an exclusive lock ask to unlock it for some 6242 * reason. This does not maintain the number of such events that are going on. 6243 * While perhaps this is leaving too much up to the user, by the same token we 6244 * don't try to stop them from issuing two different format NVM commands 6245 * targeting the whole device at the same time either, even though the 6246 * controller would really rather that didn't happen. 6247 */ 6248 static boolean_t 6249 nvme_ioctl_excl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc, 6250 const nvme_ioctl_check_t *check) 6251 { 6252 nvme_t *const nvme = minor->nm_ctrl; 6253 nvme_namespace_t *ns; 6254 boolean_t have_ctrl, have_ns, ctrl_is_excl, ns_is_excl; 6255 6256 /* 6257 * If the command doesn't require anything, then we're done. 6258 */ 6259 if (check->nck_excl == NVME_IOCTL_EXCL_SKIP) { 6260 return (B_TRUE); 6261 } 6262 6263 if (ioc->nioc_nsid == 0 || ioc->nioc_nsid == NVME_NSID_BCAST) { 6264 ns = NULL; 6265 } else { 6266 ns = nvme_nsid2ns(nvme, ioc->nioc_nsid); 6267 } 6268 6269 mutex_enter(&nvme->n_minor_mutex); 6270 ctrl_is_excl = nvme->n_lock.nl_writer != NULL; 6271 have_ctrl = nvme->n_lock.nl_writer == &minor->nm_ctrl_lock; 6272 if (ns != NULL) { 6273 /* 6274 * We explicitly test the namespace lock's writer versus asking 6275 * the minor because the minor's namespace lock may apply to a 6276 * different namespace. 
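 * For example, a minor that holds the write lock on namespace 1 but is
 * issuing an ioctl that targets namespace 3 must not be treated as
 * holding namespace 3's lock.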
6277 */ 6278 ns_is_excl = ns->ns_lock.nl_writer != NULL; 6279 have_ns = ns->ns_lock.nl_writer == &minor->nm_ns_lock; 6280 ASSERT0(have_ctrl && have_ns); 6281 #ifdef DEBUG 6282 if (have_ns) { 6283 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, ns); 6284 } 6285 #endif 6286 } else { 6287 ns_is_excl = B_FALSE; 6288 have_ns = B_FALSE; 6289 } 6290 ASSERT0(ctrl_is_excl && ns_is_excl); 6291 mutex_exit(&nvme->n_minor_mutex); 6292 6293 if (check->nck_excl == NVME_IOCTL_EXCL_CTRL) { 6294 if (have_ctrl) { 6295 return (B_TRUE); 6296 } 6297 6298 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NEED_CTRL_WRLOCK, 6299 0, 0)); 6300 } 6301 6302 if (check->nck_excl == NVME_IOCTL_EXCL_WRITE) { 6303 if (ns == NULL) { 6304 if (have_ctrl) { 6305 return (B_TRUE); 6306 } 6307 return (nvme_ioctl_error(ioc, 6308 NVME_IOCTL_E_NEED_CTRL_WRLOCK, 0, 0)); 6309 } else { 6310 if (have_ctrl || have_ns) { 6311 return (B_TRUE); 6312 } 6313 return (nvme_ioctl_error(ioc, 6314 NVME_IOCTL_E_NEED_NS_WRLOCK, 0, 0)); 6315 } 6316 } 6317 6318 /* 6319 * Now we have an operation that does not require exclusive access. We 6320 * can proceed as long as no one else has it or if someone does it is 6321 * us. Regardless of what we target, a controller lock will stop us. 6322 */ 6323 if (ctrl_is_excl && !have_ctrl) { 6324 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_LOCKED, 0, 0)); 6325 } 6326 6327 /* 6328 * Only check namespace exclusivity if we are targeting one. 6329 */ 6330 if (ns != NULL && ns_is_excl && !have_ns) { 6331 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_LOCKED, 0, 0)); 6332 } 6333 6334 return (B_TRUE); 6335 } 6336 6337 /* 6338 * Perform common checking as to whether or not an ioctl operation may proceed. 6339 * We check in this function various aspects of the namespace attributes that 6340 * it's calling on. Once the namespace attributes and any possible rewriting 6341 * have been performed, then we proceed to check whether or not the requisite 6342 * exclusive access is present in nvme_ioctl_excl_check(). 6343 */ 6344 static boolean_t 6345 nvme_ioctl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc, 6346 const nvme_ioctl_check_t *check) 6347 { 6348 /* 6349 * If the minor has a namespace pointer, then it is constrained to that 6350 * namespace. If a namespace is allowed, then there are only two valid 6351 * values that we can find. The first is matching the minor. The second 6352 * is our value zero, which will be transformed to the current 6353 * namespace. 6354 */ 6355 if (minor->nm_ns != NULL) { 6356 if (!check->nck_ns_ok || !check->nck_ns_minor_ok) { 6357 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NOT_CTRL, 0, 6358 0)); 6359 } 6360 6361 if (ioc->nioc_nsid == 0) { 6362 ioc->nioc_nsid = minor->nm_ns->ns_id; 6363 } else if (ioc->nioc_nsid != minor->nm_ns->ns_id) { 6364 return (nvme_ioctl_error(ioc, 6365 NVME_IOCTL_E_MINOR_WRONG_NS, 0, 0)); 6366 } 6367 6368 return (nvme_ioctl_excl_check(minor, ioc, check)); 6369 } 6370 6371 /* 6372 * If we've been told to skip checking the controller, here's where we 6373 * do that. This should really only be for commands which use the 6374 * namespace ID for listing purposes and therefore can have 6375 * traditionally illegal values here. 6376 */ 6377 if (check->nck_skip_ctrl) { 6378 return (nvme_ioctl_excl_check(minor, ioc, check)); 6379 } 6380 6381 /* 6382 * At this point, we know that we're on the controller's node. We first 6383 * deal with the simple case, is a namespace allowed at all or not. If 6384 * it is not allowed, then the only acceptable value is zero. 
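 * For example, an ioctl whose check structure has nck_ns_ok set to
 * B_FALSE, but which arrives with nioc_nsid == 3, fails below with
 * NVME_IOCTL_E_NS_UNUSE.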
6385 */ 6386 if (!check->nck_ns_ok) { 6387 if (ioc->nioc_nsid != 0) { 6388 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_UNUSE, 0, 6389 0)); 6390 } 6391 6392 return (nvme_ioctl_excl_check(minor, ioc, check)); 6393 } 6394 6395 /* 6396 * At this point, we know that a controller is allowed to use a 6397 * namespace. If we haven't been given zero or the broadcast namespace, 6398 * check to see if it's actually a valid namespace ID. If is outside of 6399 * range, then it is an error. Next, if we have been requested to 6400 * rewrite 0 (the this controller indicator) as the broadcast namespace, 6401 * do so. 6402 * 6403 * While we validate that this namespace is within the valid range, we 6404 * do not check if it is active or inactive. That is left to our callers 6405 * to determine. 6406 */ 6407 if (ioc->nioc_nsid > minor->nm_ctrl->n_namespace_count && 6408 ioc->nioc_nsid != NVME_NSID_BCAST) { 6409 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_RANGE, 0, 0)); 6410 } 6411 6412 if (ioc->nioc_nsid == 0 && check->nck_ctrl_rewrite) { 6413 ioc->nioc_nsid = NVME_NSID_BCAST; 6414 } 6415 6416 /* 6417 * Finally, see if we have ended up with a broadcast namespace ID 6418 * whether through specification or rewriting. If that is not allowed, 6419 * then that is an error. 6420 */ 6421 if (!check->nck_bcast_ok && ioc->nioc_nsid == NVME_NSID_BCAST) { 6422 return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_BCAST_NS, 0, 0)); 6423 } 6424 6425 return (nvme_ioctl_excl_check(minor, ioc, check)); 6426 } 6427 6428 static int 6429 nvme_ioctl_ctrl_info(nvme_minor_t *minor, intptr_t arg, int mode, 6430 cred_t *cred_p) 6431 { 6432 nvme_t *const nvme = minor->nm_ctrl; 6433 nvme_ioctl_ctrl_info_t *info; 6434 nvme_reg_cap_t cap = { 0 }; 6435 nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_CTRL }; 6436 void *idbuf; 6437 6438 if ((mode & FREAD) == 0) 6439 return (EBADF); 6440 6441 info = kmem_alloc(sizeof (nvme_ioctl_ctrl_info_t), KM_NOSLEEP_LAZY); 6442 if (info == NULL) { 6443 return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg, 6444 mode)); 6445 } 6446 6447 if (ddi_copyin((void *)arg, info, sizeof (nvme_ioctl_ctrl_info_t), 6448 mode & FKIOCTL) != 0) { 6449 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t)); 6450 return (EFAULT); 6451 } 6452 6453 if (!nvme_ioctl_check(minor, &info->nci_common, 6454 &nvme_check_ctrl_info)) { 6455 goto copyout; 6456 } 6457 6458 /* 6459 * We explicitly do not use the identify controller copy in the kernel 6460 * right now so that way we can get a snapshot of the controller's 6461 * current capacity and values. While it's tempting to try to use this 6462 * to refresh the kernel's version we don't just to simplify the rest of 6463 * the driver right now. 6464 */ 6465 if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) { 6466 info->nci_common = id.nid_common; 6467 goto copyout; 6468 } 6469 bcopy(idbuf, &info->nci_ctrl_id, sizeof (nvme_identify_ctrl_t)); 6470 kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE); 6471 6472 /* 6473 * Use the kernel's cached common namespace information for this. 6474 */ 6475 bcopy(nvme->n_idcomns, &info->nci_common_ns, 6476 sizeof (nvme_identify_nsid_t)); 6477 6478 info->nci_vers = nvme->n_version; 6479 6480 /* 6481 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 6482 * specify the base page size of 4k (1<<12), so add 12 here to 6483 * get the real page size value. 
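 *
 * Worked example (values illustrative): a controller reporting
 * MPSMIN == 0 and MPSMAX == 4 supports memory page sizes from
 * 1 << (12 + 0) == 4 KiB up to 1 << (12 + 4) == 64 KiB, which is
 * exactly what the two shifts below compute.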
6484 */ 6485 cap.r = nvme_get64(nvme, NVME_REG_CAP); 6486 info->nci_caps.cap_mpsmax = 1 << (12 + cap.b.cap_mpsmax); 6487 info->nci_caps.cap_mpsmin = 1 << (12 + cap.b.cap_mpsmin); 6488 6489 info->nci_nintrs = (uint32_t)nvme->n_intr_cnt; 6490 6491 copyout: 6492 if (ddi_copyout(info, (void *)arg, sizeof (nvme_ioctl_ctrl_info_t), 6493 mode & FKIOCTL) != 0) { 6494 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t)); 6495 return (EFAULT); 6496 } 6497 6498 kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t)); 6499 return (0); 6500 } 6501 6502 static int 6503 nvme_ioctl_ns_info(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p) 6504 { 6505 nvme_t *const nvme = minor->nm_ctrl; 6506 nvme_ioctl_ns_info_t *ns_info; 6507 nvme_namespace_t *ns; 6508 nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_NSID }; 6509 void *idbuf; 6510 6511 if ((mode & FREAD) == 0) 6512 return (EBADF); 6513 6514 ns_info = kmem_zalloc(sizeof (nvme_ioctl_ns_info_t), KM_NOSLEEP_LAZY); 6515 if (ns_info == NULL) { 6516 return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg, 6517 mode)); 6518 } 6519 6520 if (ddi_copyin((void *)arg, ns_info, sizeof (nvme_ioctl_ns_info_t), 6521 mode & FKIOCTL) != 0) { 6522 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t)); 6523 return (EFAULT); 6524 } 6525 6526 if (!nvme_ioctl_check(minor, &ns_info->nni_common, 6527 &nvme_check_ns_info)) { 6528 goto copyout; 6529 } 6530 6531 ASSERT3U(ns_info->nni_common.nioc_nsid, >, 0); 6532 ns = nvme_nsid2ns(nvme, ns_info->nni_common.nioc_nsid); 6533 6534 /* 6535 * First fetch a fresh copy of the namespace information. Most callers 6536 * are using this because they will want a mostly accurate snapshot of 6537 * capacity and utilization. 6538 */ 6539 id.nid_common.nioc_nsid = ns_info->nni_common.nioc_nsid; 6540 if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) { 6541 ns_info->nni_common = id.nid_common; 6542 goto copyout; 6543 } 6544 bcopy(idbuf, &ns_info->nni_id, sizeof (nvme_identify_nsid_t)); 6545 kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE); 6546 6547 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 6548 ns_info->nni_state = ns->ns_state; 6549 if (ns->ns_state >= NVME_NS_STATE_ATTACHED) { 6550 const char *addr; 6551 6552 ns_info->nni_state = NVME_NS_STATE_ATTACHED; 6553 addr = bd_address(ns->ns_bd_hdl); 6554 if (strlcpy(ns_info->nni_addr, addr, 6555 sizeof (ns_info->nni_addr)) >= sizeof (ns_info->nni_addr)) { 6556 nvme_mgmt_unlock(nvme); 6557 (void) nvme_ioctl_error(&ns_info->nni_common, 6558 NVME_IOCTL_E_BD_ADDR_OVER, 0, 0); 6559 goto copyout; 6560 } 6561 } 6562 nvme_mgmt_unlock(nvme); 6563 6564 copyout: 6565 if (ddi_copyout(ns_info, (void *)arg, sizeof (nvme_ioctl_ns_info_t), 6566 mode & FKIOCTL) != 0) { 6567 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t)); 6568 return (EFAULT); 6569 } 6570 6571 kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t)); 6572 return (0); 6573 } 6574 6575 static int 6576 nvme_ioctl_identify(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p) 6577 { 6578 _NOTE(ARGUNUSED(cred_p)); 6579 nvme_t *const nvme = minor->nm_ctrl; 6580 void *idctl; 6581 uint_t model; 6582 nvme_ioctl_identify_t id; 6583 #ifdef _MULTI_DATAMODEL 6584 nvme_ioctl_identify32_t id32; 6585 #endif 6586 boolean_t ns_minor; 6587 6588 if ((mode & FREAD) == 0) 6589 return (EBADF); 6590 6591 model = ddi_model_convert_from(mode); 6592 switch (model) { 6593 #ifdef _MULTI_DATAMODEL 6594 case DDI_MODEL_ILP32: 6595 bzero(&id, sizeof (id)); 6596 if (ddi_copyin((void *)arg, &id32, sizeof (id32), 6597 mode & FKIOCTL) != 0) { 6598 return (EFAULT); 6599 } 6600 
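		/*
		 * Widen the ILP32 structure into the native one field by
		 * field; the 32-bit layout differs (most notably the user
		 * data pointer is narrower), so a plain structure copy
		 * would not be correct here.
		 */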
id.nid_common.nioc_nsid = id32.nid_common.nioc_nsid; 6601 id.nid_cns = id32.nid_cns; 6602 id.nid_ctrlid = id32.nid_ctrlid; 6603 id.nid_data = id32.nid_data; 6604 break; 6605 #endif /* _MULTI_DATAMODEL */ 6606 case DDI_MODEL_NONE: 6607 if (ddi_copyin((void *)arg, &id, sizeof (id), 6608 mode & FKIOCTL) != 0) { 6609 return (EFAULT); 6610 } 6611 break; 6612 default: 6613 return (ENOTSUP); 6614 } 6615 6616 if (!nvme_ioctl_check(minor, &id.nid_common, &nvme_check_identify)) { 6617 goto copyout; 6618 } 6619 6620 ns_minor = minor->nm_ns != NULL; 6621 if (!nvme_validate_identify(nvme, &id, ns_minor)) { 6622 goto copyout; 6623 } 6624 6625 if (nvme_identify(nvme, B_TRUE, &id, &idctl)) { 6626 int ret = ddi_copyout(idctl, (void *)id.nid_data, 6627 NVME_IDENTIFY_BUFSIZE, mode & FKIOCTL); 6628 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 6629 if (ret != 0) { 6630 (void) nvme_ioctl_error(&id.nid_common, 6631 NVME_IOCTL_E_BAD_USER_DATA, 0, 0); 6632 goto copyout; 6633 } 6634 6635 nvme_ioctl_success(&id.nid_common); 6636 } 6637 6638 copyout: 6639 switch (model) { 6640 #ifdef _MULTI_DATAMODEL 6641 case DDI_MODEL_ILP32: 6642 id32.nid_common = id.nid_common; 6643 6644 if (ddi_copyout(&id32, (void *)arg, sizeof (id32), 6645 mode & FKIOCTL) != 0) { 6646 return (EFAULT); 6647 } 6648 break; 6649 #endif /* _MULTI_DATAMODEL */ 6650 case DDI_MODEL_NONE: 6651 if (ddi_copyout(&id, (void *)arg, sizeof (id), 6652 mode & FKIOCTL) != 0) { 6653 return (EFAULT); 6654 } 6655 break; 6656 default: 6657 return (ENOTSUP); 6658 } 6659 6660 return (0); 6661 } 6662 6663 /* 6664 * Execute commands on behalf of the various ioctls. 6665 * 6666 * If this returns true then the command completed successfully. Otherwise error 6667 * information is returned in the nvme_ioctl_common_t arguments. 6668 */ 6669 static boolean_t 6670 nvme_ioc_cmd(nvme_t *nvme, nvme_ioctl_common_t *ioc, nvme_ioc_cmd_args_t *args) 6671 { 6672 nvme_cmd_t *cmd; 6673 boolean_t ret = B_FALSE; 6674 6675 cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP); 6676 cmd->nc_sqid = 0; 6677 6678 /* 6679 * This function is used to facilitate requests from 6680 * userspace, so don't panic if the command fails. This 6681 * is especially true for admin passthru commands, where 6682 * the actual command data structure is entirely defined 6683 * by userspace. 
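 *
 * A caller typically fills an nvme_ioc_cmd_args_t roughly the way the
 * get-feature ioctl further below does. This is a sketch only; buf,
 * buflen, and ioc are stand-ins rather than names the driver uses:
 *
 *	nvme_sqe_t sqe = { .sqe_opc = NVME_OPC_GET_FEATURES };
 *	nvme_ioc_cmd_args_t args = { NULL };
 *
 *	args.ica_sqe = &sqe;
 *	args.ica_data = buf;
 *	args.ica_data_len = buflen;
 *	args.ica_dma_flags = DDI_DMA_READ;
 *	args.ica_copy_flags = mode;
 *	args.ica_timeout = nvme_admin_cmd_timeout;
 *	(void) nvme_ioc_cmd(nvme, &ioc, &args);
 *
 * On failure, the error details have already been recorded in the
 * nvme_ioctl_common_t, so callers generally just copy that back out.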
6684 */ 6685 cmd->nc_flags |= NVME_CMD_F_DONTPANIC; 6686 6687 cmd->nc_callback = nvme_wakeup_cmd; 6688 cmd->nc_sqe = *args->ica_sqe; 6689 6690 if ((args->ica_dma_flags & DDI_DMA_RDWR) != 0) { 6691 if (args->ica_data == NULL) { 6692 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_DMA_MEM, 6693 0, 0); 6694 goto free_cmd; 6695 } 6696 6697 if (nvme_zalloc_dma(nvme, args->ica_data_len, 6698 args->ica_dma_flags, &nvme->n_prp_dma_attr, &cmd->nc_dma) != 6699 DDI_SUCCESS) { 6700 dev_err(nvme->n_dip, CE_WARN, 6701 "!nvme_zalloc_dma failed for nvme_ioc_cmd()"); 6702 ret = nvme_ioctl_error(ioc, 6703 NVME_IOCTL_E_NO_DMA_MEM, 0, 0); 6704 goto free_cmd; 6705 } 6706 6707 if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) { 6708 ret = nvme_ioctl_error(ioc, 6709 NVME_IOCTL_E_NO_DMA_MEM, 0, 0); 6710 goto free_cmd; 6711 } 6712 6713 if ((args->ica_dma_flags & DDI_DMA_WRITE) != 0 && 6714 ddi_copyin(args->ica_data, cmd->nc_dma->nd_memp, 6715 args->ica_data_len, args->ica_copy_flags) != 0) { 6716 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 6717 0, 0); 6718 goto free_cmd; 6719 } 6720 } 6721 6722 nvme_admin_cmd(cmd, args->ica_timeout); 6723 6724 if (!nvme_check_cmd_status_ioctl(cmd, ioc)) { 6725 ret = B_FALSE; 6726 goto free_cmd; 6727 } 6728 6729 args->ica_cdw0 = cmd->nc_cqe.cqe_dw0; 6730 6731 if ((args->ica_dma_flags & DDI_DMA_READ) != 0 && 6732 ddi_copyout(cmd->nc_dma->nd_memp, args->ica_data, 6733 args->ica_data_len, args->ica_copy_flags) != 0) { 6734 ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 0, 0); 6735 goto free_cmd; 6736 } 6737 6738 ret = B_TRUE; 6739 nvme_ioctl_success(ioc); 6740 6741 free_cmd: 6742 nvme_free_cmd(cmd); 6743 6744 return (ret); 6745 } 6746 6747 static int 6748 nvme_ioctl_get_logpage(nvme_minor_t *minor, intptr_t arg, int mode, 6749 cred_t *cred_p) 6750 { 6751 nvme_t *const nvme = minor->nm_ctrl; 6752 void *buf; 6753 nvme_ioctl_get_logpage_t log; 6754 uint_t model; 6755 #ifdef _MULTI_DATAMODEL 6756 nvme_ioctl_get_logpage32_t log32; 6757 #endif 6758 6759 if ((mode & FREAD) == 0) { 6760 return (EBADF); 6761 } 6762 6763 model = ddi_model_convert_from(mode); 6764 switch (model) { 6765 #ifdef _MULTI_DATAMODEL 6766 case DDI_MODEL_ILP32: 6767 bzero(&log, sizeof (log)); 6768 if (ddi_copyin((void *)arg, &log32, sizeof (log32), 6769 mode & FKIOCTL) != 0) { 6770 return (EFAULT); 6771 } 6772 6773 log.nigl_common.nioc_nsid = log32.nigl_common.nioc_nsid; 6774 log.nigl_csi = log32.nigl_csi; 6775 log.nigl_lid = log32.nigl_lid; 6776 log.nigl_lsp = log32.nigl_lsp; 6777 log.nigl_len = log32.nigl_len; 6778 log.nigl_offset = log32.nigl_offset; 6779 log.nigl_data = log32.nigl_data; 6780 break; 6781 #endif /* _MULTI_DATAMODEL */ 6782 case DDI_MODEL_NONE: 6783 if (ddi_copyin((void *)arg, &log, sizeof (log), 6784 mode & FKIOCTL) != 0) { 6785 return (EFAULT); 6786 } 6787 break; 6788 default: 6789 return (ENOTSUP); 6790 } 6791 6792 /* 6793 * Eventually we'd like to do a soft lock on the namespaces from 6794 * changing out from us during this operation in the future. But we 6795 * haven't implemented that yet. 
6796 */ 6797 if (!nvme_ioctl_check(minor, &log.nigl_common, 6798 &nvme_check_get_logpage)) { 6799 goto copyout; 6800 } 6801 6802 if (!nvme_validate_logpage(nvme, &log)) { 6803 goto copyout; 6804 } 6805 6806 if (nvme_get_logpage(nvme, B_TRUE, &log, &buf)) { 6807 int copy; 6808 6809 copy = ddi_copyout(buf, (void *)log.nigl_data, log.nigl_len, 6810 mode & FKIOCTL); 6811 kmem_free(buf, log.nigl_len); 6812 if (copy != 0) { 6813 (void) nvme_ioctl_error(&log.nigl_common, 6814 NVME_IOCTL_E_BAD_USER_DATA, 0, 0); 6815 goto copyout; 6816 } 6817 6818 nvme_ioctl_success(&log.nigl_common); 6819 } 6820 6821 copyout: 6822 switch (model) { 6823 #ifdef _MULTI_DATAMODEL 6824 case DDI_MODEL_ILP32: 6825 bzero(&log32, sizeof (log32)); 6826 6827 log32.nigl_common = log.nigl_common; 6828 log32.nigl_csi = log.nigl_csi; 6829 log32.nigl_lid = log.nigl_lid; 6830 log32.nigl_lsp = log.nigl_lsp; 6831 log32.nigl_len = log.nigl_len; 6832 log32.nigl_offset = log.nigl_offset; 6833 log32.nigl_data = log.nigl_data; 6834 if (ddi_copyout(&log32, (void *)arg, sizeof (log32), 6835 mode & FKIOCTL) != 0) { 6836 return (EFAULT); 6837 } 6838 break; 6839 #endif /* _MULTI_DATAMODEL */ 6840 case DDI_MODEL_NONE: 6841 if (ddi_copyout(&log, (void *)arg, sizeof (log), 6842 mode & FKIOCTL) != 0) { 6843 return (EFAULT); 6844 } 6845 break; 6846 default: 6847 return (ENOTSUP); 6848 } 6849 6850 return (0); 6851 } 6852 6853 static int 6854 nvme_ioctl_get_feature(nvme_minor_t *minor, intptr_t arg, int mode, 6855 cred_t *cred_p) 6856 { 6857 nvme_t *const nvme = minor->nm_ctrl; 6858 nvme_ioctl_get_feature_t feat; 6859 uint_t model; 6860 #ifdef _MULTI_DATAMODEL 6861 nvme_ioctl_get_feature32_t feat32; 6862 #endif 6863 nvme_get_features_dw10_t gf_dw10 = { 0 }; 6864 nvme_ioc_cmd_args_t args = { NULL }; 6865 nvme_sqe_t sqe = { 6866 .sqe_opc = NVME_OPC_GET_FEATURES 6867 }; 6868 6869 if ((mode & FREAD) == 0) { 6870 return (EBADF); 6871 } 6872 6873 model = ddi_model_convert_from(mode); 6874 switch (model) { 6875 #ifdef _MULTI_DATAMODEL 6876 case DDI_MODEL_ILP32: 6877 bzero(&feat, sizeof (feat)); 6878 if (ddi_copyin((void *)arg, &feat32, sizeof (feat32), 6879 mode & FKIOCTL) != 0) { 6880 return (EFAULT); 6881 } 6882 6883 feat.nigf_common.nioc_nsid = feat32.nigf_common.nioc_nsid; 6884 feat.nigf_fid = feat32.nigf_fid; 6885 feat.nigf_sel = feat32.nigf_sel; 6886 feat.nigf_cdw11 = feat32.nigf_cdw11; 6887 feat.nigf_data = feat32.nigf_data; 6888 feat.nigf_len = feat32.nigf_len; 6889 break; 6890 #endif /* _MULTI_DATAMODEL */ 6891 case DDI_MODEL_NONE: 6892 if (ddi_copyin((void *)arg, &feat, sizeof (feat), 6893 mode & FKIOCTL) != 0) { 6894 return (EFAULT); 6895 } 6896 break; 6897 default: 6898 return (ENOTSUP); 6899 } 6900 6901 if (!nvme_ioctl_check(minor, &feat.nigf_common, 6902 &nvme_check_get_feature)) { 6903 goto copyout; 6904 } 6905 6906 if (!nvme_validate_get_feature(nvme, &feat)) { 6907 goto copyout; 6908 } 6909 6910 gf_dw10.b.gt_fid = bitx32(feat.nigf_fid, 7, 0); 6911 gf_dw10.b.gt_sel = bitx32(feat.nigf_sel, 2, 0); 6912 sqe.sqe_cdw10 = gf_dw10.r; 6913 sqe.sqe_cdw11 = feat.nigf_cdw11; 6914 sqe.sqe_nsid = feat.nigf_common.nioc_nsid; 6915 6916 args.ica_sqe = &sqe; 6917 if (feat.nigf_len != 0) { 6918 args.ica_data = (void *)feat.nigf_data; 6919 args.ica_data_len = feat.nigf_len; 6920 args.ica_dma_flags = DDI_DMA_READ; 6921 } 6922 args.ica_copy_flags = mode; 6923 args.ica_timeout = nvme_admin_cmd_timeout; 6924 6925 if (!nvme_ioc_cmd(nvme, &feat.nigf_common, &args)) { 6926 goto copyout; 6927 } 6928 6929 feat.nigf_cdw0 = args.ica_cdw0; 6930 6931 copyout: 6932 switch 
(model) { 6933 #ifdef _MULTI_DATAMODEL 6934 case DDI_MODEL_ILP32: 6935 bzero(&feat32, sizeof (feat32)); 6936 6937 feat32.nigf_common = feat.nigf_common; 6938 feat32.nigf_fid = feat.nigf_fid; 6939 feat32.nigf_sel = feat.nigf_sel; 6940 feat32.nigf_cdw11 = feat.nigf_cdw11; 6941 feat32.nigf_data = feat.nigf_data; 6942 feat32.nigf_len = feat.nigf_len; 6943 feat32.nigf_cdw0 = feat.nigf_cdw0; 6944 if (ddi_copyout(&feat32, (void *)arg, sizeof (feat32), 6945 mode & FKIOCTL) != 0) { 6946 return (EFAULT); 6947 } 6948 break; 6949 #endif /* _MULTI_DATAMODEL */ 6950 case DDI_MODEL_NONE: 6951 if (ddi_copyout(&feat, (void *)arg, sizeof (feat), 6952 mode & FKIOCTL) != 0) { 6953 return (EFAULT); 6954 } 6955 break; 6956 default: 6957 return (ENOTSUP); 6958 } 6959 6960 return (0); 6961 } 6962 6963 static int 6964 nvme_ioctl_format(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p) 6965 { 6966 nvme_t *const nvme = minor->nm_ctrl; 6967 nvme_ioctl_format_t ioc; 6968 6969 if ((mode & FWRITE) == 0) 6970 return (EBADF); 6971 6972 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 6973 return (EPERM); 6974 6975 if (ddi_copyin((void *)(uintptr_t)arg, &ioc, 6976 sizeof (nvme_ioctl_format_t), mode & FKIOCTL) != 0) 6977 return (EFAULT); 6978 6979 if (!nvme_ioctl_check(minor, &ioc.nif_common, &nvme_check_format)) { 6980 goto copyout; 6981 } 6982 6983 if (!nvme_validate_format(nvme, &ioc)) { 6984 goto copyout; 6985 } 6986 6987 /* 6988 * The broadcast namespace can format all namespaces attached to the 6989 * controller, meaning active namespaces. However, a targeted format can 6990 * impact any allocated namespace, even one not attached. As such, we 6991 * need different checks for each situation. 6992 */ 6993 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 6994 if (ioc.nif_common.nioc_nsid == NVME_NSID_BCAST) { 6995 if (!nvme_no_blkdev_attached(nvme, ioc.nif_common.nioc_nsid)) { 6996 nvme_mgmt_unlock(nvme); 6997 (void) nvme_ioctl_error(&ioc.nif_common, 6998 NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0); 6999 goto copyout; 7000 } 7001 } else { 7002 nvme_namespace_t *ns = nvme_nsid2ns(nvme, 7003 ioc.nif_common.nioc_nsid); 7004 7005 if (!nvme_ns_state_check(ns, &ioc.nif_common, 7006 nvme_format_nvm_states)) { 7007 nvme_mgmt_unlock(nvme); 7008 goto copyout; 7009 } 7010 } 7011 7012 if (nvme_format_nvm(nvme, &ioc)) { 7013 nvme_ioctl_success(&ioc.nif_common); 7014 nvme_rescan_ns(nvme, ioc.nif_common.nioc_nsid); 7015 } 7016 nvme_mgmt_unlock(nvme); 7017 7018 copyout: 7019 if (ddi_copyout(&ioc, (void *)(uintptr_t)arg, sizeof (ioc), 7020 mode & FKIOCTL) != 0) { 7021 return (EFAULT); 7022 } 7023 7024 return (0); 7025 } 7026 7027 static int 7028 nvme_ioctl_bd_detach(nvme_minor_t *minor, intptr_t arg, int mode, 7029 cred_t *cred_p) 7030 { 7031 nvme_t *const nvme = minor->nm_ctrl; 7032 nvme_ioctl_common_t com; 7033 7034 if ((mode & FWRITE) == 0) 7035 return (EBADF); 7036 7037 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7038 return (EPERM); 7039 7040 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com), 7041 mode & FKIOCTL) != 0) { 7042 return (EFAULT); 7043 } 7044 7045 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) { 7046 goto copyout; 7047 } 7048 7049 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7050 if (nvme_bd_detach_ns(nvme, &com)) { 7051 nvme_ioctl_success(&com); 7052 } 7053 nvme_mgmt_unlock(nvme); 7054 7055 copyout: 7056 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com), 7057 mode & FKIOCTL) != 0) { 7058 return (EFAULT); 7059 } 7060 7061 return (0); 7062 } 7063 7064 static int 7065 
nvme_ioctl_bd_attach(nvme_minor_t *minor, intptr_t arg, int mode, 7066 cred_t *cred_p) 7067 { 7068 nvme_t *const nvme = minor->nm_ctrl; 7069 nvme_ioctl_common_t com; 7070 nvme_namespace_t *ns; 7071 7072 if ((mode & FWRITE) == 0) 7073 return (EBADF); 7074 7075 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7076 return (EPERM); 7077 7078 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com), 7079 mode & FKIOCTL) != 0) { 7080 return (EFAULT); 7081 } 7082 7083 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) { 7084 goto copyout; 7085 } 7086 7087 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7088 ns = nvme_nsid2ns(nvme, com.nioc_nsid); 7089 7090 /* 7091 * Strictly speaking we shouldn't need to call nvme_init_ns() here as 7092 * we should be properly refreshing the internal state when we are 7093 * issuing commands that change things. However, we opt to still do so 7094 * as a bit of a safety check lest we give the kernel something bad or a 7095 * vendor unique command somehow did something behind our backs. 7096 */ 7097 if (ns->ns_state < NVME_NS_STATE_ATTACHED) { 7098 nvme_rescan_ns(nvme, com.nioc_nsid); 7099 } 7100 7101 if (nvme_bd_attach_ns(nvme, &com)) { 7102 nvme_ioctl_success(&com); 7103 } 7104 nvme_mgmt_unlock(nvme); 7105 7106 copyout: 7107 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com), 7108 mode & FKIOCTL) != 0) { 7109 return (EFAULT); 7110 } 7111 7112 return (0); 7113 } 7114 7115 /* 7116 * Attach or detach a controller from the specified namespace. While this in 7117 * theory allows for multiple controllers to be specified, currently we only 7118 * support using the controller that we've issued this ioctl on. In the future, 7119 * when we have better ways to test dual-attached controllers, this should 7120 * be extended to take the controller list from userland. 7121 */ 7122 static boolean_t 7123 nvme_ctrl_attach_detach_ns(nvme_t *nvme, nvme_namespace_t *ns, 7124 nvme_ioctl_common_t *ioc, boolean_t attach) 7125 { 7126 nvme_ioc_cmd_args_t args = { NULL }; 7127 nvme_sqe_t sqe; 7128 nvme_ns_mgmt_dw10_t dw10; 7129 uint16_t ctrlids[2]; 7130 7131 ASSERT(nvme_mgmt_lock_held(nvme)); 7132 7133 bzero(&sqe, sizeof (sqe)); 7134 sqe.sqe_nsid = ioc->nioc_nsid; 7135 sqe.sqe_opc = NVME_OPC_NS_ATTACH; 7136 7137 dw10.r = 0; 7138 dw10.b.nsm_sel = attach ? NVME_NS_ATTACH_CTRL_ATTACH : 7139 NVME_NS_ATTACH_CTRL_DETACH; 7140 sqe.sqe_cdw10 = dw10.r; 7141 7142 /* 7143 * As we only support sending our current controller's id along, we can 7144 * simplify this and don't need to bother allocating a full 7145 * nvme_identify_ctrl_list_t for two items.
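 *
 * For reference, the controller list that the Namespace Attachment
 * command consumes is just a 16-bit entry count followed by that many
 * 16-bit controller identifiers, which is why the two-element ctrlids
 * array below suffices: ctrlids[0] holds the count (1) and ctrlids[1]
 * holds this controller's CNTLID from the Identify Controller data.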
7146 */ 7147 ctrlids[0] = 1; 7148 ctrlids[1] = nvme->n_idctl->id_cntlid; 7149 7150 args.ica_sqe = &sqe; 7151 args.ica_data = ctrlids; 7152 args.ica_data_len = sizeof (ctrlids); 7153 args.ica_dma_flags = DDI_DMA_WRITE; 7154 args.ica_copy_flags = FKIOCTL; 7155 args.ica_timeout = nvme_admin_cmd_timeout; 7156 7157 return (nvme_ioc_cmd(nvme, ioc, &args)); 7158 } 7159 7160 static int 7161 nvme_ioctl_ctrl_detach(nvme_minor_t *minor, intptr_t arg, int mode, 7162 cred_t *cred_p) 7163 { 7164 nvme_t *const nvme = minor->nm_ctrl; 7165 nvme_ioctl_common_t com; 7166 nvme_namespace_t *ns; 7167 7168 if ((mode & FWRITE) == 0) 7169 return (EBADF); 7170 7171 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7172 return (EPERM); 7173 7174 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com), 7175 mode & FKIOCTL) != 0) { 7176 return (EFAULT); 7177 } 7178 7179 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) { 7180 goto copyout; 7181 } 7182 7183 if (!nvme_validate_ctrl_attach_detach_ns(nvme, &com)) { 7184 goto copyout; 7185 } 7186 7187 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7188 ns = nvme_nsid2ns(nvme, com.nioc_nsid); 7189 7190 if (nvme_ns_state_check(ns, &com, nvme_ctrl_detach_states)) { 7191 if (nvme_ctrl_attach_detach_ns(nvme, ns, &com, B_FALSE)) { 7192 nvme_rescan_ns(nvme, com.nioc_nsid); 7193 nvme_ioctl_success(&com); 7194 } 7195 } 7196 nvme_mgmt_unlock(nvme); 7197 7198 copyout: 7199 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com), 7200 mode & FKIOCTL) != 0) { 7201 return (EFAULT); 7202 } 7203 7204 return (0); 7205 } 7206 7207 static int 7208 nvme_ioctl_ns_create(nvme_minor_t *minor, intptr_t arg, int mode, 7209 cred_t *cred_p) 7210 { 7211 nvme_t *const nvme = minor->nm_ctrl; 7212 nvme_ioctl_ns_create_t create; 7213 7214 if ((mode & FWRITE) == 0) 7215 return (EBADF); 7216 7217 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7218 return (EPERM); 7219 7220 if (ddi_copyin((void *)(uintptr_t)arg, &create, sizeof (create), 7221 mode & FKIOCTL) != 0) { 7222 return (EFAULT); 7223 } 7224 7225 if (!nvme_ioctl_check(minor, &create.nnc_common, 7226 &nvme_check_ns_create)) { 7227 goto copyout; 7228 } 7229 7230 if (!nvme_validate_ns_create(nvme, &create)) { 7231 goto copyout; 7232 } 7233 7234 /* 7235 * Now that we've validated this, proceed to build up the actual data 7236 * request. We need to fill out the relevant identify namespace data 7237 * structure fields. 
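 *
 * Only the fields that Namespace Management actually consumes are filled
 * in below: nnc_nsze and nnc_ncap become NSZE and NCAP (both counted in
 * logical blocks), nnc_flbas selects the formatted LBA size, and bit 0 of
 * nnc_nmic marks the namespace as shared. Everything else in the
 * structure is left zeroed by the allocation.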
7238 */ 7239 nvme_identify_nsid_t *idns = kmem_zalloc(sizeof (nvme_identify_nsid_t), 7240 KM_NOSLEEP_LAZY); 7241 if (idns == NULL) { 7242 (void) nvme_ioctl_error(&create.nnc_common, 7243 NVME_IOCTL_E_NO_KERN_MEM, 0, 0); 7244 goto copyout; 7245 } 7246 7247 idns->id_nsize = create.nnc_nsze; 7248 idns->id_ncap = create.nnc_ncap; 7249 idns->id_flbas.lba_format = create.nnc_flbas; 7250 idns->id_nmic.nm_shared = bitx32(create.nnc_nmic, 0, 0); 7251 7252 nvme_ioc_cmd_args_t args = { NULL }; 7253 nvme_sqe_t sqe; 7254 nvme_ns_mgmt_dw10_t dw10; 7255 nvme_ns_mgmt_dw11_t dw11; 7256 7257 bzero(&sqe, sizeof (sqe)); 7258 sqe.sqe_nsid = create.nnc_common.nioc_nsid; 7259 sqe.sqe_opc = NVME_OPC_NS_MGMT; 7260 7261 dw10.r = 0; 7262 dw10.b.nsm_sel = NVME_NS_MGMT_NS_CREATE; 7263 sqe.sqe_cdw10 = dw10.r; 7264 7265 dw11.r = 0; 7266 dw11.b.nsm_csi = create.nnc_csi; 7267 sqe.sqe_cdw11 = dw11.r; 7268 7269 args.ica_sqe = &sqe; 7270 args.ica_data = idns; 7271 args.ica_data_len = sizeof (nvme_identify_nsid_t); 7272 args.ica_dma_flags = DDI_DMA_WRITE; 7273 args.ica_copy_flags = FKIOCTL; 7274 args.ica_timeout = nvme_format_cmd_timeout; 7275 7276 /* 7277 * This command manipulates our understanding of a namespace's state. 7278 * While we don't need to check anything before we proceed, we still 7279 * logically require the lock. 7280 */ 7281 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7282 if (nvme_ioc_cmd(nvme, &create.nnc_common, &args)) { 7283 create.nnc_nsid = args.ica_cdw0; 7284 nvme_rescan_ns(nvme, create.nnc_nsid); 7285 nvme_ioctl_success(&create.nnc_common); 7286 } 7287 nvme_mgmt_unlock(nvme); 7288 kmem_free(idns, sizeof (nvme_identify_nsid_t)); 7289 7290 copyout: 7291 if (ddi_copyout(&create, (void *)(uintptr_t)arg, sizeof (create), 7292 mode & FKIOCTL) != 0) { 7293 return (EFAULT); 7294 } 7295 7296 return (0); 7297 7298 } 7299 7300 static int 7301 nvme_ioctl_ns_delete(nvme_minor_t *minor, intptr_t arg, int mode, 7302 cred_t *cred_p) 7303 { 7304 nvme_t *const nvme = minor->nm_ctrl; 7305 nvme_ioctl_common_t com; 7306 7307 if ((mode & FWRITE) == 0) 7308 return (EBADF); 7309 7310 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7311 return (EPERM); 7312 7313 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com), 7314 mode & FKIOCTL) != 0) { 7315 return (EFAULT); 7316 } 7317 7318 if (!nvme_ioctl_check(minor, &com, &nvme_check_ns_delete)) { 7319 goto copyout; 7320 } 7321 7322 if (!nvme_validate_ns_delete(nvme, &com)) { 7323 goto copyout; 7324 } 7325 7326 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7327 if (com.nioc_nsid == NVME_NSID_BCAST) { 7328 if (!nvme_no_blkdev_attached(nvme, com.nioc_nsid)) { 7329 nvme_mgmt_unlock(nvme); 7330 (void) nvme_ioctl_error(&com, 7331 NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0); 7332 goto copyout; 7333 } 7334 } else { 7335 nvme_namespace_t *ns = nvme_nsid2ns(nvme, com.nioc_nsid); 7336 7337 if (!nvme_ns_state_check(ns, &com, nvme_ns_delete_states)) { 7338 nvme_mgmt_unlock(nvme); 7339 goto copyout; 7340 } 7341 } 7342 7343 nvme_ioc_cmd_args_t args = { NULL }; 7344 nvme_sqe_t sqe; 7345 nvme_ns_mgmt_dw10_t dw10; 7346 7347 bzero(&sqe, sizeof (sqe)); 7348 sqe.sqe_nsid = com.nioc_nsid; 7349 sqe.sqe_opc = NVME_OPC_NS_MGMT; 7350 7351 dw10.r = 0; 7352 dw10.b.nsm_sel = NVME_NS_MGMT_NS_DELETE; 7353 sqe.sqe_cdw10 = dw10.r; 7354 7355 args.ica_sqe = &sqe; 7356 args.ica_data = NULL; 7357 args.ica_data_len = 0; 7358 args.ica_dma_flags = 0; 7359 args.ica_copy_flags = 0; 7360 args.ica_timeout = nvme_format_cmd_timeout; 7361 7362 if (nvme_ioc_cmd(nvme, &com, &args)) { 7363 nvme_rescan_ns(nvme, com.nioc_nsid); 7364 
nvme_ioctl_success(&com); 7365 } 7366 nvme_mgmt_unlock(nvme); 7367 7368 copyout: 7369 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com), 7370 mode & FKIOCTL) != 0) { 7371 return (EFAULT); 7372 } 7373 7374 return (0); 7375 } 7376 7377 static int 7378 nvme_ioctl_ctrl_attach(nvme_minor_t *minor, intptr_t arg, int mode, 7379 cred_t *cred_p) 7380 { 7381 nvme_t *const nvme = minor->nm_ctrl; 7382 nvme_ioctl_common_t com; 7383 nvme_namespace_t *ns; 7384 7385 if ((mode & FWRITE) == 0) 7386 return (EBADF); 7387 7388 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7389 return (EPERM); 7390 7391 if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com), 7392 mode & FKIOCTL) != 0) { 7393 return (EFAULT); 7394 } 7395 7396 if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) { 7397 goto copyout; 7398 } 7399 7400 if (!nvme_validate_ctrl_attach_detach_ns(nvme, &com)) { 7401 goto copyout; 7402 } 7403 7404 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7405 ns = nvme_nsid2ns(nvme, com.nioc_nsid); 7406 7407 if (nvme_ns_state_check(ns, &com, nvme_ctrl_attach_states)) { 7408 if (nvme_ctrl_attach_detach_ns(nvme, ns, &com, B_TRUE)) { 7409 nvme_rescan_ns(nvme, com.nioc_nsid); 7410 nvme_ioctl_success(&com); 7411 } 7412 } 7413 nvme_mgmt_unlock(nvme); 7414 7415 copyout: 7416 if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com), 7417 mode & FKIOCTL) != 0) { 7418 return (EFAULT); 7419 } 7420 7421 return (0); 7422 } 7423 7424 static void 7425 nvme_ufm_update(nvme_t *nvme) 7426 { 7427 mutex_enter(&nvme->n_fwslot_mutex); 7428 ddi_ufm_update(nvme->n_ufmh); 7429 if (nvme->n_fwslot != NULL) { 7430 kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t)); 7431 nvme->n_fwslot = NULL; 7432 } 7433 mutex_exit(&nvme->n_fwslot_mutex); 7434 } 7435 7436 /* 7437 * Download new firmware to the device's internal staging area. We do not call 7438 * nvme_ufm_update() here because after a firmware download, there has been no 7439 * change to any of the actual persistent firmware data. That requires a 7440 * subsequent ioctl (NVME_IOC_FIRMWARE_COMMIT) to commit the firmware to a slot 7441 * or to activate a slot. 7442 */ 7443 static int 7444 nvme_ioctl_firmware_download(nvme_minor_t *minor, intptr_t arg, int mode, 7445 cred_t *cred_p) 7446 { 7447 nvme_t *const nvme = minor->nm_ctrl; 7448 nvme_ioctl_fw_load_t fw; 7449 uint64_t len, maxcopy; 7450 offset_t offset; 7451 uint32_t gran; 7452 nvme_valid_ctrl_data_t data; 7453 uintptr_t buf; 7454 nvme_sqe_t sqe = { 7455 .sqe_opc = NVME_OPC_FW_IMAGE_LOAD 7456 }; 7457 7458 if ((mode & FWRITE) == 0) 7459 return (EBADF); 7460 7461 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7462 return (EPERM); 7463 7464 if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw), 7465 mode & FKIOCTL) != 0) { 7466 return (EFAULT); 7467 } 7468 7469 if (!nvme_ioctl_check(minor, &fw.fwl_common, &nvme_check_firmware)) { 7470 goto copyout; 7471 } 7472 7473 if (!nvme_validate_fw_load(nvme, &fw)) { 7474 goto copyout; 7475 } 7476 7477 len = fw.fwl_len; 7478 offset = fw.fwl_off; 7479 buf = fw.fwl_buf; 7480 7481 /* 7482 * We need to determine the minimum and maximum amount of data that we 7483 * will send to the device in a given go. Starting in NVMe 1.3 this must 7484 * be a multiple of the firmware update granularity (FWUG), but must not 7485 * exceed the maximum data transfer that we've set. Many devices don't 7486 * report something here, which means we'll end up getting our default 7487 * value.
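 * (As a hypothetical worked example: with a 1 MiB maximum data transfer
 * and a 128 KiB FWUG, 1 MiB is a multiple of the granularity and each
 * NVME_OPC_FW_IMAGE_LOAD command can carry a full 1 MiB; with a 192 KiB
 * FWUG we would instead fall back to 192 KiB chunks, per the policy
 * described below.)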
Our policy is a little simple, but it's basically if the 7488 * maximum data transfer is evenly divided by the granularity, then use 7489 * it. Otherwise we use the granularity itself. The granularity is 7490 * always in page sized units, so trying to find another optimum point 7491 * isn't worth it. If we encounter a contradiction, then we will have to 7492 * error out. 7493 */ 7494 data.vcd_vers = &nvme->n_version; 7495 data.vcd_id = nvme->n_idctl; 7496 gran = nvme_fw_load_granularity(&data); 7497 7498 if ((nvme->n_max_data_transfer_size % gran) == 0) { 7499 maxcopy = nvme->n_max_data_transfer_size; 7500 } else if (gran <= nvme->n_max_data_transfer_size) { 7501 maxcopy = gran; 7502 } else { 7503 (void) nvme_ioctl_error(&fw.fwl_common, 7504 NVME_IOCTL_E_FW_LOAD_IMPOS_GRAN, 0, 0); 7505 goto copyout; 7506 } 7507 7508 while (len > 0) { 7509 nvme_ioc_cmd_args_t args = { NULL }; 7510 uint64_t copylen = MIN(maxcopy, len); 7511 7512 sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1; 7513 sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT); 7514 7515 args.ica_sqe = &sqe; 7516 args.ica_data = (void *)buf; 7517 args.ica_data_len = copylen; 7518 args.ica_dma_flags = DDI_DMA_WRITE; 7519 args.ica_copy_flags = mode; 7520 args.ica_timeout = nvme_admin_cmd_timeout; 7521 7522 if (!nvme_ioc_cmd(nvme, &fw.fwl_common, &args)) { 7523 break; 7524 } 7525 7526 buf += copylen; 7527 offset += copylen; 7528 len -= copylen; 7529 } 7530 7531 copyout: 7532 if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw), 7533 mode & FKIOCTL) != 0) { 7534 return (EFAULT); 7535 } 7536 7537 return (0); 7538 } 7539 7540 static int 7541 nvme_ioctl_firmware_commit(nvme_minor_t *minor, intptr_t arg, int mode, 7542 cred_t *cred_p) 7543 { 7544 nvme_t *const nvme = minor->nm_ctrl; 7545 nvme_ioctl_fw_commit_t fw; 7546 nvme_firmware_commit_dw10_t fc_dw10 = { 0 }; 7547 nvme_ioc_cmd_args_t args = { NULL }; 7548 nvme_sqe_t sqe = { 7549 .sqe_opc = NVME_OPC_FW_ACTIVATE 7550 }; 7551 7552 if ((mode & FWRITE) == 0) 7553 return (EBADF); 7554 7555 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7556 return (EPERM); 7557 7558 if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw), 7559 mode & FKIOCTL) != 0) { 7560 return (EFAULT); 7561 } 7562 7563 if (!nvme_ioctl_check(minor, &fw.fwc_common, &nvme_check_firmware)) { 7564 goto copyout; 7565 } 7566 7567 if (!nvme_validate_fw_commit(nvme, &fw)) { 7568 goto copyout; 7569 } 7570 7571 fc_dw10.b.fc_slot = fw.fwc_slot; 7572 fc_dw10.b.fc_action = fw.fwc_action; 7573 sqe.sqe_cdw10 = fc_dw10.r; 7574 7575 args.ica_sqe = &sqe; 7576 args.ica_timeout = nvme_commit_save_cmd_timeout; 7577 7578 /* 7579 * There are no conditional actions to take based on this succeeding or 7580 * failing. A failure is recorded in the ioctl structure returned to the 7581 * user. 7582 */ 7583 (void) nvme_ioc_cmd(nvme, &fw.fwc_common, &args); 7584 7585 /* 7586 * Let the DDI UFM subsystem know that the firmware information for 7587 * this device has changed. We perform this unconditionally as an 7588 * invalidation doesn't particularly hurt us. 7589 */ 7590 nvme_ufm_update(nvme); 7591 7592 copyout: 7593 if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw), 7594 mode & FKIOCTL) != 0) { 7595 return (EFAULT); 7596 } 7597 7598 return (0); 7599 } 7600 7601 /* 7602 * Helper to copy in a passthru command from userspace, handling 7603 * different data models. 
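 *
 * As a rough usage sketch (hypothetical values, not a literal example
 * from this codebase): a native-model caller issuing a vendor-specific
 * admin opcode would set npc_opcode to a value in the vendor range
 * (0xc0-0xff), describe its transfer with npc_buf and npc_buflen, pick
 * NVME_PASSTHRU_READ or NVME_PASSTHRU_WRITE in npc_flags, and supply a
 * timeout in npc_timeout. The ILP32 branch below exists only to widen
 * the 32-bit buffer pointer and length into this native form.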
7604 */ 7605 static int 7606 nvme_passthru_copyin_cmd(const void *buf, nvme_ioctl_passthru_t *cmd, int mode) 7607 { 7608 switch (ddi_model_convert_from(mode & FMODELS)) { 7609 #ifdef _MULTI_DATAMODEL 7610 case DDI_MODEL_ILP32: { 7611 nvme_ioctl_passthru32_t cmd32; 7612 7613 if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0) 7614 return (EFAULT); 7615 7616 bzero(cmd, sizeof (nvme_ioctl_passthru_t)); 7617 7618 cmd->npc_common.nioc_nsid = cmd32.npc_common.nioc_nsid; 7619 cmd->npc_opcode = cmd32.npc_opcode; 7620 cmd->npc_timeout = cmd32.npc_timeout; 7621 cmd->npc_flags = cmd32.npc_flags; 7622 cmd->npc_impact = cmd32.npc_impact; 7623 cmd->npc_cdw12 = cmd32.npc_cdw12; 7624 cmd->npc_cdw13 = cmd32.npc_cdw13; 7625 cmd->npc_cdw14 = cmd32.npc_cdw14; 7626 cmd->npc_cdw15 = cmd32.npc_cdw15; 7627 cmd->npc_buflen = cmd32.npc_buflen; 7628 cmd->npc_buf = cmd32.npc_buf; 7629 break; 7630 } 7631 #endif /* _MULTI_DATAMODEL */ 7632 case DDI_MODEL_NONE: 7633 if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_ioctl_passthru_t), 7634 mode) != 0) { 7635 return (EFAULT); 7636 } 7637 break; 7638 default: 7639 return (ENOTSUP); 7640 } 7641 7642 return (0); 7643 } 7644 7645 /* 7646 * Helper to copy out a passthru command result to userspace, handling 7647 * different data models. 7648 */ 7649 static int 7650 nvme_passthru_copyout_cmd(const nvme_ioctl_passthru_t *cmd, void *buf, int mode) 7651 { 7652 switch (ddi_model_convert_from(mode & FMODELS)) { 7653 #ifdef _MULTI_DATAMODEL 7654 case DDI_MODEL_ILP32: { 7655 nvme_ioctl_passthru32_t cmd32; 7656 7657 bzero(&cmd32, sizeof (nvme_ioctl_passthru32_t)); 7658 7659 cmd32.npc_common = cmd->npc_common; 7660 cmd32.npc_opcode = cmd->npc_opcode; 7661 cmd32.npc_timeout = cmd->npc_timeout; 7662 cmd32.npc_flags = cmd->npc_flags; 7663 cmd32.npc_impact = cmd->npc_impact; 7664 cmd32.npc_cdw0 = cmd->npc_cdw0; 7665 cmd32.npc_cdw12 = cmd->npc_cdw12; 7666 cmd32.npc_cdw13 = cmd->npc_cdw13; 7667 cmd32.npc_cdw14 = cmd->npc_cdw14; 7668 cmd32.npc_cdw15 = cmd->npc_cdw15; 7669 cmd32.npc_buflen = (size32_t)cmd->npc_buflen; 7670 cmd32.npc_buf = (uintptr32_t)cmd->npc_buf; 7671 if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0) 7672 return (EFAULT); 7673 break; 7674 } 7675 #endif /* _MULTI_DATAMODEL */ 7676 case DDI_MODEL_NONE: 7677 if (ddi_copyout(cmd, buf, sizeof (nvme_ioctl_passthru_t), 7678 mode) != 0) { 7679 return (EFAULT); 7680 } 7681 break; 7682 default: 7683 return (ENOTSUP); 7684 } 7685 return (0); 7686 } 7687 7688 /* 7689 * Run an arbitrary vendor-specific admin command on the device. 7690 */ 7691 static int 7692 nvme_ioctl_passthru(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p) 7693 { 7694 nvme_t *const nvme = minor->nm_ctrl; 7695 int rv; 7696 nvme_ioctl_passthru_t pass; 7697 nvme_sqe_t sqe; 7698 nvme_ioc_cmd_args_t args = { NULL }; 7699 7700 /* 7701 * Basic checks: permissions, data model, argument size. 7702 */ 7703 if ((mode & FWRITE) == 0) 7704 return (EBADF); 7705 7706 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7707 return (EPERM); 7708 7709 if ((rv = nvme_passthru_copyin_cmd((void *)(uintptr_t)arg, &pass, 7710 mode)) != 0) { 7711 return (rv); 7712 } 7713 7714 if (!nvme_ioctl_check(minor, &pass.npc_common, &nvme_check_passthru)) { 7715 goto copyout; 7716 } 7717 7718 if (!nvme_validate_vuc(nvme, &pass)) { 7719 goto copyout; 7720 } 7721 7722 nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME); 7723 if ((pass.npc_impact & NVME_IMPACT_NS) != 0) { 7724 /* 7725 * We've been told this has ns impact. 
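 * (A hypothetical example would be a vendor command that reformats or
 * resizes namespaces behind our back, which is why blkdev must not be
 * attached anywhere while such a command runs.)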
Right now force that to 7726 * be every ns until we have more use cases and reason to trust 7727 * the nsid field. 7728 */ 7729 if (!nvme_no_blkdev_attached(nvme, NVME_NSID_BCAST)) { 7730 nvme_mgmt_unlock(nvme); 7731 (void) nvme_ioctl_error(&pass.npc_common, 7732 NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0); 7733 goto copyout; 7734 } 7735 } 7736 7737 bzero(&sqe, sizeof (sqe)); 7738 7739 sqe.sqe_opc = pass.npc_opcode; 7740 sqe.sqe_nsid = pass.npc_common.nioc_nsid; 7741 sqe.sqe_cdw10 = (uint32_t)(pass.npc_buflen >> NVME_DWORD_SHIFT); 7742 sqe.sqe_cdw12 = pass.npc_cdw12; 7743 sqe.sqe_cdw13 = pass.npc_cdw13; 7744 sqe.sqe_cdw14 = pass.npc_cdw14; 7745 sqe.sqe_cdw15 = pass.npc_cdw15; 7746 7747 args.ica_sqe = &sqe; 7748 args.ica_data = (void *)pass.npc_buf; 7749 args.ica_data_len = pass.npc_buflen; 7750 args.ica_copy_flags = mode; 7751 args.ica_timeout = pass.npc_timeout; 7752 7753 if ((pass.npc_flags & NVME_PASSTHRU_READ) != 0) 7754 args.ica_dma_flags |= DDI_DMA_READ; 7755 else if ((pass.npc_flags & NVME_PASSTHRU_WRITE) != 0) 7756 args.ica_dma_flags |= DDI_DMA_WRITE; 7757 7758 if (nvme_ioc_cmd(nvme, &pass.npc_common, &args)) { 7759 pass.npc_cdw0 = args.ica_cdw0; 7760 if ((pass.npc_impact & NVME_IMPACT_NS) != 0) { 7761 nvme_rescan_ns(nvme, NVME_NSID_BCAST); 7762 } 7763 } 7764 nvme_mgmt_unlock(nvme); 7765 7766 copyout: 7767 rv = nvme_passthru_copyout_cmd(&pass, (void *)(uintptr_t)arg, 7768 mode); 7769 7770 return (rv); 7771 } 7772 7773 static int 7774 nvme_ioctl_lock(nvme_minor_t *minor, intptr_t arg, int mode, 7775 cred_t *cred_p) 7776 { 7777 nvme_ioctl_lock_t lock; 7778 const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK; 7779 nvme_t *nvme = minor->nm_ctrl; 7780 7781 if ((mode & FWRITE) == 0) 7782 return (EBADF); 7783 7784 if (secpolicy_sys_config(cred_p, B_FALSE) != 0) 7785 return (EPERM); 7786 7787 if (ddi_copyin((void *)(uintptr_t)arg, &lock, sizeof (lock), 7788 mode & FKIOCTL) != 0) { 7789 return (EFAULT); 7790 } 7791 7792 if (lock.nil_ent != NVME_LOCK_E_CTRL && 7793 lock.nil_ent != NVME_LOCK_E_NS) { 7794 (void) nvme_ioctl_error(&lock.nil_common, 7795 NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0); 7796 goto copyout; 7797 } 7798 7799 if (lock.nil_level != NVME_LOCK_L_READ && 7800 lock.nil_level != NVME_LOCK_L_WRITE) { 7801 (void) nvme_ioctl_error(&lock.nil_common, 7802 NVME_IOCTL_E_BAD_LOCK_LEVEL, 0, 0); 7803 goto copyout; 7804 } 7805 7806 if ((lock.nil_flags & ~all_flags) != 0) { 7807 (void) nvme_ioctl_error(&lock.nil_common, 7808 NVME_IOCTL_E_BAD_LOCK_FLAGS, 0, 0); 7809 goto copyout; 7810 } 7811 7812 if (!nvme_ioctl_check(minor, &lock.nil_common, &nvme_check_locking)) { 7813 goto copyout; 7814 } 7815 7816 /* 7817 * If we're on a namespace, confirm that we're not asking for the 7818 * controller. 7819 */ 7820 if (lock.nil_common.nioc_nsid != 0 && 7821 lock.nil_ent == NVME_LOCK_E_CTRL) { 7822 (void) nvme_ioctl_error(&lock.nil_common, 7823 NVME_IOCTL_E_NS_CANNOT_LOCK_CTRL, 0, 0); 7824 goto copyout; 7825 } 7826 7827 /* 7828 * We've reached the point where we can no longer actually check things 7829 * without serializing state. First, we need to check to make sure that 7830 * none of our invariants are being broken for locking: 7831 * 7832 * 1) The caller isn't already blocking for a lock operation to 7833 * complete. 7834 * 7835 * 2) The caller is attempting to grab a lock that they already have. 7836 * While there are other rule violations that this might create, we opt 7837 * to check this ahead of it so we can have slightly better error 7838 * messages for our callers. 
7839 * 7840 * 3) The caller is trying to grab a controller lock, while holding a 7841 * namespace lock. 7842 * 7843 * 4) The caller has a controller write lock and is trying to get a 7844 * namespace lock. For now, we disallow this case. Holding a controller 7845 * read lock is allowed, but the write lock allows you to operate on all 7846 * namespaces anyways. In addition, this simplifies the locking logic; 7847 * however, this constraint may be loosened in the future. 7848 * 7849 * 5) The caller is trying to acquire a second namespace lock when they 7850 * already have one. 7851 */ 7852 mutex_enter(&nvme->n_minor_mutex); 7853 if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_BLOCKED || 7854 minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_BLOCKED) { 7855 (void) nvme_ioctl_error(&lock.nil_common, 7856 NVME_IOCTL_E_LOCK_PENDING, 0, 0); 7857 mutex_exit(&nvme->n_minor_mutex); 7858 goto copyout; 7859 } 7860 7861 if ((lock.nil_ent == NVME_LOCK_E_CTRL && 7862 minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) || 7863 (lock.nil_ent == NVME_LOCK_E_NS && 7864 minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED && 7865 minor->nm_ns_lock.nli_ns->ns_id == lock.nil_common.nioc_nsid)) { 7866 (void) nvme_ioctl_error(&lock.nil_common, 7867 NVME_IOCTL_E_LOCK_ALREADY_HELD, 0, 0); 7868 mutex_exit(&nvme->n_minor_mutex); 7869 goto copyout; 7870 } 7871 7872 if (lock.nil_ent == NVME_LOCK_E_CTRL && 7873 minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) { 7874 (void) nvme_ioctl_error(&lock.nil_common, 7875 NVME_IOCTL_E_LOCK_NO_CTRL_WITH_NS, 0, 0); 7876 mutex_exit(&nvme->n_minor_mutex); 7877 goto copyout; 7878 } 7879 7880 if (lock.nil_ent == NVME_LOCK_E_NS && 7881 (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED && 7882 minor->nm_ctrl_lock.nli_curlevel == NVME_LOCK_L_WRITE)) { 7883 (void) nvme_ioctl_error(&lock.nil_common, 7884 NVME_IOCTL_E_LOCK_NO_NS_WITH_CTRL_WRLOCK, 0, 0); 7885 mutex_exit(&nvme->n_minor_mutex); 7886 goto copyout; 7887 } 7888 7889 if (lock.nil_ent == NVME_LOCK_E_NS && 7890 minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) { 7891 (void) nvme_ioctl_error(&lock.nil_common, 7892 NVME_IOCTL_E_LOCK_NO_2ND_NS, 0, 0); 7893 mutex_exit(&nvme->n_minor_mutex); 7894 goto copyout; 7895 } 7896 7897 #ifdef DEBUG 7898 /* 7899 * This is a big block of sanity checks to make sure that we haven't 7900 * allowed anything bad to happen.
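 * As a worked example of the invariants above: a minor that already holds
 * the controller write lock and then requests a namespace lock is turned
 * away with NVME_IOCTL_E_LOCK_NO_NS_WITH_CTRL_WRLOCK before reaching this
 * point, so the lock slot we are about to fill must still be completely
 * clear, which is exactly what the assertions below verify.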
7901 */ 7902 if (lock.nil_ent == NVME_LOCK_E_NS) { 7903 ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL); 7904 ASSERT3U(minor->nm_ns_lock.nli_state, ==, 7905 NVME_LOCK_STATE_UNLOCKED); 7906 ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0); 7907 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL); 7908 7909 if (minor->nm_ns != NULL) { 7910 ASSERT3U(minor->nm_ns->ns_id, ==, 7911 lock.nil_common.nioc_nsid); 7912 } 7913 7914 ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node)); 7915 } else { 7916 ASSERT3P(minor->nm_ctrl_lock.nli_lock, ==, NULL); 7917 ASSERT3U(minor->nm_ctrl_lock.nli_state, ==, 7918 NVME_LOCK_STATE_UNLOCKED); 7919 ASSERT3U(minor->nm_ctrl_lock.nli_curlevel, ==, 0); 7920 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL); 7921 ASSERT0(list_link_active(&minor->nm_ctrl_lock.nli_node)); 7922 7923 ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL); 7924 ASSERT3U(minor->nm_ns_lock.nli_state, ==, 7925 NVME_LOCK_STATE_UNLOCKED); 7926 ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0); 7927 ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL); 7928 ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node)); 7929 } 7930 #endif /* DEBUG */ 7931 7932 /* 7933 * At this point we should actually attempt a locking operation. 7934 */ 7935 nvme_rwlock(minor, &lock); 7936 mutex_exit(&nvme->n_minor_mutex); 7937 7938 copyout: 7939 if (ddi_copyout(&lock, (void *)(uintptr_t)arg, sizeof (lock), 7940 mode & FKIOCTL) != 0) { 7941 return (EFAULT); 7942 } 7943 7944 return (0); 7945 } 7946 7947 static int 7948 nvme_ioctl_unlock(nvme_minor_t *minor, intptr_t arg, int mode, 7949 cred_t *cred_p) 7950 { 7951 nvme_ioctl_unlock_t unlock; 7952 nvme_t *const nvme = minor->nm_ctrl; 7953 boolean_t is_ctrl; 7954 nvme_lock_t *lock; 7955 nvme_minor_lock_info_t *info; 7956 7957 /* 7958 * Note, we explicitly don't check for privileges for unlock. The idea 7959 * being that if you have the lock, that's what matters. If you don't 7960 * have the lock, it doesn't matter what privileges that you have at 7961 * all. 7962 */ 7963 if ((mode & FWRITE) == 0) 7964 return (EBADF); 7965 7966 if (ddi_copyin((void *)(uintptr_t)arg, &unlock, sizeof (unlock), 7967 mode & FKIOCTL) != 0) { 7968 return (EFAULT); 7969 } 7970 7971 if (unlock.niu_ent != NVME_LOCK_E_CTRL && 7972 unlock.niu_ent != NVME_LOCK_E_NS) { 7973 (void) nvme_ioctl_error(&unlock.niu_common, 7974 NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0); 7975 goto copyout; 7976 } 7977 7978 if (!nvme_ioctl_check(minor, &unlock.niu_common, &nvme_check_locking)) { 7979 goto copyout; 7980 } 7981 7982 /* 7983 * If we're on a namespace, confirm that we're not asking for the 7984 * controller. 7985 */ 7986 if (unlock.niu_common.nioc_nsid != 0 && 7987 unlock.niu_ent == NVME_LOCK_E_CTRL) { 7988 (void) nvme_ioctl_error(&unlock.niu_common, 7989 NVME_IOCTL_E_NS_CANNOT_UNLOCK_CTRL, 0, 0); 7990 goto copyout; 7991 } 7992 7993 mutex_enter(&nvme->n_minor_mutex); 7994 if (unlock.niu_ent == NVME_LOCK_E_CTRL) { 7995 if (minor->nm_ctrl_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) { 7996 mutex_exit(&nvme->n_minor_mutex); 7997 (void) nvme_ioctl_error(&unlock.niu_common, 7998 NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0); 7999 goto copyout; 8000 } 8001 } else { 8002 if (minor->nm_ns_lock.nli_ns == NULL) { 8003 mutex_exit(&nvme->n_minor_mutex); 8004 (void) nvme_ioctl_error(&unlock.niu_common, 8005 NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0); 8006 goto copyout; 8007 } 8008 8009 /* 8010 * Check that our unlock request corresponds to the namespace ID 8011 * that is currently locked. 
This could happen if we're using 8012 * the controller node and it specified a valid, but not locked, 8013 * namespace ID. 8014 */ 8015 if (minor->nm_ns_lock.nli_ns->ns_id != 8016 unlock.niu_common.nioc_nsid) { 8017 mutex_exit(&nvme->n_minor_mutex); 8018 ASSERT3P(minor->nm_ns, ==, NULL); 8019 (void) nvme_ioctl_error(&unlock.niu_common, 8020 NVME_IOCTL_E_LOCK_WRONG_NS, 0, 0); 8021 goto copyout; 8022 } 8023 8024 if (minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) { 8025 mutex_exit(&nvme->n_minor_mutex); 8026 (void) nvme_ioctl_error(&unlock.niu_common, 8027 NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0); 8028 goto copyout; 8029 } 8030 } 8031 8032 /* 8033 * Finally, perform the unlock. 8034 */ 8035 is_ctrl = unlock.niu_ent == NVME_LOCK_E_CTRL; 8036 if (is_ctrl) { 8037 lock = &nvme->n_lock; 8038 info = &minor->nm_ctrl_lock; 8039 } else { 8040 nvme_namespace_t *ns; 8041 const uint32_t nsid = unlock.niu_common.nioc_nsid; 8042 8043 ns = nvme_nsid2ns(nvme, nsid); 8044 lock = &ns->ns_lock; 8045 info = &minor->nm_ns_lock; 8046 VERIFY3P(ns, ==, info->nli_ns); 8047 } 8048 nvme_rwunlock(info, lock); 8049 mutex_exit(&nvme->n_minor_mutex); 8050 nvme_ioctl_success(&unlock.niu_common); 8051 8052 copyout: 8053 if (ddi_copyout(&unlock, (void *)(uintptr_t)arg, sizeof (unlock), 8054 mode & FKIOCTL) != 0) { 8055 return (EFAULT); 8056 } 8057 8058 return (0); 8059 } 8060 8061 static int 8062 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 8063 int *rval_p) 8064 { 8065 #ifndef __lock_lint 8066 _NOTE(ARGUNUSED(rval_p)); 8067 #endif 8068 int ret; 8069 nvme_minor_t *minor; 8070 nvme_t *nvme; 8071 8072 minor = nvme_minor_find_by_dev(dev); 8073 if (minor == NULL) { 8074 return (ENXIO); 8075 } 8076 8077 nvme = minor->nm_ctrl; 8078 if (nvme == NULL) 8079 return (ENXIO); 8080 8081 if (IS_DEVCTL(cmd)) 8082 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 8083 8084 if (nvme->n_dead && (cmd != NVME_IOC_BD_DETACH && cmd != 8085 NVME_IOC_UNLOCK)) { 8086 if (IS_NVME_IOC(cmd) == 0) { 8087 return (EIO); 8088 } 8089 8090 return (nvme_ioctl_copyout_error(nvme->n_dead_status, arg, 8091 mode)); 8092 } 8093 8094 /* 8095 * ioctls that are no longer using the original ioctl structure. 
8096 */ 8097 switch (cmd) { 8098 case NVME_IOC_CTRL_INFO: 8099 ret = nvme_ioctl_ctrl_info(minor, arg, mode, cred_p); 8100 break; 8101 case NVME_IOC_IDENTIFY: 8102 ret = nvme_ioctl_identify(minor, arg, mode, cred_p); 8103 break; 8104 case NVME_IOC_GET_LOGPAGE: 8105 ret = nvme_ioctl_get_logpage(minor, arg, mode, cred_p); 8106 break; 8107 case NVME_IOC_GET_FEATURE: 8108 ret = nvme_ioctl_get_feature(minor, arg, mode, cred_p); 8109 break; 8110 case NVME_IOC_BD_DETACH: 8111 ret = nvme_ioctl_bd_detach(minor, arg, mode, cred_p); 8112 break; 8113 case NVME_IOC_BD_ATTACH: 8114 ret = nvme_ioctl_bd_attach(minor, arg, mode, cred_p); 8115 break; 8116 case NVME_IOC_FORMAT: 8117 ret = nvme_ioctl_format(minor, arg, mode, cred_p); 8118 break; 8119 case NVME_IOC_FIRMWARE_DOWNLOAD: 8120 ret = nvme_ioctl_firmware_download(minor, arg, mode, cred_p); 8121 break; 8122 case NVME_IOC_FIRMWARE_COMMIT: 8123 ret = nvme_ioctl_firmware_commit(minor, arg, mode, cred_p); 8124 break; 8125 case NVME_IOC_NS_INFO: 8126 ret = nvme_ioctl_ns_info(minor, arg, mode, cred_p); 8127 break; 8128 case NVME_IOC_PASSTHRU: 8129 ret = nvme_ioctl_passthru(minor, arg, mode, cred_p); 8130 break; 8131 case NVME_IOC_LOCK: 8132 ret = nvme_ioctl_lock(minor, arg, mode, cred_p); 8133 break; 8134 case NVME_IOC_UNLOCK: 8135 ret = nvme_ioctl_unlock(minor, arg, mode, cred_p); 8136 break; 8137 case NVME_IOC_CTRL_DETACH: 8138 ret = nvme_ioctl_ctrl_detach(minor, arg, mode, cred_p); 8139 break; 8140 case NVME_IOC_CTRL_ATTACH: 8141 ret = nvme_ioctl_ctrl_attach(minor, arg, mode, cred_p); 8142 break; 8143 case NVME_IOC_NS_CREATE: 8144 ret = nvme_ioctl_ns_create(minor, arg, mode, cred_p); 8145 break; 8146 case NVME_IOC_NS_DELETE: 8147 ret = nvme_ioctl_ns_delete(minor, arg, mode, cred_p); 8148 break; 8149 default: 8150 ret = ENOTTY; 8151 break; 8152 } 8153 8154 ASSERT(!nvme_mgmt_lock_held(nvme)); 8155 return (ret); 8156 } 8157 8158 /* 8159 * DDI UFM Callbacks 8160 */ 8161 static int 8162 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 8163 ddi_ufm_image_t *img) 8164 { 8165 nvme_t *nvme = arg; 8166 8167 if (imgno != 0) 8168 return (EINVAL); 8169 8170 ddi_ufm_image_set_desc(img, "Firmware"); 8171 ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot); 8172 8173 return (0); 8174 } 8175 8176 /* 8177 * Fill out firmware slot information for the requested slot. The firmware 8178 * slot information is gathered by requesting the Firmware Slot Information log 8179 * page. The format of the page is described in section 5.10.1.3. 8180 * 8181 * We lazily cache the log page on the first call and then invalidate the cache 8182 * data after a successful firmware download or firmware commit command. 8183 * The cached data is protected by a mutex as the state can change 8184 * asynchronous to this callback. 
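 *
 * The invalidation itself is performed by nvme_ufm_update() above: it
 * takes n_fwslot_mutex, calls ddi_ufm_update(), and frees any cached
 * nvme_fwslot_log_t, so the next invocation of this callback re-reads the
 * Firmware Slot Information log page.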
8185 */ 8186 static int 8187 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno, 8188 uint_t slotno, ddi_ufm_slot_t *slot) 8189 { 8190 nvme_t *nvme = arg; 8191 void *log = NULL; 8192 size_t bufsize; 8193 ddi_ufm_attr_t attr = 0; 8194 char fw_ver[NVME_FWVER_SZ + 1]; 8195 8196 if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1)) 8197 return (EINVAL); 8198 8199 mutex_enter(&nvme->n_fwslot_mutex); 8200 if (nvme->n_fwslot == NULL) { 8201 if (!nvme_get_logpage_int(nvme, B_TRUE, &log, &bufsize, 8202 NVME_LOGPAGE_FWSLOT) || 8203 bufsize != sizeof (nvme_fwslot_log_t)) { 8204 if (log != NULL) 8205 kmem_free(log, bufsize); 8206 mutex_exit(&nvme->n_fwslot_mutex); 8207 return (EIO); 8208 } 8209 nvme->n_fwslot = (nvme_fwslot_log_t *)log; 8210 } 8211 8212 /* 8213 * NVMe numbers firmware slots starting at 1 8214 */ 8215 if (slotno == (nvme->n_fwslot->fw_afi - 1)) 8216 attr |= DDI_UFM_ATTR_ACTIVE; 8217 8218 if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0) 8219 attr |= DDI_UFM_ATTR_WRITEABLE; 8220 8221 if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') { 8222 attr |= DDI_UFM_ATTR_EMPTY; 8223 } else { 8224 (void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno], 8225 NVME_FWVER_SZ); 8226 fw_ver[NVME_FWVER_SZ] = '\0'; 8227 ddi_ufm_slot_set_version(slot, fw_ver); 8228 } 8229 mutex_exit(&nvme->n_fwslot_mutex); 8230 8231 ddi_ufm_slot_set_attrs(slot, attr); 8232 8233 return (0); 8234 } 8235 8236 static int 8237 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps) 8238 { 8239 *caps = DDI_UFM_CAP_REPORT; 8240 return (0); 8241 } 8242 8243 boolean_t 8244 nvme_ctrl_atleast(nvme_t *nvme, const nvme_version_t *min) 8245 { 8246 return (nvme_vers_atleast(&nvme->n_version, min) ? B_TRUE : B_FALSE); 8247 } 8248
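/*
 * Usage sketch for the version gate above (illustrative only; the
 * nvme_vers_1v2 constant named here is an assumption and callers should
 * use whichever nvme_vers_* constant matches the feature being gated):
 *
 *	if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2)) {
 *		// it is now safe to rely on an NVMe 1.2+ capability
 *	}
 */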