xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme.c (revision 6fc89bfc8e69fd45d8778b2f0ad45efc0ded99ed)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
14  * Copyright 2019 Unix Software Ltd.
15  * Copyright 2020 Joyent, Inc.
16  * Copyright 2020 Racktop Systems.
17  * Copyright 2025 Oxide Computer Company.
18  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
19  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
20  */
21 
22 /*
23  * blkdev driver for NVMe compliant storage devices
24  *
25  * This driver targets and is designed to support all NVMe 1.x and NVMe 2.x
26  * devices. Features are added to the driver as we encounter devices that
27  * require them and our needs, so some commands or log pages may not take
28  * advantage of newer features that devices support at this time. When you
29  * encounter such a case, it is generally fine to add that support to the driver
30  * as long as you take care to ensure that the requisite device version is met
31  * before using it.
32  *
33  * The driver has only been tested on x86 systems and will not work on big-
34  * endian systems without changes to the code accessing registers and data
35  * structures used by the hardware.
36  *
37  * ---------------
38  * Interrupt Usage
39  * ---------------
40  *
41  * The driver will use a single interrupt while configuring the device as the
42  * specification requires, but contrary to the specification it will try to use
43  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
44  * will switch to multiple-message MSI(-X) if supported. The driver wants to
45  * have one interrupt vector per CPU, but it will work correctly if less are
46  * available. Interrupts can be shared by queues, the interrupt handler will
47  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
48  * the admin queue will share an interrupt with one I/O queue. The interrupt
49  * handler will retrieve completed commands from all queues sharing an interrupt
50  * vector and will post them to a taskq for completion processing.
51  *
52  * ------------------
53  * Command Processing
54  * ------------------
55  *
56  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
57  * to 65536 I/O commands. The driver will configure one I/O queue pair per
58  * available interrupt vector, with the queue length usually much smaller than
59  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
60  * interrupt vectors will be used.
61  *
62  * Additionally the hardware provides a single special admin queue pair that can
63  * hold up to 4096 admin commands.
64  *
65  * From the hardware perspective both queues of a queue pair are independent,
66  * but they share some driver state: the command array (holding pointers to
67  * commands currently being processed by the hardware) and the active command
68  * counter. Access to a submission queue and the shared state is protected by
69  * nq_mutex; completion queue is protected by ncq_mutex.
70  *
71  * When a command is submitted to a queue pair the active command counter is
72  * incremented and a pointer to the command is stored in the command array. The
73  * array index is used as command identifier (CID) in the submission queue
74  * entry. Some commands may take a very long time to complete, and if the queue
75  * wraps around in that time a submission may find the next array slot to still
76  * be used by a long-running command. In this case the array is sequentially
77  * searched for the next free slot. The length of the command array is the same
78  * as the configured queue length. Queue overrun is prevented by the semaphore,
79  * so a command submission may block if the queue is full.
80  *
81  * ------------------
82  * Polled I/O Support
83  * ------------------
84  *
85  * For kernel core dump support the driver can do polled I/O. As interrupts are
86  * turned off while dumping the driver will just submit a command in the regular
87  * way, and then repeatedly attempt a command retrieval until it gets the
88  * command back.
89  *
90  * -----------------
91  * Namespace Support
92  * -----------------
93  *
94  * NVMe devices can have multiple namespaces, each being a independent data
95  * store. The driver supports multiple namespaces and creates a blkdev interface
96  * for each namespace found. Namespaces can have various attributes to support
97  * protection information. This driver does not support any of this and ignores
98  * namespaces that have these attributes.
99  *
100  * As of NVMe 1.1 namespaces can have an 64bit Extended Unique Identifier
101  * (EUI64), and NVMe 1.2 introduced an additional 128bit Namespace Globally
102  * Unique Identifier (NGUID). This driver uses either the NGUID or the EUI64
103  * if present to generate the devid, and passes the EUI64 to blkdev to use it
104  * in the device node names.
105  *
106  * When a device has more than (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
107  * single controller, additional namespaces will not have minor nodes created.
108  * They can still be used and specified by the controller and libnvme. This
109  * limit is trying to balance the number of controllers and namespaces while
110  * fitting within the constraints of MAXMIN32, aka a 32-bit device number which
111  * only has 18-bits for the minor number. See the minor node section for more
112  * information.
113  *
114  * The driver supports namespace management, meaning the ability to create and
115  * destroy namespaces, and to attach and detach namespaces from controllers.
116  * Each namespace has an associated nvme_ns_state_t, which transitions through
117  * several states. The UNALLOCATED, ALLOCATED, and ACTIVE states are states that
118  * are defined by the NVMe specification. Not all ACTIVE namespaces may be
119  * attached to blkdev(4D) due to the use of features we don't support, for
120  * example, metadata protection. Such namespaces are automatically in the
121  * NOT_IGNORED state. Once they are attached to blkdev they enter the ATTACHED
122  * state.
123  *
124  * By default, a device can only transition one such state at a time. Each
125  * command that transitions between states has a corresponding array of errnos
126  * to use to transition. Examples of this are the nvme_ns_delete_states[],
127  * nvme_ctrl_attach_states[], etc. These dictate whether it is okay or not for a
128  * command that changes state to occur or not based on the current state. Each
129  * of these returns a specific error allowing one to understand why something
130  * isn't in the proper state. This allows library consumers to determine whether
131  * or not a namespace is already in the current state it's targeting to be
132  * ignored or not. The following diagram summarizes namespace transitions:
133  *
134  *                       +-------------+
135  *                       |             |
136  *                       | Unallocated |
137  *                       |             |
138  *                       +-------------+
139  *                          |       ^
140  *                          |       |
141  * Namespace Management: . .*       * . . . Namespace Management:
142  * Create                   |       |       Delete
143  * NVME_IOC_NS_CREATE       |       |       NVME_IOC_NS_DELETE
144  *                          v       |
145  *                       +-------------+
146  *                       |             |
147  *                       |  Allocated  |
148  *                       |             |
149  *                       +-------------+
150  *                          |       ^
151  *                          |       |
152  * Namespace Attachment: . .*       * . . . Namespace Attachment:
153  * Controller Attach        |       |       Controller Detach
154  * NVME_IOC_CTRL_ATTACH     |       |       NVME_IOC_CTRL_DETACH
155  *                          v       |
156  *              +------------+      |
157  *              |            |      |     +----------+
158  *              |   Active   |>-----+----<|   Not    |
159  *              |            |--*-------->| Ignored  |
160  *              +------------+  .         +----------+
161  *                              .           |      ^
162  *    automatic kernel transition           |      |
163  *                                          |      * . . blkdev Detach
164  *                       blkdev attach  . . *      |     NVME_IOC_BD_DETACH
165  *                       NVME_IOC_BD_ATTACH |      |
166  *                                          v      |
167  *                                        +----------+
168  *                                        |          |
169  *                                        |  blkdev  |
170  *                                        | attached |
171  *                                        |          |
172  *                                        +----------+
173  *
174  * -----------
175  * Minor nodes
176  * -----------
177  *
178  * For each NVMe device the driver exposes one minor node for the controller and
179  * one minor node for each namespace. The only operations supported by those
180  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
181  * primary control interface for the devices. The character device is a private
182  * interface and we attempt stability through libnvme and more so nvmeadm.
183  *
184  * The controller minor node is much more flexible than the namespace minor node
185  * and should be preferred. The controller node allows one to target any
186  * namespace that the device has, while the namespace is limited in what it can
187  * acquire. While the namespace minor exists, it should not be relied upon and
188  * is not by libnvme.
189  *
190  * The minor number space is split in two. We use the lower part to support the
191  * controller and namespaces as described above in the 'Namespace Support'
192  * section. The second set is used for cloning opens. We set aside one million
193  * minors for this purpose. We utilize a cloning open so that way we can have
194  * per-file_t state. This is how we end up implementing and tracking locking
195  * state and related.
196  *
197  * When we have this cloned open, then we allocate a new nvme_minor_t which gets
198  * its minor number from the nvme_open_minors id_space_t and is stored in the
199  * nvme_open_minors_avl. While someone calls open on a controller or namespace
200  * minor, everything else occurs in the context of one of these ephemeral
201  * minors.
202  *
203  * ------------------------------------
204  * ioctls, Errors, and Exclusive Access
205  * ------------------------------------
206  *
207  * All of the logical commands that one can issue are driven through the
208  * ioctl(9E) interface. All of our ioctls have a similar shape where they
209  * all include the 'nvme_ioctl_common_t' as their first member.
210  *
211  * This common ioctl structure is used to communicate the namespace that should
212  * be targeted. When the namespace is left as 0, then that indicates that it
213  * should target whatever the default is of the minor node. For a namespace
214  * minor, that will be transparently rewritten to the namespace's namespace id.
215  *
216  * In addition, the nvme_ioctl_common_t structure also has a standard error
217  * return. Our goal in our ioctl path is to ensure that we have useful semantic
218  * errors as much as possible. EINVAL, EIO, etc. are all overloaded. Instead as
219  * long as we can copy in our structure, then we will set a semantic error. If
220  * we have an error from the controller, then that will be included there.
221  *
222  * Each command has a specific policy that controls whether or not it is allowed
223  * on the namespace or controller minor, whether the broadcast namespace is
224  * allowed, various settings around what kind of exclusive access is allowed,
225  * and more. Each of these is wrapped up in a bit of policy described by the
226  * 'nvme_ioctl_check_t' structure.
227  *
228  * The device provides a form of exclusion in the form of both a
229  * controller-level and namespace-level read and write lock. Most operations do
230  * not require a lock (e.g. get log page, identify, etc.), but a few do (e.g.
231  * format nvm, firmware related activity, etc.). A read lock guarantees that you
232  * can complete your operation without interference, but read locks are not
233  * required. If you don't take a read lock and someone comes in with a write
234  * lock, then subsequent operations will fail with a semantic error indicating
235  * that you were blocked due to this.
236  *
237  * Here are some of the rules that govern our locks:
238  *
239  * 1. Writers starve readers. Any readers are allowed to finish when there is a
240  *    pending writer; however, all subsequent readers will be blocked upon that
241  *    writer.
242  * 2. A controller write lock takes priority over all other locks. Put
243  *    differently a controller writer not only starves subsequent controller
244  *    readers, but also all namespace read and write locks.
245  * 3. Each namespace lock is independent.
246  * 4. At most a single namespace lock may be owned.
247  * 5. If you own a namespace lock, you may not take a controller lock (to help
248  *    with lock ordering).
249  * 6. In a similar spirit, if you own a controller write lock, you may not take
250  *    any namespace lock. Someone with the controller write lock can perform any
251  *    operations that they need to. However, if you have a controller read lock
252  *    you may take any namespace lock.
253  * 7. There is no ability to upgrade a read lock to a write lock.
254  * 8. There is no recursive locking.
255  *
256  * While there's a lot there to keep track of, the goals of these are to
257  * constrain things so as to avoid deadlock. This is more complex than the
258  * original implementation in the driver which only allowed for an exclusive
259  * open that was tied to the thread. The first issue with tying this to the
260  * thread was that that didn't work well for software that utilized thread
261  * pools, like complex daemons. The second issue is that we want the ability for
262  * daemons, such as a FRU monitor, to be able to retain a file descriptor to the
263  * device without blocking others from taking action except during critical
264  * periods.
265  *
266  * In particular to enable something like libnvme, we didn't want someone to
267  * have to open and close the file descriptor to change what kind of exclusive
268  * access they desired.
269  *
270  * There are two different sets of data structures that we employ for tracking
271  * locking information:
272  *
273  * 1) The nvme_lock_t structure is contained in both the nvme_t and the
274  * nvme_namespace_t and tracks the current writer, readers, and pending writers
275  * and readers. Each of these lists or the writer pointer all refer to our
276  * second data structure.
277  *
278  * When a lock is owned by a single writer, then the nl_writer field is set to a
279  * specific minor's lock data structure. If instead readers are present, then
280  * the nl_readers list_t is not empty. An invariant of the system is that if
281  * nl_writer is non-NULL, nl_readers must be empty and conversely, if nl_readers
282  * is not empty, nl_writer must be NULL.
283  *
284  * 2) The nvme_minor_lock_info_t exists in the nvme_minor_t. There is one
285  * information structure which represents the minor's controller lock and a
286  * second one that represents the minor's namespace lock. The members of this
287  * are broken into tracking what the current lock is and what it targets. It
288  * also several members that are intended for debugging (nli_last_change,
289  * nli_acq_kthread, etc.).
290  *
291  * While the minor has two different lock information structures, our rules
292  * ensure that only one of the two can be pending and that they shouldn't result
293  * in a deadlock. When a lock is pending, the caller is sleeping on the minor's
294  * nm_cv member.
295  *
296  * These relationships are represented in the following image which shows a
297  * controller write lock being held with a pending readers on the controller
298  * lock and pending writers on one of the controller's namespaces.
299  *
300  *  +---------+
301  *  | nvme_t  |
302  *  |         |
303  *  | n_lock -|-------+
304  *  | n_ns -+ |       |                          +-----------------------------+
305  *  +-------|-+   +-----------------+            | nvme_minor_t                |
306  *          |     | nvme_lock_t     |            |                             |
307  *          |     |                 |            |  +------------------------+ |
308  *          |     | writer        --|-------------->| nvme_minor_lock_info_t | |
309  *          |     | reader list     |            |  | nm_ctrl_lock           | |
310  *          |     | pending writers |            |  +------------------------+ |
311  *          |     | pending readers |------+     |  +------------------------+ |
312  *          |     +-----------------+      |     |  | nvme_minor_lock_info_t | |
313  *          |                              |     |  | nm_ns_lock             | |
314  *          |                              |     |  +------------------------+ |
315  *          |                              |     +-----------------------------+
316  *  +------------------+                   |                 +-----------------+
317  *  | nvme_namespace_t |                   |                 | nvme_minor_t    |
318  *  |                  |                   |                 |                 |
319  *  | ns_lock ---+     |                   |                 | +-------------+ |
320  *  +------------|-----+                   +-----------------|>|nm_ctrl_lock | |
321  *               |                                           | +-------------+ |
322  *               v                                           +-----------------+
323  *     +------------------+                                         ...
324  *     | nvme_lock_t      |                                  +-----------------+
325  *     |                  |                                  | nvme_minor_t    |
326  *     | writer           |                                  |                 |
327  *     | reader list      |                                  | +-------------+ |
328  *     | pending writers -|-----------------+                | |nm_ctrl_lock | |
329  *     | pending readers  |                 |                | +-------------+ |
330  *     +------------------+                 |                +-----------------+
331  *         +-----------------------------+  |  +-----------------------------+
332  *         | nvme_minor_t                |  |  | nvme_minor_t                |
333  *         |                             |  |  |                             |
334  *         |  +------------------------+ |  |  |  +------------------------+ |
335  *         |  | nvme_minor_lock_info_t | |  |  |  | nvme_minor_lock_info_t | |
336  *         |  | nm_ctrl_lock           | |  |  |  | nm_ctrl_lock           | |
337  *         |  +------------------------+ |  |  |  +------------------------+ |
338  *         |  +------------------------+ |  v  |  +------------------------+ |
339  *         |  | nvme_minor_lock_info_t |-|-----|->| nvme_minor_lock_info_t | |
340  *         |  | nm_ns_lock             | |     |  | nm_ns_lock             | |
341  *         |  +------------------------+ |     |  +------------------------+ |
342  *         +-----------------------------+     +-----------------------------+
343  *
344  * ----------------
345  * Blkdev Interface
346  * ----------------
347  *
348  * This driver uses blkdev to do all the heavy lifting involved with presenting
349  * a disk device to the system. As a result, the processing of I/O requests is
350  * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
351  * setup, and splitting of transfers into manageable chunks.
352  *
353  * I/O requests coming in from blkdev are turned into NVM commands and posted to
354  * an I/O queue. The queue is selected by taking the CPU id modulo the number of
355  * queues. There is currently no timeout handling of I/O commands.
356  *
357  * Blkdev also supports querying device/media information and generating a
358  * devid. The driver reports the best block size as determined by the namespace
359  * format back to blkdev as physical block size to support partition and block
360  * alignment. The devid is either based on the namespace GUID or EUI64, if
361  * present, or composed using the device vendor ID, model number, serial number,
362  * and the namespace ID.
363  *
364  * --------------
365  * Error Handling
366  * --------------
367  *
368  * Error handling is currently limited to detecting fatal hardware errors,
369  * either by asynchronous events, or synchronously through command status or
370  * admin command timeouts. In case of severe errors the device is fenced off,
371  * all further requests will return EIO. FMA is then called to fault the device.
372  *
373  * The hardware has a limit for outstanding asynchronous event requests. Before
374  * this limit is known the driver assumes it is at least 1 and posts a single
375  * asynchronous request. Later when the limit is known more asynchronous event
376  * requests are posted to allow quicker reception of error information. When an
377  * asynchronous event is posted by the hardware the driver will parse the error
378  * status fields and log information or fault the device, depending on the
379  * severity of the asynchronous event. The asynchronous event request is then
380  * reused and posted to the admin queue again.
381  *
382  * On command completion the command status is checked for errors. In case of
383  * errors indicating a driver bug the driver panics. Almost all other error
384  * status values just cause EIO to be returned.
385  *
386  * Command timeouts are currently detected for all admin commands except
387  * asynchronous event requests. If a command times out and the hardware appears
388  * to be healthy the driver attempts to abort the command. The abort command
389  * timeout is a separate tunable but the original command timeout will be used
390  * if it is greater. If the abort times out too the driver assumes the device
391  * to be dead, fences it off, and calls FMA to retire it. In all other cases
392  * the aborted command should return immediately with a status indicating it
393  * was aborted, and the driver will wait indefinitely for that to happen. No
394  * timeout handling of normal I/O commands is presently done.
395  *
396  * Any command that times out due to the controller dropping dead will be put on
397  * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA
398  * memory being reused by the system and later being written to by a "dead"
399  * NVMe controller.
400  *
401  * -------
402  * Locking
403  * -------
404  *
405  * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
406  * when accessing shared state and submission queue registers, ncq_mutex
407  * is held when accessing completion queue state and registers.
408  * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
409  * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
410  * mutexes themselves.
411  *
412  * Each command also has its own nc_mutex, which is associated with the
413  * condition variable nc_cv. It is only used on admin commands which are run
414  * synchronously. In that case it must be held across calls to
415  * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
416  * nvme_admin_cmd(). It must also be held whenever the completion state of the
417  * command is changed or while an admin command timeout is handled.
418  *
419  * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
420  * More than one nc_mutex may only be held when aborting commands. In this case,
421  * the nc_mutex of the command to be aborted must be held across the call to
422  * nvme_abort_cmd() to prevent the command from completing while the abort is in
423  * progress.
424  *
425  * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
426  * acquired first. More than one nq_mutex is never held by a single thread.
427  * The ncq_mutex is only held by nvme_retrieve_cmd() and
428  * nvme_process_iocq(). nvme_process_iocq() is only called from the
429  * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
430  * mutex is non-contentious but is required for implementation completeness
431  * and safety.
432  *
433  * Each nvme_t has an n_admin_stat_mutex that protects the admin command
434  * statistics structure. If this is taken in conjunction with any other locks,
435  * then it must be taken last.
436  *
437  * There is one mutex n_minor_mutex which protects all open flags nm_open and
438  * exclusive-open thread pointers nm_oexcl of each minor node associated with a
439  * controller and its namespaces.
440  *
441  * In addition, there is a logical namespace management mutex which protects the
442  * data about namespaces. When interrogating the metadata of any namespace, this
443  * lock must be held. This gets tricky as we need to call into blkdev, which may
444  * issue callbacks into us which want this and it is illegal to hold locks
445  * across those blkdev calls as otherwise they might lead to deadlock (blkdev
446  * leverages ndi_devi_enter()).
447  *
448  * The lock exposes two levels, one that we call 'NVME' and one 'BDRO' or blkdev
449  * read-only. The idea is that most callers will use the NVME level which says
450  * this is a full traditional mutex operation. The BDRO level is used by blkdev
451  * callback functions and is a promise to only only read the data. When a blkdev
452  * operation starts, the lock holder will use nvme_mgmt_bd_start(). This
453  * strictly speaking drops the mutex, but records that the lock is logically
454  * held by the thread that did the start() operation.
455  *
456  * During this time, other threads (or even the same one) may end up calling
457  * into nvme_mgmt_lock(). Only one person may still hold the lock at any time;
458  * however, the BRDO level will be allowed to proceed during this time. This
459  * allows us to make consistent progress and honor the blkdev lock ordering
460  * requirements, albeit it is not as straightforward as a simple mutex.
461  *
462  * ---------------------
463  * Quiesce / Fast Reboot
464  * ---------------------
465  *
466  * The driver currently does not support fast reboot. A quiesce(9E) entry point
467  * is still provided which is used to send a shutdown notification to the
468  * device.
469  *
470  *
471  * ------------
472  * NVMe Hotplug
473  * ------------
474  *
475  * The driver supports hot removal. The driver uses the NDI event framework
476  * to register a callback, nvme_remove_callback, to clean up when a disk is
477  * removed. In particular, the driver will unqueue outstanding I/O commands and
478  * set n_dead on the softstate to true so that other operations, such as ioctls
479  * and command submissions, fail as well.
480  *
481  * While the callback registration relies on the NDI event framework, the
482  * removal event itself is kicked off in the PCIe hotplug framework, when the
483  * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
484  * device was removed from the slot.
485  *
486  * The NVMe driver instance itself will remain until the final close of the
487  * device.
488  *
489  * ---------------
490  * DDI UFM Support
491  * ---------------
492  *
493  * The driver supports the DDI UFM framework for reporting information about
494  * the device's firmware image and slot configuration. This data can be
495  * queried by userland software via ioctls to the ufm driver. For more
496  * information, see ddi_ufm(9E).
497  *
498  * --------------------
499  * Driver Configuration
500  * --------------------
501  *
502  * The following driver properties can be changed to control some aspects of the
503  * drivers operation:
504  * - strict-version: can be set to 0 to allow devices conforming to newer
505  *   major versions to be used
506  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
507  *   specific command status as a fatal error leading device faulting
508  * - admin-queue-len: the maximum length of the admin queue (16-4096)
509  * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
510  * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
511  * - async-event-limit: the maximum number of asynchronous event requests to be
512  *   posted by the driver
513  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
514  *   cache
515  * - min-phys-block-size: the minimum physical block size to report to blkdev,
516  *   which is among other things the basis for ZFS vdev ashift
517  * - max-submission-queues: the maximum number of I/O submission queues.
518  * - max-completion-queues: the maximum number of I/O completion queues,
519  *   can be less than max-submission-queues, in which case the completion
520  *   queues are shared.
521  *
522  * In addition to the above properties, some device-specific tunables can be
523  * configured using the nvme-config-list global property. The value of this
524  * property is a list of triplets. The formal syntax is:
525  *
526  *   nvme-config-list ::= <triplet> [, <triplet>]* ;
527  *   <triplet>        ::= "<model>" , "<rev-list>" , "<tuple-list>"
528  *   <rev-list>       ::= [ <fwrev> [, <fwrev>]*]
529  *   <tuple-list>     ::= <tunable> [, <tunable>]*
530  *   <tunable>        ::= <name> : <value>
531  *
532  * The <model> and <fwrev> are the strings in nvme_identify_ctrl_t`id_model and
533  * nvme_identify_ctrl_t`id_fwrev, respectively. The remainder of <tuple-list>
534  * contains one or more tunables to apply to all controllers that match the
535  * specified model number and optionally firmware revision. Each <tunable> is a
536  * <name> : <value> pair.  Supported tunables are:
537  *
538  * - ignore-unknown-vendor-status:  can be set to "on" to not handle any vendor
539  *   specific command status as a fatal error leading device faulting
540  *
541  * - min-phys-block-size: the minimum physical block size to report to blkdev,
542  *   which is among other things the basis for ZFS vdev ashift
543  *
544  * - volatile-write-cache: can be set to "on" or "off" to enable or disable the
545  *   volatile write cache, if present
546  *
547  *
548  * TODO:
549  * - figure out sane default for I/O queue depth reported to blkdev
550  * - FMA handling of media errors
551  * - support for devices supporting very large I/O requests using chained PRPs
552  * - support for configuring hardware parameters like interrupt coalescing
553  * - support for big-endian systems
554  * - support for fast reboot
555  * - support for NVMe Subsystem Reset (1.1)
556  * - support for Scatter/Gather lists (1.1)
557  * - support for Reservations (1.1)
558  * - support for power management
559  */
560 
561 #include <sys/byteorder.h>
562 #ifdef _BIG_ENDIAN
563 #error nvme driver needs porting for big-endian platforms
564 #endif
565 
566 #include <sys/modctl.h>
567 #include <sys/conf.h>
568 #include <sys/devops.h>
569 #include <sys/ddi.h>
570 #include <sys/ddi_ufm.h>
571 #include <sys/sunddi.h>
572 #include <sys/sunndi.h>
573 #include <sys/bitmap.h>
574 #include <sys/sysmacros.h>
575 #include <sys/param.h>
576 #include <sys/varargs.h>
577 #include <sys/cpuvar.h>
578 #include <sys/disp.h>
579 #include <sys/blkdev.h>
580 #include <sys/atomic.h>
581 #include <sys/archsystm.h>
582 #include <sys/sata/sata_hba.h>
583 #include <sys/stat.h>
584 #include <sys/policy.h>
585 #include <sys/list.h>
586 #include <sys/dkio.h>
587 #include <sys/pci.h>
588 #include <sys/mkdev.h>
589 
590 #include <sys/nvme.h>
591 
592 #ifdef __x86
593 #include <sys/x86_archext.h>
594 #endif
595 
596 #include "nvme_reg.h"
597 #include "nvme_var.h"
598 
599 /*
600  * Assertions to make sure that we've properly captured various aspects of the
601  * packed structures and haven't broken them during updates.
602  */
603 CTASSERT(sizeof (nvme_identify_ctrl_t) == NVME_IDENTIFY_BUFSIZE);
604 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
605 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
606 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
607 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
608 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
609 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
610 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
611 
612 CTASSERT(sizeof (nvme_identify_nsid_t) == NVME_IDENTIFY_BUFSIZE);
613 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
614 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
615 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
616 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
617 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
618 
619 CTASSERT(sizeof (nvme_identify_nsid_list_t) == NVME_IDENTIFY_BUFSIZE);
620 CTASSERT(sizeof (nvme_identify_ctrl_list_t) == NVME_IDENTIFY_BUFSIZE);
621 
622 CTASSERT(sizeof (nvme_identify_primary_caps_t) == NVME_IDENTIFY_BUFSIZE);
623 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
624 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
625 
626 CTASSERT(sizeof (nvme_nschange_list_t) == 4096);
627 
628 /* NVMe spec version supported */
629 static const int nvme_version_major = 2;
630 
631 /* Tunable for FORMAT NVM command timeout in seconds, default is 600s */
632 uint32_t nvme_format_cmd_timeout = 600;
633 
634 /* Tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
635 uint32_t nvme_commit_save_cmd_timeout = 15;
636 
637 /*
638  * Tunable for the admin command timeout used for commands other than those
639  * with their own timeouts defined above; in seconds. While most commands are
640  * expected to complete very quickly (sub-second), experience has shown that
641  * some controllers can occasionally be a bit slower, and not always consistent
642  * in the time taken - times of up to around 4.2s have been observed. Setting
643  * this to 15s by default provides headroom.
644  */
645 uint32_t nvme_admin_cmd_timeout = 15;
646 
647 /*
648  * Tunable for abort command timeout in seconds, default is 60s. This timeout
649  * is used when issuing an abort command, currently only in response to a
650  * different admin command timing out. Aborts always complete after the command
651  * that they are attempting to abort so we need to allow enough time for the
652  * controller to process the long running command that we are attempting to
653  * abort. The abort timeout here is only used if it is greater than the timeout
654  * for the command that is being aborted.
655  */
656 uint32_t nvme_abort_cmd_timeout = 60;
657 
658 /*
659  * Tunable for the size of arbitrary vendor specific admin commands,
660  * default is 16MiB.
661  */
662 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;
663 
664 /*
665  * Tunable for the max timeout of arbitary vendor specific admin commands,
666  * default is 60s.
667  */
668 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;
669 
670 /*
671  * This ID space, AVL, and lock are used for keeping track of minor state across
672  * opens between different devices.
673  */
674 static id_space_t *nvme_open_minors;
675 static avl_tree_t nvme_open_minors_avl;
676 kmutex_t nvme_open_minors_mutex;
677 
678 /*
679  * Removal taskq used for n_dead callback processing.
680  */
681 taskq_t *nvme_dead_taskq;
682 
683 /*
684  * This enumeration is used in tandem with nvme_mgmt_lock() to describe which
685  * form of the lock is being taken. See the theory statement for more context.
686  */
687 typedef enum {
688 	/*
689 	 * This is the primary form of taking the management lock and indicates
690 	 * that the user intends to do a read/write of it. This should always be
691 	 * used for any ioctl paths or truly anything other than a blkdev
692 	 * information operation.
693 	 */
694 	NVME_MGMT_LOCK_NVME,
695 	/*
696 	 * This is a subordinate form of the lock whereby the user is in blkdev
697 	 * callback context and will only intend to read the namespace data.
698 	 */
699 	NVME_MGMT_LOCK_BDRO
700 } nvme_mgmt_lock_level_t;
701 
702 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
703 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
704 static int nvme_quiesce(dev_info_t *);
705 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
706 static int nvme_setup_interrupts(nvme_t *, int, int);
707 static void nvme_release_interrupts(nvme_t *);
708 static uint_t nvme_intr(caddr_t, caddr_t);
709 
710 static void nvme_shutdown(nvme_t *, boolean_t);
711 static boolean_t nvme_reset(nvme_t *, boolean_t);
712 static int nvme_init(nvme_t *);
713 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
714 static void nvme_free_cmd(nvme_cmd_t *);
715 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
716     bd_xfer_t *);
717 static void nvme_admin_cmd(nvme_cmd_t *, uint32_t);
718 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *, uint32_t *);
719 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
720 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *, uint32_t *);
721 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
722 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
723 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
724 static void nvme_wakeup_cmd(void *);
725 static void nvme_async_event_task(void *);
726 
727 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
728 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
729 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
730 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
731 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
732 static inline int nvme_check_cmd_status(nvme_cmd_t *);
733 static boolean_t nvme_check_cmd_status_ioctl(nvme_cmd_t *,
734     nvme_ioctl_common_t *);
735 
736 static int nvme_abort_cmd(nvme_cmd_t *, const uint32_t);
737 static void nvme_async_event(nvme_t *);
738 static boolean_t nvme_format_nvm(nvme_t *, nvme_ioctl_format_t *);
739 static boolean_t nvme_get_logpage_int(nvme_t *, boolean_t, void **, size_t *,
740     uint8_t);
741 static boolean_t nvme_identify(nvme_t *, boolean_t, nvme_ioctl_identify_t *,
742     void **);
743 static boolean_t nvme_identify_int(nvme_t *, uint32_t, uint8_t, void **);
744 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
745     uint32_t *);
746 static int nvme_write_cache_set(nvme_t *, boolean_t);
747 static int nvme_set_nqueues(nvme_t *);
748 
749 static void nvme_free_dma(nvme_dma_t *);
750 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
751     nvme_dma_t **);
752 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
753     nvme_dma_t **);
754 static void nvme_free_qpair(nvme_qpair_t *);
755 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
756 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
757 
758 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
759 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
760 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
761 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
762 
763 static boolean_t nvme_check_regs_hdl(nvme_t *);
764 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
765 
766 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);
767 
768 static void nvme_bd_xfer_done(void *);
769 static void nvme_bd_driveinfo(void *, bd_drive_t *);
770 static int nvme_bd_mediainfo(void *, bd_media_t *);
771 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
772 static int nvme_bd_read(void *, bd_xfer_t *);
773 static int nvme_bd_write(void *, bd_xfer_t *);
774 static int nvme_bd_sync(void *, bd_xfer_t *);
775 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
776 static int nvme_bd_free_space(void *, bd_xfer_t *);
777 
778 static int nvme_prp_dma_constructor(void *, void *, int);
779 static void nvme_prp_dma_destructor(void *, void *);
780 
781 static void nvme_prepare_devid(nvme_t *, uint32_t);
782 
783 /* DDI UFM callbacks */
784 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
785     ddi_ufm_image_t *);
786 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
787     ddi_ufm_slot_t *);
788 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
789 
790 static int nvme_open(dev_t *, int, int, cred_t *);
791 static int nvme_close(dev_t, int, int, cred_t *);
792 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
793 
794 static int nvme_init_ns(nvme_t *, uint32_t);
795 static boolean_t nvme_bd_attach_ns(nvme_t *, nvme_ioctl_common_t *);
796 static boolean_t nvme_bd_detach_ns(nvme_t *, nvme_ioctl_common_t *);
797 
798 static int nvme_minor_comparator(const void *, const void *);
799 
800 typedef struct {
801 	nvme_sqe_t *ica_sqe;
802 	void *ica_data;
803 	uint32_t ica_data_len;
804 	uint_t ica_dma_flags;
805 	int ica_copy_flags;
806 	uint32_t ica_timeout;
807 	uint32_t ica_cdw0;
808 } nvme_ioc_cmd_args_t;
809 static boolean_t nvme_ioc_cmd(nvme_t *, nvme_ioctl_common_t *,
810     nvme_ioc_cmd_args_t *);
811 
812 static ddi_ufm_ops_t nvme_ufm_ops = {
813 	NULL,
814 	nvme_ufm_fill_image,
815 	nvme_ufm_fill_slot,
816 	nvme_ufm_getcaps
817 };
818 
819 /*
820  * Minor numbers are split amongst those used for controllers and for device
821  * opens. The number of controller minors are limited based open MAXMIN32 per
822  * the theory statement. We allocate 1 million minors as a total guess at a
823  * number that'll probably be enough. The starting point of the open minors can
824  * be shifted to accommodate future expansion of the NVMe device minors.
825  */
826 #define	NVME_MINOR_INST_SHIFT	9
827 #define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
828 #define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
829 #define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
830 #define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
831 
832 #define	NVME_OPEN_NMINORS		(1024 * 1024)
833 #define	NVME_OPEN_MINOR_MIN		(MAXMIN32 + 1)
834 #define	NVME_OPEN_MINOR_MAX_EXCL	(NVME_OPEN_MINOR_MIN + \
835     NVME_OPEN_NMINORS)
836 
837 #define	NVME_BUMP_STAT(nvme, stat)	\
838 	atomic_inc_64(&nvme->n_device_stat.nds_ ## stat.value.ui64)
839 
840 static void *nvme_state;
841 static kmem_cache_t *nvme_cmd_cache;
842 
843 /*
844  * DMA attributes for queue DMA memory
845  *
846  * Queue DMA memory must be page aligned. The maximum length of a queue is
847  * 65536 entries, and an entry can be 64 bytes long.
848  */
849 static const ddi_dma_attr_t nvme_queue_dma_attr = {
850 	.dma_attr_version	= DMA_ATTR_V0,
851 	.dma_attr_addr_lo	= 0,
852 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
853 	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
854 	.dma_attr_align		= 0x1000,
855 	.dma_attr_burstsizes	= 0x7ff,
856 	.dma_attr_minxfer	= 0x1000,
857 	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
858 	.dma_attr_seg		= 0xffffffffffffffffULL,
859 	.dma_attr_sgllen	= 1,
860 	.dma_attr_granular	= 1,
861 	.dma_attr_flags		= 0,
862 };
863 
864 /*
865  * DMA attributes for transfers using Physical Region Page (PRP) entries
866  *
867  * A PRP entry describes one page of DMA memory using the page size specified
868  * in the controller configuration's memory page size register (CC.MPS). It uses
869  * a 64bit base address aligned to this page size. There is no limitation on
870  * chaining PRPs together for arbitrarily large DMA transfers. These DMA
871  * attributes will be copied into the nvme_t during nvme_attach() and the
872  * dma_attr_maxxfer will be updated.
873  */
874 static const ddi_dma_attr_t nvme_prp_dma_attr = {
875 	.dma_attr_version	= DMA_ATTR_V0,
876 	.dma_attr_addr_lo	= 0,
877 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
878 	.dma_attr_count_max	= 0xfff,
879 	.dma_attr_align		= 0x1000,
880 	.dma_attr_burstsizes	= 0x7ff,
881 	.dma_attr_minxfer	= 0x1000,
882 	.dma_attr_maxxfer	= 0x1000,
883 	.dma_attr_seg		= 0xfff,
884 	.dma_attr_sgllen	= -1,
885 	.dma_attr_granular	= 1,
886 	.dma_attr_flags		= 0,
887 };
888 
889 /*
890  * DMA attributes for transfers using scatter/gather lists
891  *
892  * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
893  * 32bit length field. SGL Segment and SGL Last Segment entries require the
894  * length to be a multiple of 16 bytes. While the SGL DMA attributes are copied
895  * into the nvme_t, they are not currently used for any I/O.
896  */
897 static const ddi_dma_attr_t nvme_sgl_dma_attr = {
898 	.dma_attr_version	= DMA_ATTR_V0,
899 	.dma_attr_addr_lo	= 0,
900 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
901 	.dma_attr_count_max	= 0xffffffffUL,
902 	.dma_attr_align		= 1,
903 	.dma_attr_burstsizes	= 0x7ff,
904 	.dma_attr_minxfer	= 0x10,
905 	.dma_attr_maxxfer	= 0xfffffffffULL,
906 	.dma_attr_seg		= 0xffffffffffffffffULL,
907 	.dma_attr_sgllen	= -1,
908 	.dma_attr_granular	= 0x10,
909 	.dma_attr_flags		= 0
910 };
911 
912 static ddi_device_acc_attr_t nvme_reg_acc_attr = {
913 	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
914 	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
915 	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
916 };
917 
918 /*
919  * ioctl validation policies. These are policies that determine which namespaces
920  * are allowed or disallowed for various operations. Note, all policy items
921  * should be explicitly listed here to help make it clear what our intent is.
922  * That is also why some of these are identical or repeated when they cover
923  * different ioctls.
924  */
925 
926 /*
927  * The controller information ioctl generally contains read-only information
928  * about the controller that is sourced from multiple different pieces of
929  * information. This does not operate on a namespace and none are accepted.
930  */
931 static const nvme_ioctl_check_t nvme_check_ctrl_info = {
932 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
933 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
934 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
935 };
936 
937 /*
938  * The kernel namespace information requires a namespace ID to be specified. It
939  * does not allow for the broadcast ID to be specified.
940  */
941 static const nvme_ioctl_check_t nvme_check_ns_info = {
942 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
943 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
944 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
945 };
946 
947 /*
948  * Identify commands are allowed to operate on a namespace minor. Unfortunately,
949  * the namespace field in identify commands is a bit, weird. In particular, some
950  * commands need a valid namespace, while others are namespace listing
951  * operations, which means illegal namespaces like zero are allowed.
952  */
953 static const nvme_ioctl_check_t nvme_check_identify = {
954 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
955 	.nck_skip_ctrl = B_TRUE, .nck_ctrl_rewrite = B_FALSE,
956 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
957 };
958 
959 /*
960  * The get log page command requires the ability to specify namespaces. When
961  * targeting the controller, one must use the broadcast NSID.
962  */
963 static const nvme_ioctl_check_t nvme_check_get_logpage = {
964 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
965 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
966 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
967 };
968 
969 /*
970  * When getting a feature, we do not want rewriting behavior as most features do
971  * not require a namespace to be specified. Specific instances are checked in
972  * nvme_validate_get_feature().
973  */
974 static const nvme_ioctl_check_t nvme_check_get_feature = {
975 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
976 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
977 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
978 };
979 
980 /*
981  * Format commands must target a namespace. The broadcast namespace must be used
982  * when referring to the controller.
983  */
984 static const nvme_ioctl_check_t nvme_check_format = {
985 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
986 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
987 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_WRITE
988 };
989 
990 /*
991  * blkdev and controller attach and detach must always target a namespace.
992  * However, the broadcast namespace is not allowed. We still perform rewriting
993  * so that way specifying the controller node with 0 will be caught.
994  */
995 static const nvme_ioctl_check_t nvme_check_attach_detach = {
996 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
997 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
998 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
999 };
1000 
1001 /*
1002  * Namespace creation operations cannot target a namespace as the new namespace
1003  * ID will be returned in the operation. This operation requires the entire
1004  * controller lock to be owned as one has to coordinate this operation with all
1005  * of the actual namespace logic that's present.
1006  */
1007 static const nvme_ioctl_check_t nvme_check_ns_create = {
1008 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
1009 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
1010 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_CTRL
1011 };
1012 
1013 /*
1014  * NVMe namespace delete must always target a namespace. The broadcast namespace
1015  * isn't allowed. We perform rewriting so that way we can catch this.
1016  * Importantly this only requires holding an exclusive lock on the namespace,
1017  * not on the whole device like creating a namespace does. Note, we don't allow
1018  * this on the namespace minor itself as part of our path towards transitioning
1019  * away from its use.
1020  */
1021 static const nvme_ioctl_check_t nvme_check_ns_delete = {
1022 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
1023 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
1024 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
1025 };
1026 
1027 /*
1028  * Firmware operations must not target a namespace and are only allowed from the
1029  * controller.
1030  */
1031 static const nvme_ioctl_check_t nvme_check_firmware = {
1032 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
1033 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
1034 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
1035 };
1036 
1037 /*
1038  * Passthru commands are an odd set. We only allow them from the primary
1039  * controller; however, we allow a namespace to be specified in them and allow
1040  * the broadcast namespace. We do not perform rewriting because we don't know
1041  * what the semantics are. We explicitly exempt passthru commands from needing
1042  * an exclusive lock and leave it up to them to tell us the impact of the
1043  * command and semantics. As this is a privileged interface and the semantics
1044  * are arbitrary, there's not much we can do without some assistance from the
1045  * consumer.
1046  */
1047 static const nvme_ioctl_check_t nvme_check_passthru = {
1048 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
1049 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
1050 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
1051 };
1052 
1053 /*
1054  * Lock operations are allowed to target a namespace, but must not be rewritten.
1055  * There is no support for the broadcast namespace. This is the only ioctl that
1056  * should skip exclusive checking as it's used to grant it.
1057  */
1058 static const nvme_ioctl_check_t nvme_check_locking = {
1059 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
1060 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
1061 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_SKIP
1062 };
1063 
1064 /*
1065  * These data tables indicate how we handle the various states a namespace may
1066  * be in before we put it through the namespace state transition diagram. Note,
1067  * namespace creation does not allow one to specify a namespace ID, therefore
1068  * there it doesn't have a set of entries here.
1069  *
1070  * See Namespace Support in the theory statement for more information.
1071  */
1072 static const nvme_ioctl_errno_t nvme_ns_delete_states[] = {
1073 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1074 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
1075 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1076 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1077 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
1078 };
1079 
1080 static const nvme_ioctl_errno_t nvme_ctrl_attach_states[] = {
1081 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1082 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
1083 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1084 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1085 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
1086 };
1087 
1088 static const nvme_ioctl_errno_t nvme_ctrl_detach_states[] = {
1089 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1090 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
1091 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_OK,
1092 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
1093 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
1094 };
1095 
1096 static const nvme_ioctl_errno_t nvme_bd_attach_states[] = {
1097 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1098 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
1099 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_UNSUP_ATTACH_NS,
1100 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
1101 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH,
1102 };
1103 
1104 static const nvme_ioctl_errno_t nvme_bd_detach_states[] = {
1105 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1106 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_NS_CTRL_NOT_ATTACHED,
1107 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1108 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_NS_CTRL_ATTACHED,
1109 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_OK,
1110 };
1111 
1112 static const nvme_ioctl_errno_t nvme_format_nvm_states[] = {
1113 	[NVME_NS_STATE_UNALLOCATED] = NVME_IOCTL_E_NS_NO_NS,
1114 	[NVME_NS_STATE_ALLOCATED] = NVME_IOCTL_E_OK,
1115 	[NVME_NS_STATE_ACTIVE] = NVME_IOCTL_E_OK,
1116 	[NVME_NS_STATE_NOT_IGNORED] = NVME_IOCTL_E_OK,
1117 	[NVME_NS_STATE_ATTACHED] = NVME_IOCTL_E_NS_BLKDEV_ATTACH
1118 };
1119 
1120 static struct cb_ops nvme_cb_ops = {
1121 	.cb_open	= nvme_open,
1122 	.cb_close	= nvme_close,
1123 	.cb_strategy	= nodev,
1124 	.cb_print	= nodev,
1125 	.cb_dump	= nodev,
1126 	.cb_read	= nodev,
1127 	.cb_write	= nodev,
1128 	.cb_ioctl	= nvme_ioctl,
1129 	.cb_devmap	= nodev,
1130 	.cb_mmap	= nodev,
1131 	.cb_segmap	= nodev,
1132 	.cb_chpoll	= nochpoll,
1133 	.cb_prop_op	= ddi_prop_op,
1134 	.cb_str		= 0,
1135 	.cb_flag	= D_NEW | D_MP,
1136 	.cb_rev		= CB_REV,
1137 	.cb_aread	= nodev,
1138 	.cb_awrite	= nodev
1139 };
1140 
1141 static struct dev_ops nvme_dev_ops = {
1142 	.devo_rev	= DEVO_REV,
1143 	.devo_refcnt	= 0,
1144 	.devo_getinfo	= ddi_no_info,
1145 	.devo_identify	= nulldev,
1146 	.devo_probe	= nulldev,
1147 	.devo_attach	= nvme_attach,
1148 	.devo_detach	= nvme_detach,
1149 	.devo_reset	= nodev,
1150 	.devo_cb_ops	= &nvme_cb_ops,
1151 	.devo_bus_ops	= NULL,
1152 	.devo_power	= NULL,
1153 	.devo_quiesce	= nvme_quiesce,
1154 };
1155 
1156 static struct modldrv nvme_modldrv = {
1157 	.drv_modops	= &mod_driverops,
1158 	.drv_linkinfo	= "NVMe driver",
1159 	.drv_dev_ops	= &nvme_dev_ops
1160 };
1161 
1162 static struct modlinkage nvme_modlinkage = {
1163 	.ml_rev		= MODREV_1,
1164 	.ml_linkage	= { &nvme_modldrv, NULL }
1165 };
1166 
1167 static bd_ops_t nvme_bd_ops = {
1168 	.o_version	= BD_OPS_CURRENT_VERSION,
1169 	.o_drive_info	= nvme_bd_driveinfo,
1170 	.o_media_info	= nvme_bd_mediainfo,
1171 	.o_devid_init	= nvme_bd_devid,
1172 	.o_sync_cache	= nvme_bd_sync,
1173 	.o_read		= nvme_bd_read,
1174 	.o_write	= nvme_bd_write,
1175 	.o_free_space	= nvme_bd_free_space,
1176 };
1177 
1178 /*
1179  * This list will hold commands that have timed out and couldn't be aborted.
1180  * As we don't know what the hardware may still do with the DMA memory we can't
1181  * free them, so we'll keep them forever on this list where we can easily look
1182  * at them with mdb.
1183  */
1184 static struct list nvme_lost_cmds;
1185 static kmutex_t nvme_lc_mutex;
1186 
1187 int
1188 _init(void)
1189 {
1190 	int error;
1191 
1192 	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
1193 	if (error != DDI_SUCCESS)
1194 		return (error);
1195 
1196 	if ((nvme_open_minors = id_space_create("nvme_open_minors",
1197 	    NVME_OPEN_MINOR_MIN, NVME_OPEN_MINOR_MAX_EXCL)) == NULL) {
1198 		ddi_soft_state_fini(&nvme_state);
1199 		return (ENOMEM);
1200 	}
1201 
1202 	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
1203 	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
1204 
1205 	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
1206 	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
1207 	    offsetof(nvme_cmd_t, nc_list));
1208 
1209 	mutex_init(&nvme_open_minors_mutex, NULL, MUTEX_DRIVER, NULL);
1210 	avl_create(&nvme_open_minors_avl, nvme_minor_comparator,
1211 	    sizeof (nvme_minor_t), offsetof(nvme_minor_t, nm_avl));
1212 
1213 	nvme_dead_taskq = taskq_create("nvme_dead_taskq", 1, minclsyspri, 1, 1,
1214 	    TASKQ_PREPOPULATE);
1215 
1216 	bd_mod_init(&nvme_dev_ops);
1217 
1218 	error = mod_install(&nvme_modlinkage);
1219 	if (error != DDI_SUCCESS) {
1220 		ddi_soft_state_fini(&nvme_state);
1221 		id_space_destroy(nvme_open_minors);
1222 		mutex_destroy(&nvme_lc_mutex);
1223 		list_destroy(&nvme_lost_cmds);
1224 		bd_mod_fini(&nvme_dev_ops);
1225 		mutex_destroy(&nvme_open_minors_mutex);
1226 		avl_destroy(&nvme_open_minors_avl);
1227 		taskq_destroy(nvme_dead_taskq);
1228 	}
1229 
1230 	return (error);
1231 }
1232 
1233 int
1234 _fini(void)
1235 {
1236 	int error;
1237 
1238 	if (!list_is_empty(&nvme_lost_cmds))
1239 		return (DDI_FAILURE);
1240 
1241 	error = mod_remove(&nvme_modlinkage);
1242 	if (error == DDI_SUCCESS) {
1243 		ddi_soft_state_fini(&nvme_state);
1244 		id_space_destroy(nvme_open_minors);
1245 		kmem_cache_destroy(nvme_cmd_cache);
1246 		mutex_destroy(&nvme_lc_mutex);
1247 		list_destroy(&nvme_lost_cmds);
1248 		bd_mod_fini(&nvme_dev_ops);
1249 		mutex_destroy(&nvme_open_minors_mutex);
1250 		avl_destroy(&nvme_open_minors_avl);
1251 		taskq_destroy(nvme_dead_taskq);
1252 	}
1253 
1254 	return (error);
1255 }
1256 
1257 int
1258 _info(struct modinfo *modinfop)
1259 {
1260 	return (mod_info(&nvme_modlinkage, modinfop));
1261 }
1262 
1263 static inline void
1264 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
1265 {
1266 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1267 
1268 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1269 	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
1270 }
1271 
1272 static inline void
1273 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
1274 {
1275 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1276 
1277 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1278 	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
1279 }
1280 
1281 static inline uint64_t
1282 nvme_get64(nvme_t *nvme, uintptr_t reg)
1283 {
1284 	uint64_t val;
1285 
1286 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1287 
1288 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1289 	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
1290 
1291 	return (val);
1292 }
1293 
1294 static inline uint32_t
1295 nvme_get32(nvme_t *nvme, uintptr_t reg)
1296 {
1297 	uint32_t val;
1298 
1299 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1300 
1301 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1302 	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
1303 
1304 	return (val);
1305 }
1306 
1307 static void
1308 nvme_mgmt_lock_fini(nvme_mgmt_lock_t *lock)
1309 {
1310 	ASSERT3U(lock->nml_bd_own, ==, 0);
1311 	mutex_destroy(&lock->nml_lock);
1312 	cv_destroy(&lock->nml_cv);
1313 }
1314 
1315 static void
1316 nvme_mgmt_lock_init(nvme_mgmt_lock_t *lock)
1317 {
1318 	mutex_init(&lock->nml_lock, NULL, MUTEX_DRIVER, NULL);
1319 	cv_init(&lock->nml_cv, NULL, CV_DRIVER, NULL);
1320 	lock->nml_bd_own = 0;
1321 }
1322 
1323 static void
1324 nvme_mgmt_unlock(nvme_t *nvme)
1325 {
1326 	nvme_mgmt_lock_t *lock = &nvme->n_mgmt;
1327 
1328 	cv_broadcast(&lock->nml_cv);
1329 	mutex_exit(&lock->nml_lock);
1330 }
1331 
1332 static boolean_t
1333 nvme_mgmt_lock_held(const nvme_t *nvme)
1334 {
1335 	return (MUTEX_HELD(&nvme->n_mgmt.nml_lock) != 0);
1336 }
1337 
1338 static void
1339 nvme_mgmt_lock(nvme_t *nvme, nvme_mgmt_lock_level_t level)
1340 {
1341 	nvme_mgmt_lock_t *lock = &nvme->n_mgmt;
1342 	mutex_enter(&lock->nml_lock);
1343 	while (lock->nml_bd_own != 0) {
1344 		if (level == NVME_MGMT_LOCK_BDRO)
1345 			break;
1346 		cv_wait(&lock->nml_cv, &lock->nml_lock);
1347 	}
1348 }
1349 
1350 /*
1351  * This and nvme_mgmt_bd_end() are used to indicate that the driver is going to
1352  * be calling into a re-entrant blkdev related function. We cannot hold the lock
1353  * across such an operation and therefore must indicate that this is logically
1354  * held, while allowing other operations to proceed. This nvme_mgmt_bd_end() may
1355  * only be called by a thread that already holds the nmve_mgmt_lock().
1356  */
1357 static void
1358 nvme_mgmt_bd_start(nvme_t *nvme)
1359 {
1360 	nvme_mgmt_lock_t *lock = &nvme->n_mgmt;
1361 
1362 	VERIFY(MUTEX_HELD(&lock->nml_lock));
1363 	VERIFY3U(lock->nml_bd_own, ==, 0);
1364 	lock->nml_bd_own = (uintptr_t)curthread;
1365 	mutex_exit(&lock->nml_lock);
1366 }
1367 
1368 static void
1369 nvme_mgmt_bd_end(nvme_t *nvme)
1370 {
1371 	nvme_mgmt_lock_t *lock = &nvme->n_mgmt;
1372 
1373 	mutex_enter(&lock->nml_lock);
1374 	VERIFY3U(lock->nml_bd_own, ==, (uintptr_t)curthread);
1375 	lock->nml_bd_own = 0;
1376 }
1377 
1378 static boolean_t
1379 nvme_ns_state_check(const nvme_namespace_t *ns, nvme_ioctl_common_t *ioc,
1380     const nvme_ioctl_errno_t states[NVME_NS_NSTATES])
1381 {
1382 	VERIFY(nvme_mgmt_lock_held(ns->ns_nvme));
1383 	VERIFY3U(ns->ns_state, <, NVME_NS_NSTATES);
1384 
1385 	if (states[ns->ns_state] == NVME_IOCTL_E_OK) {
1386 		return (B_TRUE);
1387 	}
1388 
1389 	return (nvme_ioctl_error(ioc, states[ns->ns_state], 0, 0));
1390 }
1391 
1392 /*
1393  * This is a central clearing house for marking an NVMe controller dead and/or
1394  * removed. This takes care of setting the flag, taking care of outstanding
1395  * blocked locks, and sending a DDI FMA impact. This is called from a precarious
1396  * place where locking is suspect. The only guarantee we have is that the nvme_t
1397  * is valid and won't disappear until we return.
1398  */
1399 static void
1400 nvme_ctrl_mark_dead(nvme_t *nvme, boolean_t removed)
1401 {
1402 	boolean_t was_dead;
1403 
1404 	/*
1405 	 * See if we win the race to set things up here. If someone beat us to
1406 	 * it, we do not do anything.
1407 	 */
1408 	was_dead = atomic_cas_32((volatile uint32_t *)&nvme->n_dead, B_FALSE,
1409 	    B_TRUE);
1410 
1411 	/*
1412 	 * If we were removed, note this in our death status, regardless of
1413 	 * whether or not we were already dead.  We need to know this so that we
1414 	 * can decide if it is safe to try and interact the the device in e.g.
1415 	 * reset and shutdown.
1416 	 */
1417 	if (removed) {
1418 		nvme->n_dead_status = NVME_IOCTL_E_CTRL_GONE;
1419 	}
1420 
1421 	if (was_dead) {
1422 		return;
1423 	}
1424 
1425 	/*
1426 	 * If this was removed, there is no reason to change the service impact.
1427 	 * Otherwise, we need to change our default return code to indicate that
1428 	 * the device is truly dead, and not simply gone.
1429 	 */
1430 	if (!removed) {
1431 		ASSERT3U(nvme->n_dead_status, ==, NVME_IOCTL_E_CTRL_DEAD);
1432 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1433 	}
1434 
1435 	taskq_dispatch_ent(nvme_dead_taskq, nvme_rwlock_ctrl_dead, nvme,
1436 	    TQ_NOSLEEP, &nvme->n_dead_tqent);
1437 }
1438 
1439 static boolean_t
1440 nvme_ctrl_is_gone(const nvme_t *nvme)
1441 {
1442 	if (nvme->n_dead && nvme->n_dead_status == NVME_IOCTL_E_CTRL_GONE)
1443 		return (B_TRUE);
1444 
1445 	return (B_FALSE);
1446 }
1447 
1448 static boolean_t
1449 nvme_check_regs_hdl(nvme_t *nvme)
1450 {
1451 	ddi_fm_error_t error;
1452 
1453 	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
1454 
1455 	if (error.fme_status != DDI_FM_OK)
1456 		return (B_TRUE);
1457 
1458 	return (B_FALSE);
1459 }
1460 
1461 static boolean_t
1462 nvme_check_dma_hdl(nvme_dma_t *dma)
1463 {
1464 	ddi_fm_error_t error;
1465 
1466 	if (dma == NULL)
1467 		return (B_FALSE);
1468 
1469 	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
1470 
1471 	if (error.fme_status != DDI_FM_OK)
1472 		return (B_TRUE);
1473 
1474 	return (B_FALSE);
1475 }
1476 
1477 static void
1478 nvme_free_dma_common(nvme_dma_t *dma)
1479 {
1480 	if (dma->nd_dmah != NULL)
1481 		(void) ddi_dma_unbind_handle(dma->nd_dmah);
1482 	if (dma->nd_acch != NULL)
1483 		ddi_dma_mem_free(&dma->nd_acch);
1484 	if (dma->nd_dmah != NULL)
1485 		ddi_dma_free_handle(&dma->nd_dmah);
1486 }
1487 
1488 static void
1489 nvme_free_dma(nvme_dma_t *dma)
1490 {
1491 	nvme_free_dma_common(dma);
1492 	kmem_free(dma, sizeof (*dma));
1493 }
1494 
1495 static void
1496 nvme_prp_dma_destructor(void *buf, void *private __unused)
1497 {
1498 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1499 
1500 	nvme_free_dma_common(dma);
1501 }
1502 
1503 static int
1504 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
1505     size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
1506 {
1507 	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
1508 	    &dma->nd_dmah) != DDI_SUCCESS) {
1509 		/*
1510 		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
1511 		 * the only other possible error is DDI_DMA_BADATTR which
1512 		 * indicates a driver bug which should cause a panic.
1513 		 */
1514 		dev_err(nvme->n_dip, CE_PANIC,
1515 		    "!failed to get DMA handle, check DMA attributes");
1516 		return (DDI_FAILURE);
1517 	}
1518 
1519 	/*
1520 	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
1521 	 * or the flags are conflicting, which isn't the case here.
1522 	 */
1523 	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
1524 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
1525 	    &dma->nd_len, &dma->nd_acch);
1526 
1527 	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
1528 	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
1529 	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
1530 		dev_err(nvme->n_dip, CE_WARN,
1531 		    "!failed to bind DMA memory");
1532 		NVME_BUMP_STAT(nvme, dma_bind_err);
1533 		nvme_free_dma_common(dma);
1534 		return (DDI_FAILURE);
1535 	}
1536 
1537 	return (DDI_SUCCESS);
1538 }
1539 
1540 static int
1541 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
1542     ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
1543 {
1544 	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
1545 
1546 	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
1547 	    DDI_SUCCESS) {
1548 		*ret = NULL;
1549 		kmem_free(dma, sizeof (nvme_dma_t));
1550 		return (DDI_FAILURE);
1551 	}
1552 
1553 	bzero(dma->nd_memp, dma->nd_len);
1554 
1555 	*ret = dma;
1556 	return (DDI_SUCCESS);
1557 }
1558 
1559 static int
1560 nvme_prp_dma_constructor(void *buf, void *private, int flags __unused)
1561 {
1562 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1563 	nvme_t *nvme = (nvme_t *)private;
1564 
1565 	dma->nd_dmah = NULL;
1566 	dma->nd_acch = NULL;
1567 
1568 	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
1569 	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
1570 		return (-1);
1571 	}
1572 
1573 	ASSERT(dma->nd_ncookie == 1);
1574 
1575 	dma->nd_cached = B_TRUE;
1576 
1577 	return (0);
1578 }
1579 
1580 static int
1581 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
1582     uint_t flags, nvme_dma_t **dma)
1583 {
1584 	uint32_t len = nentry * qe_len;
1585 	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
1586 
1587 	len = roundup(len, nvme->n_pagesize);
1588 
1589 	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
1590 	    != DDI_SUCCESS) {
1591 		dev_err(nvme->n_dip, CE_WARN,
1592 		    "!failed to get DMA memory for queue");
1593 		goto fail;
1594 	}
1595 
1596 	if ((*dma)->nd_ncookie != 1) {
1597 		dev_err(nvme->n_dip, CE_WARN,
1598 		    "!got too many cookies for queue DMA");
1599 		goto fail;
1600 	}
1601 
1602 	return (DDI_SUCCESS);
1603 
1604 fail:
1605 	if (*dma) {
1606 		nvme_free_dma(*dma);
1607 		*dma = NULL;
1608 	}
1609 
1610 	return (DDI_FAILURE);
1611 }
1612 
1613 static void
1614 nvme_free_cq(nvme_cq_t *cq)
1615 {
1616 	mutex_destroy(&cq->ncq_mutex);
1617 
1618 	if (cq->ncq_cmd_taskq != NULL)
1619 		taskq_destroy(cq->ncq_cmd_taskq);
1620 
1621 	if (cq->ncq_dma != NULL)
1622 		nvme_free_dma(cq->ncq_dma);
1623 
1624 	kmem_free(cq, sizeof (*cq));
1625 }
1626 
1627 static void
1628 nvme_free_qpair(nvme_qpair_t *qp)
1629 {
1630 	int i;
1631 
1632 	mutex_destroy(&qp->nq_mutex);
1633 	sema_destroy(&qp->nq_sema);
1634 
1635 	if (qp->nq_sqdma != NULL)
1636 		nvme_free_dma(qp->nq_sqdma);
1637 
1638 	if (qp->nq_active_cmds > 0)
1639 		for (i = 0; i != qp->nq_nentry; i++)
1640 			if (qp->nq_cmd[i] != NULL)
1641 				nvme_free_cmd(qp->nq_cmd[i]);
1642 
1643 	if (qp->nq_cmd != NULL)
1644 		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
1645 
1646 	kmem_free(qp, sizeof (nvme_qpair_t));
1647 }
1648 
1649 /*
1650  * Destroy the pre-allocated cq array, but only free individual completion
1651  * queues from the given starting index.
1652  */
1653 static void
1654 nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
1655 {
1656 	uint_t i;
1657 
1658 	for (i = start; i < nvme->n_cq_count; i++)
1659 		if (nvme->n_cq[i] != NULL)
1660 			nvme_free_cq(nvme->n_cq[i]);
1661 
1662 	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
1663 }
1664 
1665 static int
1666 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
1667     uint_t nthr)
1668 {
1669 	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
1670 	char name[64];		/* large enough for the taskq name */
1671 
1672 	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
1673 	    DDI_INTR_PRI(nvme->n_intr_pri));
1674 
1675 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
1676 	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
1677 		goto fail;
1678 
1679 	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
1680 	cq->ncq_nentry = nentry;
1681 	cq->ncq_id = idx;
1682 	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
1683 
1684 	/*
1685 	 * Each completion queue has its own command taskq.
1686 	 */
1687 	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
1688 	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
1689 
1690 	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
1691 	    TASKQ_PREPOPULATE);
1692 
1693 	if (cq->ncq_cmd_taskq == NULL) {
1694 		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
1695 		    "taskq for cq %u", idx);
1696 		goto fail;
1697 	}
1698 
1699 	*cqp = cq;
1700 	return (DDI_SUCCESS);
1701 
1702 fail:
1703 	nvme_free_cq(cq);
1704 	*cqp = NULL;
1705 
1706 	return (DDI_FAILURE);
1707 }
1708 
1709 /*
1710  * Create the n_cq array big enough to hold "ncq" completion queues.
1711  * If the array already exists it will be re-sized (but only larger).
1712  * The admin queue is included in this array, which boosts the
1713  * max number of entries to UINT16_MAX + 1.
1714  */
1715 static int
1716 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
1717 {
1718 	nvme_cq_t **cq;
1719 	uint_t i, cq_count;
1720 
1721 	ASSERT3U(ncq, >, nvme->n_cq_count);
1722 
1723 	cq = nvme->n_cq;
1724 	cq_count = nvme->n_cq_count;
1725 
1726 	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
1727 	nvme->n_cq_count = ncq;
1728 
1729 	for (i = 0; i < cq_count; i++)
1730 		nvme->n_cq[i] = cq[i];
1731 
1732 	for (; i < nvme->n_cq_count; i++)
1733 		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
1734 		    DDI_SUCCESS)
1735 			goto fail;
1736 
1737 	if (cq != NULL)
1738 		kmem_free(cq, sizeof (*cq) * cq_count);
1739 
1740 	return (DDI_SUCCESS);
1741 
1742 fail:
1743 	nvme_destroy_cq_array(nvme, cq_count);
1744 	/*
1745 	 * Restore the original array
1746 	 */
1747 	nvme->n_cq_count = cq_count;
1748 	nvme->n_cq = cq;
1749 
1750 	return (DDI_FAILURE);
1751 }
1752 
1753 static int
1754 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
1755     uint_t idx)
1756 {
1757 	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
1758 	uint_t cq_idx;
1759 
1760 	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
1761 	    DDI_INTR_PRI(nvme->n_intr_pri));
1762 
1763 	/*
1764 	 * The NVMe spec defines that a full queue has one empty (unused) slot;
1765 	 * initialize the semaphore accordingly.
1766 	 */
1767 	sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
1768 
1769 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
1770 	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
1771 		goto fail;
1772 
1773 	/*
1774 	 * idx == 0 is adminq, those above 0 are shared io completion queues.
1775 	 */
1776 	cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
1777 	qp->nq_cq = nvme->n_cq[cq_idx];
1778 	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1779 	qp->nq_nentry = nentry;
1780 
1781 	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1782 
1783 	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1784 	qp->nq_next_cmd = 0;
1785 
1786 	*nqp = qp;
1787 	return (DDI_SUCCESS);
1788 
1789 fail:
1790 	nvme_free_qpair(qp);
1791 	*nqp = NULL;
1792 
1793 	return (DDI_FAILURE);
1794 }
1795 
1796 /*
1797  * One might reasonably consider that the nvme_cmd_cache should have a cache
1798  * constructor and destructor that takes care of the mutex/cv init/destroy, and
1799  * that nvme_free_cmd should reset more fields such that allocation becomes
1800  * simpler. This is not currently implemented as:
1801  * - nvme_cmd_cache is a global cache, shared across nvme instances and
1802  *   therefore there is no easy access to the corresponding nvme_t in the
1803  *   constructor to determine the required interrupt priority.
1804  * - Most fields in nvme_cmd_t would need to be zeroed in nvme_free_cmd while
1805  *   preserving the mutex/cv. It is easier to able to zero the entire
1806  *   structure and then init the mutex/cv only in the unlikely event that we
1807  *   want an admin command.
1808  */
1809 static nvme_cmd_t *
1810 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1811 {
1812 	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1813 
1814 	if (cmd != NULL) {
1815 		bzero(cmd, sizeof (nvme_cmd_t));
1816 		cmd->nc_nvme = nvme;
1817 	}
1818 
1819 	return (cmd);
1820 }
1821 
1822 static nvme_cmd_t *
1823 nvme_alloc_admin_cmd(nvme_t *nvme, int kmflag)
1824 {
1825 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, kmflag);
1826 
1827 	if (cmd != NULL) {
1828 		cmd->nc_flags |= NVME_CMD_F_USELOCK;
1829 		mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1830 		    DDI_INTR_PRI(nvme->n_intr_pri));
1831 		cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1832 	}
1833 
1834 	return (cmd);
1835 }
1836 
1837 static void
1838 nvme_free_cmd(nvme_cmd_t *cmd)
1839 {
1840 	/* Don't free commands on the lost commands list. */
1841 	if (list_link_active(&cmd->nc_list))
1842 		return;
1843 
1844 	if (cmd->nc_dma) {
1845 		nvme_free_dma(cmd->nc_dma);
1846 		cmd->nc_dma = NULL;
1847 	}
1848 
1849 	if (cmd->nc_prp) {
1850 		kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
1851 		cmd->nc_prp = NULL;
1852 	}
1853 
1854 	if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0) {
1855 		cv_destroy(&cmd->nc_cv);
1856 		mutex_destroy(&cmd->nc_mutex);
1857 	}
1858 
1859 	kmem_cache_free(nvme_cmd_cache, cmd);
1860 }
1861 
1862 static void
1863 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd, uint32_t *qtimeoutp)
1864 {
1865 	sema_p(&qp->nq_sema);
1866 	nvme_submit_cmd_common(qp, cmd, qtimeoutp);
1867 }
1868 
1869 static int
1870 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1871 {
1872 	if (cmd->nc_nvme->n_dead) {
1873 		return (EIO);
1874 	}
1875 
1876 	if (sema_tryp(&qp->nq_sema) == 0)
1877 		return (EAGAIN);
1878 
1879 	nvme_submit_cmd_common(qp, cmd, NULL);
1880 	return (0);
1881 }
1882 
1883 /*
1884  * Common command submission routine. If `qtimeoutp` is not NULL then it will
1885  * be set to the sum of the timeouts of any active commands ahead of the one
1886  * being submitted.
1887  */
1888 static void
1889 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd, uint32_t *qtimeoutp)
1890 {
1891 	nvme_reg_sqtdbl_t tail = { 0 };
1892 
1893 	/*
1894 	 * We don't need to take a lock on cmd since it is not yet enqueued.
1895 	 */
1896 	cmd->nc_submit_ts = gethrtime();
1897 	cmd->nc_state = NVME_CMD_SUBMITTED;
1898 
1899 	mutex_enter(&qp->nq_mutex);
1900 
1901 	/*
1902 	 * Now that we hold the queue pair lock, we must check whether or not
1903 	 * the controller has been listed as dead (e.g. was removed due to
1904 	 * hotplug). This is necessary as otherwise we could race with
1905 	 * nvme_remove_callback(). Because this has not been enqueued, we don't
1906 	 * call nvme_unqueue_cmd(), which is why we must manually decrement the
1907 	 * semaphore.
1908 	 */
1909 	if (cmd->nc_nvme->n_dead) {
1910 		cmd->nc_queue_ts = gethrtime();
1911 		cmd->nc_state = NVME_CMD_QUEUED;
1912 		taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
1913 		    cmd, TQ_NOSLEEP, &cmd->nc_tqent);
1914 		sema_v(&qp->nq_sema);
1915 		mutex_exit(&qp->nq_mutex);
1916 		return;
1917 	}
1918 
1919 	/*
1920 	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
1921 	 * slot. If the slot is already occupied advance to the next slot and
1922 	 * try again. This can happen for long running commands like async event
1923 	 * requests.
1924 	 */
1925 	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1926 		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1927 	qp->nq_cmd[qp->nq_next_cmd] = cmd;
1928 
1929 	/*
1930 	 * We keep track of the number of active commands in this queue, and
1931 	 * the sum of the timeouts for those active commands.
1932 	 */
1933 	qp->nq_active_cmds++;
1934 	if (qtimeoutp != NULL)
1935 		*qtimeoutp = qp->nq_active_timeout;
1936 	qp->nq_active_timeout += cmd->nc_timeout;
1937 
1938 	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1939 	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1940 	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1941 	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
1942 	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1943 	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1944 
1945 	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1946 	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1947 
1948 	mutex_exit(&qp->nq_mutex);
1949 }
1950 
1951 static nvme_cmd_t *
1952 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1953 {
1954 	nvme_cmd_t *cmd;
1955 
1956 	ASSERT(mutex_owned(&qp->nq_mutex));
1957 	ASSERT3S(cid, <, qp->nq_nentry);
1958 
1959 	cmd = qp->nq_cmd[cid];
1960 	/*
1961 	 * Some controllers will erroneously add things to the completion queue
1962 	 * for which there is no matching outstanding command. If this happens,
1963 	 * it is almost certainly a controller firmware bug since nq_mutex
1964 	 * is held across command submission and ringing the queue doorbell,
1965 	 * and is also held in this function.
1966 	 *
1967 	 * If we see such an unexpected command, there is not much we can do.
1968 	 * These will be logged and counted in nvme_get_completed(), but
1969 	 * otherwise ignored.
1970 	 */
1971 	if (cmd == NULL)
1972 		return (NULL);
1973 	qp->nq_cmd[cid] = NULL;
1974 	ASSERT3U(qp->nq_active_cmds, >, 0);
1975 	qp->nq_active_cmds--;
1976 	ASSERT3U(qp->nq_active_timeout, >=, cmd->nc_timeout);
1977 	qp->nq_active_timeout -= cmd->nc_timeout;
1978 	sema_v(&qp->nq_sema);
1979 
1980 	ASSERT3P(cmd, !=, NULL);
1981 	ASSERT3P(cmd->nc_nvme, ==, nvme);
1982 	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1983 
1984 	return (cmd);
1985 }
1986 
1987 /*
1988  * This is called when an admin abort has failed to complete, once for the
1989  * original command and once for the abort itself. At this point the controller
1990  * has been marked dead. The commands are considered lost, de-queued if
1991  * possible, and placed on a global lost commands list so that they cannot be
1992  * freed and so that any DMA memory they have have is not re-used.
1993  */
1994 static void
1995 nvme_lost_cmd(nvme_t *nvme, nvme_cmd_t *cmd)
1996 {
1997 	ASSERT(mutex_owned(&cmd->nc_mutex));
1998 
1999 	switch (cmd->nc_state) {
2000 	case NVME_CMD_SUBMITTED: {
2001 		nvme_qpair_t *qp = nvme->n_ioq[cmd->nc_sqid];
2002 
2003 		/*
2004 		 * The command is still in the submitted state, meaning that we
2005 		 * have not processed a completion queue entry for it. De-queue
2006 		 * should be successful and if the hardware does later report
2007 		 * completion we'll skip it as a command for which we aren't
2008 		 * expecting a response (see nvme_unqueue_cmd()).
2009 		 */
2010 		mutex_enter(&qp->nq_mutex);
2011 		(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
2012 		mutex_exit(&qp->nq_mutex);
2013 	}
2014 	case NVME_CMD_ALLOCATED:
2015 	case NVME_CMD_COMPLETED:
2016 		/*
2017 		 * If the command has not been submitted, or has completed,
2018 		 * there is nothing to do here. In the event of an abort
2019 		 * command timeout, we can end up here in the process of
2020 		 * "losing" the original command. It's possible that command
2021 		 * has actually completed (or been queued on the taskq) in the
2022 		 * interim.
2023 		 */
2024 		break;
2025 	case NVME_CMD_QUEUED:
2026 		/*
2027 		 * The command is on the taskq, awaiting callback. This should
2028 		 * be fairly rapid so wait for completion.
2029 		 */
2030 		while (cmd->nc_state != NVME_CMD_COMPLETED)
2031 			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
2032 		break;
2033 	case NVME_CMD_LOST:
2034 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
2035 		    "%s: command %p already lost", __func__, (void *)cmd);
2036 		break;
2037 	}
2038 
2039 	cmd->nc_state = NVME_CMD_LOST;
2040 
2041 	mutex_enter(&nvme_lc_mutex);
2042 	list_insert_head(&nvme_lost_cmds, cmd);
2043 	mutex_exit(&nvme_lc_mutex);
2044 }
2045 
2046 /*
2047  * Get the command tied to the next completed cqe and bump along completion
2048  * queue head counter.
2049  */
2050 static nvme_cmd_t *
2051 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
2052 {
2053 	nvme_qpair_t *qp;
2054 	nvme_cqe_t *cqe;
2055 	nvme_cmd_t *cmd;
2056 
2057 	ASSERT(mutex_owned(&cq->ncq_mutex));
2058 
2059 retry:
2060 	cqe = &cq->ncq_cq[cq->ncq_head];
2061 
2062 	/* Check phase tag of CQE. Hardware inverts it for new entries. */
2063 	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
2064 		return (NULL);
2065 
2066 	qp = nvme->n_ioq[cqe->cqe_sqid];
2067 
2068 	mutex_enter(&qp->nq_mutex);
2069 	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
2070 	mutex_exit(&qp->nq_mutex);
2071 
2072 	qp->nq_sqhead = cqe->cqe_sqhd;
2073 	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
2074 
2075 	/* Toggle phase on wrap-around. */
2076 	if (cq->ncq_head == 0)
2077 		cq->ncq_phase = cq->ncq_phase != 0 ? 0 : 1;
2078 
2079 	if (cmd == NULL) {
2080 		dev_err(nvme->n_dip, CE_WARN,
2081 		    "!received completion for unknown cid 0x%x", cqe->cqe_cid);
2082 		NVME_BUMP_STAT(nvme, unknown_cid);
2083 		/*
2084 		 * We want to ignore this unexpected completion entry as it
2085 		 * is most likely a result of a bug in the controller firmware.
2086 		 * However, if we return NULL, then callers will assume there
2087 		 * are no more pending commands for this wakeup. Retry to keep
2088 		 * enumerating commands until the phase tag indicates there are
2089 		 * no more and we are really done.
2090 		 */
2091 		goto retry;
2092 	}
2093 
2094 	ASSERT3U(cmd->nc_sqid, ==, cqe->cqe_sqid);
2095 	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
2096 
2097 	return (cmd);
2098 }
2099 
2100 /*
2101  * Process all completed commands on the io completion queue.
2102  */
2103 static uint_t
2104 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
2105 {
2106 	nvme_reg_cqhdbl_t head = { 0 };
2107 	nvme_cmd_t *cmd;
2108 	uint_t completed = 0;
2109 
2110 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
2111 	    DDI_SUCCESS)
2112 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
2113 		    __func__);
2114 
2115 	mutex_enter(&cq->ncq_mutex);
2116 
2117 	while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
2118 		/*
2119 		 * NVME_CMD_F_USELOCK is applied to all commands which are
2120 		 * going to be waited for by another thread in nvme_wait_cmd
2121 		 * and indicates that the lock should be taken before modifying
2122 		 * protected fields, and that the mutex has been initialised.
2123 		 * Commands which do not require the mutex to be held have not
2124 		 * initialised it (to reduce overhead).
2125 		 */
2126 		if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0) {
2127 			mutex_enter(&cmd->nc_mutex);
2128 			/*
2129 			 * The command could have been de-queued as lost while
2130 			 * we waited on the lock, in which case we drop it.
2131 			 */
2132 			if (cmd->nc_state == NVME_CMD_LOST) {
2133 				mutex_exit(&cmd->nc_mutex);
2134 				completed++;
2135 				continue;
2136 			}
2137 		}
2138 		cmd->nc_queue_ts = gethrtime();
2139 		cmd->nc_state = NVME_CMD_QUEUED;
2140 		if ((cmd->nc_flags & NVME_CMD_F_USELOCK) != 0)
2141 			mutex_exit(&cmd->nc_mutex);
2142 		taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
2143 		    TQ_NOSLEEP, &cmd->nc_tqent);
2144 
2145 		completed++;
2146 	}
2147 
2148 	if (completed > 0) {
2149 		/*
2150 		 * Update the completion queue head doorbell.
2151 		 */
2152 		head.b.cqhdbl_cqh = cq->ncq_head;
2153 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
2154 	}
2155 
2156 	mutex_exit(&cq->ncq_mutex);
2157 
2158 	return (completed);
2159 }
2160 
2161 static nvme_cmd_t *
2162 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
2163 {
2164 	nvme_cq_t *cq = qp->nq_cq;
2165 	nvme_reg_cqhdbl_t head = { 0 };
2166 	nvme_cmd_t *cmd;
2167 
2168 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
2169 	    DDI_SUCCESS)
2170 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
2171 		    __func__);
2172 
2173 	mutex_enter(&cq->ncq_mutex);
2174 
2175 	if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
2176 		head.b.cqhdbl_cqh = cq->ncq_head;
2177 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
2178 	}
2179 
2180 	mutex_exit(&cq->ncq_mutex);
2181 
2182 	return (cmd);
2183 }
2184 
2185 static int
2186 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
2187 {
2188 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2189 
2190 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2191 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
2192 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
2193 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
2194 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
2195 
2196 	if (cmd->nc_xfer != NULL)
2197 		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2198 
2199 	/*
2200 	 * User commands should never cause us to mark the controller dead.
2201 	 * Though whether we ever should mark it dead as there currently isn't a
2202 	 * useful recovery path is another question.
2203 	 */
2204 	if (((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) &&
2205 	    cmd->nc_nvme->n_strict_version) {
2206 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2207 	}
2208 
2209 	return (EIO);
2210 }
2211 
2212 static int
2213 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
2214 {
2215 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2216 
2217 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2218 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
2219 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
2220 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
2221 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
2222 	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
2223 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2224 	}
2225 
2226 	return (EIO);
2227 }
2228 
2229 static int
2230 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
2231 {
2232 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2233 
2234 	switch (cqe->cqe_sf.sf_sc) {
2235 	case NVME_CQE_SC_INT_NVM_WRITE:
2236 		/* write fail */
2237 		/* TODO: post ereport */
2238 		if (cmd->nc_xfer != NULL)
2239 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
2240 		return (EIO);
2241 
2242 	case NVME_CQE_SC_INT_NVM_READ:
2243 		/* read fail */
2244 		/* TODO: post ereport */
2245 		if (cmd->nc_xfer != NULL)
2246 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
2247 		return (EIO);
2248 
2249 	default:
2250 		return (nvme_check_unknown_cmd_status(cmd));
2251 	}
2252 }
2253 
2254 static int
2255 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
2256 {
2257 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2258 
2259 	switch (cqe->cqe_sf.sf_sc) {
2260 	case NVME_CQE_SC_GEN_SUCCESS:
2261 		return (0);
2262 
2263 	/*
2264 	 * Errors indicating a bug in the driver should cause a panic.
2265 	 */
2266 	case NVME_CQE_SC_GEN_INV_OPC:
2267 		/* Invalid Command Opcode */
2268 		NVME_BUMP_STAT(cmd->nc_nvme, inv_cmd_err);
2269 		if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) {
2270 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
2271 			    "programming error: invalid opcode in cmd %p",
2272 			    (void *)cmd);
2273 		}
2274 		return (EINVAL);
2275 
2276 	case NVME_CQE_SC_GEN_INV_FLD:
2277 		/* Invalid Field in Command */
2278 		NVME_BUMP_STAT(cmd->nc_nvme, inv_field_err);
2279 		if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) {
2280 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
2281 			    "programming error: invalid field in cmd %p",
2282 			    (void *)cmd);
2283 		}
2284 		return (EIO);
2285 
2286 	case NVME_CQE_SC_GEN_ID_CNFL:
2287 		/* Command ID Conflict */
2288 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
2289 		    "cmd ID conflict in cmd %p", (void *)cmd);
2290 		return (0);
2291 
2292 	case NVME_CQE_SC_GEN_INV_NS:
2293 		/* Invalid Namespace or Format */
2294 		NVME_BUMP_STAT(cmd->nc_nvme, inv_nsfmt_err);
2295 		if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) {
2296 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
2297 			    "programming error: invalid NS/format in cmd %p",
2298 			    (void *)cmd);
2299 		}
2300 		return (EINVAL);
2301 
2302 	case NVME_CQE_SC_GEN_CMD_SEQ_ERR:
2303 		/*
2304 		 * Command Sequence Error
2305 		 *
2306 		 * This can be generated normally by user log page requests that
2307 		 * come out of order (e.g. getting the persistent event log
2308 		 * without establishing the context). If the kernel manages this
2309 		 * on its own then that's problematic.
2310 		 */
2311 		NVME_BUMP_STAT(cmd->nc_nvme, inv_cmdseq_err);
2312 		if ((cmd->nc_flags & NVME_CMD_F_DONTPANIC) == 0) {
2313 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
2314 			    "programming error: command sequencing error %p",
2315 			    (void *)cmd);
2316 		}
2317 		return (EINVAL);
2318 
2319 	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
2320 		/* LBA Out Of Range */
2321 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
2322 		    "LBA out of range in cmd %p", (void *)cmd);
2323 		return (0);
2324 
2325 	/*
2326 	 * Non-fatal errors, handle gracefully.
2327 	 */
2328 	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
2329 		/* Data Transfer Error (DMA) */
2330 		/* TODO: post ereport */
2331 		NVME_BUMP_STAT(cmd->nc_nvme, data_xfr_err);
2332 		if (cmd->nc_xfer != NULL)
2333 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
2334 		return (EIO);
2335 
2336 	case NVME_CQE_SC_GEN_INTERNAL_ERR:
2337 		/*
2338 		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
2339 		 * detailed error information is returned as async event,
2340 		 * so we pretty much ignore the error here and handle it
2341 		 * in the async event handler.
2342 		 */
2343 		NVME_BUMP_STAT(cmd->nc_nvme, internal_err);
2344 		if (cmd->nc_xfer != NULL)
2345 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
2346 		return (EIO);
2347 
2348 	case NVME_CQE_SC_GEN_ABORT_REQUEST:
2349 		/*
2350 		 * Command Abort Requested. This normally happens only when a
2351 		 * command times out.
2352 		 */
2353 		/* TODO: post ereport or change blkdev to handle this? */
2354 		NVME_BUMP_STAT(cmd->nc_nvme, abort_rq_err);
2355 		return (ECANCELED);
2356 
2357 	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
2358 		/* Command Aborted due to Power Loss Notification */
2359 		NVME_BUMP_STAT(cmd->nc_nvme, abort_pwrloss_err);
2360 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2361 		return (EIO);
2362 
2363 	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
2364 		/* Command Aborted due to SQ Deletion */
2365 		NVME_BUMP_STAT(cmd->nc_nvme, abort_sq_del);
2366 		return (EIO);
2367 
2368 	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
2369 		/* Capacity Exceeded */
2370 		NVME_BUMP_STAT(cmd->nc_nvme, nvm_cap_exc);
2371 		if (cmd->nc_xfer != NULL)
2372 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
2373 		return (EIO);
2374 
2375 	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
2376 		/* Namespace Not Ready */
2377 		NVME_BUMP_STAT(cmd->nc_nvme, nvm_ns_notrdy);
2378 		if (cmd->nc_xfer != NULL)
2379 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
2380 		return (EIO);
2381 
2382 	case NVME_CQE_SC_GEN_NVM_FORMATTING:
2383 		/* Format in progress (1.2) */
2384 		if (!NVME_VERSION_ATLEAST(&cmd->nc_nvme->n_version, 1, 2))
2385 			return (nvme_check_unknown_cmd_status(cmd));
2386 		NVME_BUMP_STAT(cmd->nc_nvme, nvm_ns_formatting);
2387 		if (cmd->nc_xfer != NULL)
2388 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
2389 		return (EIO);
2390 
2391 	default:
2392 		return (nvme_check_unknown_cmd_status(cmd));
2393 	}
2394 }
2395 
2396 static int
2397 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
2398 {
2399 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2400 
2401 	switch (cqe->cqe_sf.sf_sc) {
2402 	case NVME_CQE_SC_SPC_INV_CQ:
2403 		/* Completion Queue Invalid */
2404 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
2405 		NVME_BUMP_STAT(cmd->nc_nvme, inv_cq_err);
2406 		return (EINVAL);
2407 
2408 	case NVME_CQE_SC_SPC_INV_QID:
2409 		/* Invalid Queue Identifier */
2410 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
2411 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
2412 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
2413 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
2414 		NVME_BUMP_STAT(cmd->nc_nvme, inv_qid_err);
2415 		return (EINVAL);
2416 
2417 	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
2418 		/* Max Queue Size Exceeded */
2419 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
2420 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
2421 		NVME_BUMP_STAT(cmd->nc_nvme, max_qsz_exc);
2422 		return (EINVAL);
2423 
2424 	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
2425 		/* Abort Command Limit Exceeded */
2426 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
2427 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
2428 		    "abort command limit exceeded in cmd %p", (void *)cmd);
2429 		return (0);
2430 
2431 	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
2432 		/* Async Event Request Limit Exceeded */
2433 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
2434 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
2435 		    "async event request limit exceeded in cmd %p",
2436 		    (void *)cmd);
2437 		return (0);
2438 
2439 	case NVME_CQE_SC_SPC_INV_INT_VECT:
2440 		/* Invalid Interrupt Vector */
2441 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
2442 		NVME_BUMP_STAT(cmd->nc_nvme, inv_int_vect);
2443 		return (EINVAL);
2444 
2445 	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
2446 		/* Invalid Log Page */
2447 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
2448 		NVME_BUMP_STAT(cmd->nc_nvme, inv_log_page);
2449 		return (EINVAL);
2450 
2451 	case NVME_CQE_SC_SPC_INV_FORMAT:
2452 		/* Invalid Format */
2453 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT ||
2454 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NS_MGMT);
2455 		NVME_BUMP_STAT(cmd->nc_nvme, inv_format);
2456 		if (cmd->nc_xfer != NULL)
2457 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2458 		return (EINVAL);
2459 
2460 	case NVME_CQE_SC_SPC_INV_Q_DEL:
2461 		/* Invalid Queue Deletion */
2462 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
2463 		NVME_BUMP_STAT(cmd->nc_nvme, inv_q_del);
2464 		return (EINVAL);
2465 
2466 	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
2467 		/* Conflicting Attributes */
2468 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
2469 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
2470 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2471 		NVME_BUMP_STAT(cmd->nc_nvme, cnfl_attr);
2472 		if (cmd->nc_xfer != NULL)
2473 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2474 		return (EINVAL);
2475 
2476 	case NVME_CQE_SC_SPC_NVM_INV_PROT:
2477 		/* Invalid Protection Information */
2478 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
2479 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
2480 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2481 		NVME_BUMP_STAT(cmd->nc_nvme, inv_prot);
2482 		if (cmd->nc_xfer != NULL)
2483 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2484 		return (EINVAL);
2485 
2486 	case NVME_CQE_SC_SPC_NVM_READONLY:
2487 		/* Write to Read Only Range */
2488 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2489 		NVME_BUMP_STAT(cmd->nc_nvme, readonly);
2490 		if (cmd->nc_xfer != NULL)
2491 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2492 		return (EROFS);
2493 
2494 	case NVME_CQE_SC_SPC_INV_FW_SLOT:
2495 		/* Invalid Firmware Slot */
2496 		NVME_BUMP_STAT(cmd->nc_nvme, inv_fwslot);
2497 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2498 		return (EINVAL);
2499 
2500 	case NVME_CQE_SC_SPC_INV_FW_IMG:
2501 		/* Invalid Firmware Image */
2502 		NVME_BUMP_STAT(cmd->nc_nvme, inv_fwimg);
2503 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2504 		return (EINVAL);
2505 
2506 	case NVME_CQE_SC_SPC_FW_RESET:
2507 		/* Conventional Reset Required */
2508 		NVME_BUMP_STAT(cmd->nc_nvme, fwact_creset);
2509 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2510 		return (0);
2511 
2512 	case NVME_CQE_SC_SPC_FW_NSSR:
2513 		/* NVMe Subsystem Reset Required */
2514 		NVME_BUMP_STAT(cmd->nc_nvme, fwact_nssr);
2515 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2516 		return (0);
2517 
2518 	case NVME_CQE_SC_SPC_FW_NEXT_RESET:
2519 		/* Activation Requires Reset */
2520 		NVME_BUMP_STAT(cmd->nc_nvme, fwact_reset);
2521 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2522 		return (0);
2523 
2524 	case NVME_CQE_SC_SPC_FW_MTFA:
2525 		/* Activation Requires Maximum Time Violation */
2526 		NVME_BUMP_STAT(cmd->nc_nvme, fwact_mtfa);
2527 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2528 		return (EAGAIN);
2529 
2530 	case NVME_CQE_SC_SPC_FW_PROHIBITED:
2531 		/* Activation Prohibited */
2532 		NVME_BUMP_STAT(cmd->nc_nvme, fwact_prohibited);
2533 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2534 		return (EINVAL);
2535 
2536 	case NVME_CQE_SC_SPC_FW_OVERLAP:
2537 		/* Overlapping Firmware Ranges */
2538 		NVME_BUMP_STAT(cmd->nc_nvme, fw_overlap);
2539 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD ||
2540 		    cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2541 		return (EINVAL);
2542 
2543 	case NVME_CQE_SC_SPC_NS_ATTACHED:
2544 		/* Namespace Already Attached */
2545 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2546 		NVME_BUMP_STAT(cmd->nc_nvme, ns_attached);
2547 		return (EEXIST);
2548 
2549 	case NVME_CQE_SC_SPC_NS_PRIV:
2550 		/* Namespace Is Private */
2551 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2552 		NVME_BUMP_STAT(cmd->nc_nvme, ns_priv);
2553 		return (EACCES);
2554 
2555 	case NVME_CQE_SC_SPC_NS_NOT_ATTACH:
2556 		/* Namespace Not Attached */
2557 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2558 		NVME_BUMP_STAT(cmd->nc_nvme, ns_not_attached);
2559 		return (ENOENT);
2560 
2561 	case NVME_CQE_SC_SPC_INV_CTRL_LIST:
2562 		/* Controller List Invalid */
2563 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2564 		NVME_BUMP_STAT(cmd->nc_nvme, ana_attach);
2565 		return (EINVAL);
2566 
2567 	case NVME_CQE_SC_SPC_ANA_ATTACH:
2568 		/* ANA Attach Failed */
2569 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2570 		NVME_BUMP_STAT(cmd->nc_nvme, ana_attach);
2571 		return (EIO);
2572 
2573 	case NVME_CQE_SC_SPC_NS_ATTACH_LIM:
2574 		/* Namespace Attachment Limit Exceeded */
2575 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NS_ATTACH);
2576 		NVME_BUMP_STAT(cmd->nc_nvme, ns_attach_lim);
2577 		return (EOVERFLOW);
2578 
2579 	default:
2580 		return (nvme_check_unknown_cmd_status(cmd));
2581 	}
2582 }
2583 
2584 static inline int
2585 nvme_check_cmd_status(nvme_cmd_t *cmd)
2586 {
2587 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2588 
2589 	/*
2590 	 * Take a shortcut if the controller is dead, or if
2591 	 * command status indicates no error.
2592 	 */
2593 	if (cmd->nc_nvme->n_dead)
2594 		return (EIO);
2595 
2596 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2597 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2598 		return (0);
2599 
2600 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
2601 		return (nvme_check_generic_cmd_status(cmd));
2602 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
2603 		return (nvme_check_specific_cmd_status(cmd));
2604 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
2605 		return (nvme_check_integrity_cmd_status(cmd));
2606 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
2607 		return (nvme_check_vendor_cmd_status(cmd));
2608 
2609 	return (nvme_check_unknown_cmd_status(cmd));
2610 }
2611 
2612 /*
2613  * Check the command status as used by an ioctl path and do not convert it to an
2614  * errno. We still allow all the command status checking to occur, but otherwise
2615  * will pass back the controller error as is.
2616  */
2617 static boolean_t
2618 nvme_check_cmd_status_ioctl(nvme_cmd_t *cmd, nvme_ioctl_common_t *ioc)
2619 {
2620 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2621 	nvme_t *nvme = cmd->nc_nvme;
2622 
2623 	if (nvme->n_dead) {
2624 		return (nvme_ioctl_error(ioc, nvme->n_dead_status, 0, 0));
2625 	}
2626 
2627 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2628 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2629 		return (B_TRUE);
2630 
2631 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) {
2632 		(void) nvme_check_generic_cmd_status(cmd);
2633 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) {
2634 		(void) nvme_check_specific_cmd_status(cmd);
2635 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) {
2636 		(void) nvme_check_integrity_cmd_status(cmd);
2637 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) {
2638 		(void) nvme_check_vendor_cmd_status(cmd);
2639 	} else {
2640 		(void) nvme_check_unknown_cmd_status(cmd);
2641 	}
2642 
2643 	return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_ERROR,
2644 	    cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_sc));
2645 }
2646 
2647 static int
2648 nvme_abort_cmd(nvme_cmd_t *cmd, const uint32_t sec)
2649 {
2650 	nvme_t *nvme = cmd->nc_nvme;
2651 	nvme_cmd_t *abort_cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
2652 	nvme_abort_cmd_t ac = { 0 };
2653 	int ret = 0;
2654 
2655 	sema_p(&nvme->n_abort_sema);
2656 
2657 	ac.b.ac_cid = cmd->nc_sqe.sqe_cid;
2658 	ac.b.ac_sqid = cmd->nc_sqid;
2659 
2660 	abort_cmd->nc_sqid = 0;
2661 	abort_cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
2662 	abort_cmd->nc_callback = nvme_wakeup_cmd;
2663 	abort_cmd->nc_sqe.sqe_cdw10 = ac.r;
2664 
2665 	/*
2666 	 * Send the ABORT to the hardware. The ABORT command will return _after_
2667 	 * the aborted command has completed (aborted or otherwise) so we must
2668 	 * drop the aborted command's lock to allow it to complete.
2669 	 * We want to allow at least `nvme_abort_cmd_timeout` seconds for the
2670 	 * abort to be processed, but more if we are aborting a long-running
2671 	 * command to give that time to complete/abort too.
2672 	 */
2673 	mutex_exit(&cmd->nc_mutex);
2674 	nvme_admin_cmd(abort_cmd, MAX(nvme_abort_cmd_timeout, sec));
2675 	mutex_enter(&cmd->nc_mutex);
2676 
2677 	sema_v(&nvme->n_abort_sema);
2678 
2679 	/* BEGIN CSTYLED */
2680 	/*
2681 	 * If the abort command itself has timed out, it will have been
2682 	 * de-queued so that its callback will not be called after this point,
2683 	 * and its state will be NVME_CMD_LOST.
2684 	 *
2685 	 * nvme_admin_cmd(abort_cmd)
2686 	 *   -> nvme_wait_cmd(abort_cmd)
2687 	 *     -> nvme_cmd(abort_cmd)
2688 	 *     | -> nvme_admin_cmd(cmd)
2689 	 *     |   -> nvme_wait_cmd(cmd)
2690 	 *     |     -> nvme_ctrl_mark_dead()
2691 	 *     |     -> nvme_lost_cmd(cmd)
2692 	 *     |       -> cmd->nc_stat = NVME_CMD_LOST
2693 	 *     and here we are.
2694 	 */
2695 	/* END CSTYLED */
2696 	if (abort_cmd->nc_state == NVME_CMD_LOST) {
2697 		dev_err(nvme->n_dip, CE_WARN,
2698 		    "!ABORT of command %d/%d timed out",
2699 		    cmd->nc_sqe.sqe_cid, cmd->nc_sqid);
2700 		NVME_BUMP_STAT(nvme, abort_timeout);
2701 		ret = EIO;
2702 	} else if ((ret = nvme_check_cmd_status(abort_cmd)) != 0) {
2703 		dev_err(nvme->n_dip, CE_WARN,
2704 		    "!ABORT of command %d/%d "
2705 		    "failed with sct = %x, sc = %x",
2706 		    cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2707 		    abort_cmd->nc_cqe.cqe_sf.sf_sct,
2708 		    abort_cmd->nc_cqe.cqe_sf.sf_sc);
2709 		NVME_BUMP_STAT(nvme, abort_failed);
2710 	} else {
2711 		boolean_t success = ((abort_cmd->nc_cqe.cqe_dw0 & 1) == 0);
2712 
2713 		dev_err(nvme->n_dip, CE_WARN,
2714 		    "!ABORT of command %d/%d %ssuccessful",
2715 		    cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2716 		    success ? "" : "un");
2717 
2718 		if (success) {
2719 			NVME_BUMP_STAT(nvme, abort_successful);
2720 		} else {
2721 			NVME_BUMP_STAT(nvme, abort_unsuccessful);
2722 		}
2723 	}
2724 
2725 	/*
2726 	 * This abort abort_cmd has either completed or been de-queued as
2727 	 * lost in nvme_wait_cmd. Either way it's safe to free it here.
2728 	 */
2729 	nvme_free_cmd(abort_cmd);
2730 
2731 	return (ret);
2732 }
2733 
2734 /*
2735  * nvme_wait_cmd -- wait for command completion or timeout
2736  *
2737  * In case of a serious error or a timeout of the abort command the hardware
2738  * will be declared dead and FMA will be notified.
2739  */
2740 static void
2741 nvme_wait_cmd(nvme_cmd_t *cmd, uint32_t sec)
2742 {
2743 	nvme_t *nvme = cmd->nc_nvme;
2744 	nvme_reg_csts_t csts;
2745 
2746 	ASSERT(mutex_owned(&cmd->nc_mutex));
2747 
2748 	while (cmd->nc_state != NVME_CMD_COMPLETED) {
2749 		clock_t timeout = ddi_get_lbolt() +
2750 		    drv_usectohz((long)sec * MICROSEC);
2751 
2752 		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) {
2753 			/*
2754 			 * If this command is on the task queue then we don't
2755 			 * consider it to have timed out. We are waiting for
2756 			 * the callback to be invoked, the timing of which can
2757 			 * be affected by system load and should not count
2758 			 * against the device; continue to wait.
2759 			 * While this doesn't help deal with the possibility of
2760 			 * a command timing out between being placed on the CQ
2761 			 * and arriving on the taskq, we expect interrupts to
2762 			 * run fairly promptly making this a small window.
2763 			 */
2764 			if (cmd->nc_state != NVME_CMD_QUEUED)
2765 				break;
2766 		}
2767 	}
2768 
2769 	if (cmd->nc_state == NVME_CMD_COMPLETED) {
2770 		DTRACE_PROBE1(nvme_admin_cmd_completed, nvme_cmd_t *, cmd);
2771 		nvme_admin_stat_cmd(nvme, cmd);
2772 		return;
2773 	}
2774 
2775 	/*
2776 	 * The command timed out.
2777 	 */
2778 
2779 	DTRACE_PROBE1(nvme_admin_cmd_timeout, nvme_cmd_t *, cmd);
2780 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2781 	dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
2782 	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2783 	    cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
2784 	NVME_BUMP_STAT(nvme, cmd_timeout);
2785 
2786 	/*
2787 	 * Check controller for fatal status, any errors associated with the
2788 	 * register or DMA handle, or for a double timeout (abort command timed
2789 	 * out). If necessary log a warning and call FMA.
2790 	 */
2791 	if (csts.b.csts_cfs ||
2792 	    nvme_check_regs_hdl(nvme) ||
2793 	    nvme_check_dma_hdl(cmd->nc_dma) ||
2794 	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
2795 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2796 		nvme_lost_cmd(nvme, cmd);
2797 		return;
2798 	}
2799 
2800 	/* Issue an abort for the command that has timed out */
2801 	if (nvme_abort_cmd(cmd, sec) == 0) {
2802 		/*
2803 		 * If the abort completed, whether or not it was
2804 		 * successful in aborting the command, that command
2805 		 * will also have completed with an appropriate
2806 		 * status.
2807 		 */
2808 		while (cmd->nc_state != NVME_CMD_COMPLETED)
2809 			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
2810 		return;
2811 	}
2812 
2813 	/*
2814 	 * Otherwise, the abort has also timed out or failed, which
2815 	 * will have marked the controller dead. De-queue the original command
2816 	 * and add it to the lost commands list.
2817 	 */
2818 	VERIFY(cmd->nc_nvme->n_dead);
2819 	nvme_lost_cmd(nvme, cmd);
2820 }
2821 
2822 static void
2823 nvme_wakeup_cmd(void *arg)
2824 {
2825 	nvme_cmd_t *cmd = arg;
2826 
2827 	ASSERT(cmd->nc_flags & NVME_CMD_F_USELOCK);
2828 
2829 	mutex_enter(&cmd->nc_mutex);
2830 	cmd->nc_state = NVME_CMD_COMPLETED;
2831 	cv_signal(&cmd->nc_cv);
2832 	mutex_exit(&cmd->nc_mutex);
2833 }
2834 
2835 static void
2836 nvme_async_event_task(void *arg)
2837 {
2838 	nvme_cmd_t *cmd = arg;
2839 	nvme_t *nvme = cmd->nc_nvme;
2840 	nvme_error_log_entry_t *error_log = NULL;
2841 	nvme_health_log_t *health_log = NULL;
2842 	nvme_nschange_list_t *nslist = NULL;
2843 	size_t logsize = 0;
2844 	nvme_async_event_t event;
2845 
2846 	/*
2847 	 * Check for errors associated with the async request itself. The only
2848 	 * command-specific error is "async event limit exceeded", which
2849 	 * indicates a programming error in the driver and causes a panic in
2850 	 * nvme_check_cmd_status().
2851 	 *
2852 	 * Other possible errors are various scenarios where the async request
2853 	 * was aborted, or internal errors in the device. Internal errors are
2854 	 * reported to FMA, the command aborts need no special handling here.
2855 	 *
2856 	 * And finally, at least qemu nvme does not support async events,
2857 	 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
2858 	 * will avoid posting async events.
2859 	 */
2860 
2861 	if (nvme_check_cmd_status(cmd) != 0) {
2862 		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2863 		    "!async event request returned failure, sct = 0x%x, "
2864 		    "sc = 0x%x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
2865 		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
2866 		    cmd->nc_cqe.cqe_sf.sf_m);
2867 
2868 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2869 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
2870 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2871 		}
2872 
2873 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2874 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
2875 		    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
2876 			nvme->n_async_event_supported = B_FALSE;
2877 		}
2878 
2879 		nvme_free_cmd(cmd);
2880 		return;
2881 	}
2882 
2883 	event.r = cmd->nc_cqe.cqe_dw0;
2884 
2885 	/* Clear CQE and re-submit the async request. */
2886 	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
2887 	nvme_submit_admin_cmd(nvme->n_adminq, cmd, NULL);
2888 	cmd = NULL;	/* cmd can no longer be used after resubmission */
2889 
2890 	switch (event.b.ae_type) {
2891 	case NVME_ASYNC_TYPE_ERROR:
2892 		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
2893 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2894 			    (void **)&error_log, &logsize,
2895 			    NVME_LOGPAGE_ERROR)) {
2896 				return;
2897 			}
2898 		} else {
2899 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2900 			    "async event reply: type=0x%x logpage=0x%x",
2901 			    event.b.ae_type, event.b.ae_logpage);
2902 			NVME_BUMP_STAT(nvme, wrong_logpage);
2903 			return;
2904 		}
2905 
2906 		switch (event.b.ae_info) {
2907 		case NVME_ASYNC_ERROR_INV_SQ:
2908 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2909 			    "invalid submission queue");
2910 			return;
2911 
2912 		case NVME_ASYNC_ERROR_INV_DBL:
2913 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2914 			    "invalid doorbell write value");
2915 			return;
2916 
2917 		case NVME_ASYNC_ERROR_DIAGFAIL:
2918 			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
2919 			nvme_ctrl_mark_dead(nvme, B_FALSE);
2920 			NVME_BUMP_STAT(nvme, diagfail_event);
2921 			break;
2922 
2923 		case NVME_ASYNC_ERROR_PERSISTENT:
2924 			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
2925 			    "device error");
2926 			nvme_ctrl_mark_dead(nvme, B_FALSE);
2927 			NVME_BUMP_STAT(nvme, persistent_event);
2928 			break;
2929 
2930 		case NVME_ASYNC_ERROR_TRANSIENT:
2931 			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
2932 			    "device error");
2933 			/* TODO: send ereport */
2934 			NVME_BUMP_STAT(nvme, transient_event);
2935 			break;
2936 
2937 		case NVME_ASYNC_ERROR_FW_LOAD:
2938 			dev_err(nvme->n_dip, CE_WARN,
2939 			    "!firmware image load error");
2940 			NVME_BUMP_STAT(nvme, fw_load_event);
2941 			break;
2942 		}
2943 		break;
2944 
2945 	case NVME_ASYNC_TYPE_HEALTH:
2946 		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
2947 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2948 			    (void **)&health_log, &logsize,
2949 			    NVME_LOGPAGE_HEALTH)) {
2950 				return;
2951 			}
2952 		} else {
2953 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2954 			    "type=0x%x logpage=0x%x", event.b.ae_type,
2955 			    event.b.ae_logpage);
2956 			NVME_BUMP_STAT(nvme, wrong_logpage);
2957 			return;
2958 		}
2959 
2960 		switch (event.b.ae_info) {
2961 		case NVME_ASYNC_HEALTH_RELIABILITY:
2962 			dev_err(nvme->n_dip, CE_WARN,
2963 			    "!device reliability compromised");
2964 			/* TODO: send ereport */
2965 			NVME_BUMP_STAT(nvme, reliability_event);
2966 			break;
2967 
2968 		case NVME_ASYNC_HEALTH_TEMPERATURE:
2969 			dev_err(nvme->n_dip, CE_WARN,
2970 			    "!temperature above threshold");
2971 			/* TODO: send ereport */
2972 			NVME_BUMP_STAT(nvme, temperature_event);
2973 			break;
2974 
2975 		case NVME_ASYNC_HEALTH_SPARE:
2976 			dev_err(nvme->n_dip, CE_WARN,
2977 			    "!spare space below threshold");
2978 			/* TODO: send ereport */
2979 			NVME_BUMP_STAT(nvme, spare_event);
2980 			break;
2981 		}
2982 		break;
2983 
2984 	case NVME_ASYNC_TYPE_NOTICE:
2985 		switch (event.b.ae_info) {
2986 		case NVME_ASYNC_NOTICE_NS_CHANGE:
2987 			if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) {
2988 				dev_err(nvme->n_dip, CE_WARN,
2989 				    "!wrong logpage in async event reply: "
2990 				    "type=0x%x logpage=0x%x",
2991 				    event.b.ae_type, event.b.ae_logpage);
2992 				NVME_BUMP_STAT(nvme, wrong_logpage);
2993 				break;
2994 			}
2995 
2996 			dev_err(nvme->n_dip, CE_NOTE,
2997 			    "namespace attribute change event, "
2998 			    "logpage = 0x%x", event.b.ae_logpage);
2999 			NVME_BUMP_STAT(nvme, notice_event);
3000 
3001 			if (!nvme_get_logpage_int(nvme, B_FALSE,
3002 			    (void **)&nslist, &logsize,
3003 			    NVME_LOGPAGE_NSCHANGE)) {
3004 				break;
3005 			}
3006 
3007 			if (nslist->nscl_ns[0] == UINT32_MAX) {
3008 				dev_err(nvme->n_dip, CE_CONT,
3009 				    "more than %u namespaces have changed.\n",
3010 				    NVME_NSCHANGE_LIST_SIZE);
3011 				break;
3012 			}
3013 
3014 			nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
3015 			for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) {
3016 				uint32_t nsid = nslist->nscl_ns[i];
3017 				nvme_namespace_t *ns;
3018 
3019 				if (nsid == 0)	/* end of list */
3020 					break;
3021 
3022 				dev_err(nvme->n_dip, CE_NOTE,
3023 				    "!namespace nvme%d/%u has changed.",
3024 				    ddi_get_instance(nvme->n_dip), nsid);
3025 
3026 				if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
3027 					continue;
3028 
3029 				ns = nvme_nsid2ns(nvme, nsid);
3030 				if (ns->ns_state <= NVME_NS_STATE_NOT_IGNORED)
3031 					continue;
3032 
3033 				nvme_mgmt_bd_start(nvme);
3034 				bd_state_change(ns->ns_bd_hdl);
3035 				nvme_mgmt_bd_end(nvme);
3036 			}
3037 			nvme_mgmt_unlock(nvme);
3038 
3039 			break;
3040 
3041 		case NVME_ASYNC_NOTICE_FW_ACTIVATE:
3042 			dev_err(nvme->n_dip, CE_NOTE,
3043 			    "firmware activation starting, "
3044 			    "logpage = 0x%x", event.b.ae_logpage);
3045 			NVME_BUMP_STAT(nvme, notice_event);
3046 			break;
3047 
3048 		case NVME_ASYNC_NOTICE_TELEMETRY:
3049 			dev_err(nvme->n_dip, CE_NOTE,
3050 			    "telemetry log changed, "
3051 			    "logpage = 0x%x", event.b.ae_logpage);
3052 			NVME_BUMP_STAT(nvme, notice_event);
3053 			break;
3054 
3055 		case NVME_ASYNC_NOTICE_NS_ASYMM:
3056 			dev_err(nvme->n_dip, CE_NOTE,
3057 			    "asymmetric namespace access change, "
3058 			    "logpage = 0x%x", event.b.ae_logpage);
3059 			NVME_BUMP_STAT(nvme, notice_event);
3060 			break;
3061 
3062 		case NVME_ASYNC_NOTICE_LATENCYLOG:
3063 			dev_err(nvme->n_dip, CE_NOTE,
3064 			    "predictable latency event aggregate log change, "
3065 			    "logpage = 0x%x", event.b.ae_logpage);
3066 			NVME_BUMP_STAT(nvme, notice_event);
3067 			break;
3068 
3069 		case NVME_ASYNC_NOTICE_LBASTATUS:
3070 			dev_err(nvme->n_dip, CE_NOTE,
3071 			    "LBA status information alert, "
3072 			    "logpage = 0x%x", event.b.ae_logpage);
3073 			NVME_BUMP_STAT(nvme, notice_event);
3074 			break;
3075 
3076 		case NVME_ASYNC_NOTICE_ENDURANCELOG:
3077 			dev_err(nvme->n_dip, CE_NOTE,
3078 			    "endurance group event aggregate log page change, "
3079 			    "logpage = 0x%x", event.b.ae_logpage);
3080 			NVME_BUMP_STAT(nvme, notice_event);
3081 			break;
3082 
3083 		default:
3084 			dev_err(nvme->n_dip, CE_WARN,
3085 			    "!unknown notice async event received, "
3086 			    "info = 0x%x, logpage = 0x%x", event.b.ae_info,
3087 			    event.b.ae_logpage);
3088 			NVME_BUMP_STAT(nvme, unknown_event);
3089 			break;
3090 		}
3091 		break;
3092 
3093 	case NVME_ASYNC_TYPE_VENDOR:
3094 		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
3095 		    "received, info = 0x%x, logpage = 0x%x", event.b.ae_info,
3096 		    event.b.ae_logpage);
3097 		NVME_BUMP_STAT(nvme, vendor_event);
3098 		break;
3099 
3100 	default:
3101 		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
3102 		    "type = 0x%x, info = 0x%x, logpage = 0x%x", event.b.ae_type,
3103 		    event.b.ae_info, event.b.ae_logpage);
3104 		NVME_BUMP_STAT(nvme, unknown_event);
3105 		break;
3106 	}
3107 
3108 	if (error_log != NULL)
3109 		kmem_free(error_log, logsize);
3110 
3111 	if (health_log != NULL)
3112 		kmem_free(health_log, logsize);
3113 
3114 	if (nslist != NULL)
3115 		kmem_free(nslist, logsize);
3116 }
3117 
3118 static void
3119 nvme_admin_cmd(nvme_cmd_t *cmd, uint32_t sec)
3120 {
3121 	uint32_t qtimeout;
3122 
3123 	ASSERT(cmd->nc_flags & NVME_CMD_F_USELOCK);
3124 
3125 	mutex_enter(&cmd->nc_mutex);
3126 	cmd->nc_timeout = sec;
3127 	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd, &qtimeout);
3128 	/*
3129 	 * We will wait for a total of this command's specified timeout plus
3130 	 * the sum of the timeouts of any commands queued ahead of this one. If
3131 	 * we aren't first in the queue, this will inflate the timeout somewhat
3132 	 * but these times are not critical and it means that if we get stuck
3133 	 * behind a long running command such as a namespace format then we
3134 	 * won't time out and trigger an abort.
3135 	 */
3136 	nvme_wait_cmd(cmd, sec + qtimeout);
3137 	mutex_exit(&cmd->nc_mutex);
3138 }
3139 
3140 static void
3141 nvme_async_event(nvme_t *nvme)
3142 {
3143 	nvme_cmd_t *cmd;
3144 
3145 	cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3146 	cmd->nc_sqid = 0;
3147 	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
3148 	cmd->nc_callback = nvme_async_event_task;
3149 	cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
3150 
3151 	nvme_submit_admin_cmd(nvme->n_adminq, cmd, NULL);
3152 }
3153 
3154 /*
3155  * There are commands such as format or vendor unique commands that are going to
3156  * manipulate the data in a namespace or destroy them, we make sure that none of
3157  * the ones that will be impacted are actually attached.
3158  */
3159 static boolean_t
3160 nvme_no_blkdev_attached(nvme_t *nvme, uint32_t nsid)
3161 {
3162 	ASSERT(nvme_mgmt_lock_held(nvme));
3163 	ASSERT3U(nsid, !=, 0);
3164 
3165 	if (nsid != NVME_NSID_BCAST) {
3166 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
3167 		return (ns->ns_state < NVME_NS_STATE_ATTACHED);
3168 	}
3169 
3170 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
3171 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
3172 
3173 		if (ns->ns_state >= NVME_NS_STATE_ATTACHED) {
3174 			return (B_FALSE);
3175 		}
3176 	}
3177 
3178 	return (B_TRUE);
3179 }
3180 
3181 static boolean_t
3182 nvme_format_nvm(nvme_t *nvme, nvme_ioctl_format_t *ioc)
3183 {
3184 	nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3185 	nvme_format_nvm_t format_nvm = { 0 };
3186 	boolean_t ret;
3187 
3188 	format_nvm.b.fm_lbaf = bitx32(ioc->nif_lbaf, 3, 0);
3189 	format_nvm.b.fm_ses = bitx32(ioc->nif_ses, 2, 0);
3190 
3191 	cmd->nc_sqid = 0;
3192 	cmd->nc_callback = nvme_wakeup_cmd;
3193 	cmd->nc_sqe.sqe_nsid = ioc->nif_common.nioc_nsid;
3194 	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
3195 	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
3196 
3197 	/*
3198 	 * We don't want to panic on any format commands. There are two reasons
3199 	 * for this:
3200 	 *
3201 	 * 1) All format commands are initiated by users. We don't want to panic
3202 	 * on user commands.
3203 	 *
3204 	 * 2) Several devices like the Samsung SM951 don't allow formatting of
3205 	 * all namespaces in one command and we'd prefer to handle that
3206 	 * gracefully.
3207 	 */
3208 	cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
3209 
3210 	nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
3211 
3212 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nif_common) != 0) {
3213 		dev_err(nvme->n_dip, CE_WARN,
3214 		    "!FORMAT failed with sct = %x, sc = %x",
3215 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3216 		ret = B_FALSE;
3217 		goto fail;
3218 	}
3219 
3220 	ret = B_TRUE;
3221 fail:
3222 	nvme_free_cmd(cmd);
3223 	return (ret);
3224 }
3225 
3226 /*
3227  * Retrieve a specific log page. The contents of the log page request should
3228  * have already been validated by the system.
3229  */
3230 static boolean_t
3231 nvme_get_logpage(nvme_t *nvme, boolean_t user, nvme_ioctl_get_logpage_t *log,
3232     void **buf)
3233 {
3234 	nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3235 	nvme_getlogpage_dw10_t dw10;
3236 	uint32_t offlo, offhi;
3237 	nvme_getlogpage_dw11_t dw11;
3238 	nvme_getlogpage_dw14_t dw14;
3239 	uint32_t ndw;
3240 	boolean_t ret = B_FALSE;
3241 
3242 	bzero(&dw10, sizeof (dw10));
3243 	bzero(&dw11, sizeof (dw11));
3244 	bzero(&dw14, sizeof (dw14));
3245 
3246 	cmd->nc_sqid = 0;
3247 	cmd->nc_callback = nvme_wakeup_cmd;
3248 	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
3249 	cmd->nc_sqe.sqe_nsid = log->nigl_common.nioc_nsid;
3250 
3251 	if (user)
3252 		cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
3253 
3254 	/*
3255 	 * The size field is the number of double words, but is a zeros based
3256 	 * value. We need to store our actual value minus one.
3257 	 */
3258 	ndw = (uint32_t)(log->nigl_len / 4);
3259 	ASSERT3U(ndw, >, 0);
3260 	ndw--;
3261 
3262 	dw10.b.lp_lid = bitx32(log->nigl_lid, 7, 0);
3263 	dw10.b.lp_lsp = bitx32(log->nigl_lsp, 6, 0);
3264 	dw10.b.lp_rae = bitx32(log->nigl_lsp, 0, 0);
3265 	dw10.b.lp_lnumdl = bitx32(ndw, 15, 0);
3266 
3267 	dw11.b.lp_numdu = bitx32(ndw, 31, 16);
3268 	dw11.b.lp_lsi = bitx32(log->nigl_lsi, 15, 0);
3269 
3270 	offlo = bitx64(log->nigl_offset, 31, 0);
3271 	offhi = bitx64(log->nigl_offset, 63, 32);
3272 
3273 	dw14.b.lp_csi = bitx32(log->nigl_csi, 7, 0);
3274 
3275 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3276 	cmd->nc_sqe.sqe_cdw11 = dw11.r;
3277 	cmd->nc_sqe.sqe_cdw12 = offlo;
3278 	cmd->nc_sqe.sqe_cdw13 = offhi;
3279 	cmd->nc_sqe.sqe_cdw14 = dw14.r;
3280 
3281 	if (nvme_zalloc_dma(nvme, log->nigl_len, DDI_DMA_READ,
3282 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
3283 		dev_err(nvme->n_dip, CE_WARN,
3284 		    "!nvme_zalloc_dma failed for GET LOG PAGE");
3285 		ret = nvme_ioctl_error(&log->nigl_common,
3286 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
3287 		goto fail;
3288 	}
3289 
3290 	if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
3291 		ret = nvme_ioctl_error(&log->nigl_common,
3292 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
3293 		goto fail;
3294 	}
3295 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3296 
3297 	if (!nvme_check_cmd_status_ioctl(cmd, &log->nigl_common)) {
3298 		if (!user) {
3299 			dev_err(nvme->n_dip, CE_WARN,
3300 			    "!GET LOG PAGE failed with sct = %x, sc = %x",
3301 			    cmd->nc_cqe.cqe_sf.sf_sct,
3302 			    cmd->nc_cqe.cqe_sf.sf_sc);
3303 		}
3304 		ret = B_FALSE;
3305 		goto fail;
3306 	}
3307 
3308 	*buf = kmem_alloc(log->nigl_len, KM_SLEEP);
3309 	bcopy(cmd->nc_dma->nd_memp, *buf, log->nigl_len);
3310 
3311 	ret = B_TRUE;
3312 fail:
3313 	nvme_free_cmd(cmd);
3314 
3315 	return (ret);
3316 }
3317 
3318 /*
3319  * This is an internal wrapper for when the kernel wants to get a log page.
3320  * Currently this assumes that the only thing that is required is the log page
3321  * ID. If more information is required, we'll be better served to just use the
3322  * general ioctl interface.
3323  */
3324 static boolean_t
3325 nvme_get_logpage_int(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
3326     uint8_t lid)
3327 {
3328 	const nvme_log_page_info_t *info = NULL;
3329 	nvme_ioctl_get_logpage_t log;
3330 	nvme_valid_ctrl_data_t data;
3331 	boolean_t bret;
3332 	bool var;
3333 
3334 	for (size_t i = 0; i < nvme_std_log_npages; i++) {
3335 		if (nvme_std_log_pages[i].nlpi_lid == lid &&
3336 		    nvme_std_log_pages[i].nlpi_csi == NVME_CSI_NVM) {
3337 			info = &nvme_std_log_pages[i];
3338 			break;
3339 		}
3340 	}
3341 
3342 	if (info == NULL) {
3343 		return (B_FALSE);
3344 	}
3345 
3346 	data.vcd_vers = &nvme->n_version;
3347 	data.vcd_id = nvme->n_idctl;
3348 	bzero(&log, sizeof (log));
3349 	log.nigl_common.nioc_nsid = NVME_NSID_BCAST;
3350 	log.nigl_csi = info->nlpi_csi;
3351 	log.nigl_lid = info->nlpi_lid;
3352 	log.nigl_len = nvme_log_page_info_size(info, &data, &var);
3353 
3354 	/*
3355 	 * We only support getting standard fixed-length log pages through the
3356 	 * kernel interface at this time. If a log page either has an unknown
3357 	 * size or has a variable length, then we cannot get it.
3358 	 */
3359 	if (log.nigl_len == 0 || var) {
3360 		return (B_FALSE);
3361 	}
3362 
3363 	bret = nvme_get_logpage(nvme, user, &log, buf);
3364 	if (!bret) {
3365 		return (B_FALSE);
3366 	}
3367 
3368 	*bufsize = log.nigl_len;
3369 	return (B_TRUE);
3370 }
3371 
3372 static boolean_t
3373 nvme_identify(nvme_t *nvme, boolean_t user, nvme_ioctl_identify_t *ioc,
3374     void **buf)
3375 {
3376 	nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3377 	boolean_t ret = B_FALSE;
3378 	nvme_identify_dw10_t dw10;
3379 
3380 	ASSERT3P(buf, !=, NULL);
3381 
3382 	bzero(&dw10, sizeof (dw10));
3383 
3384 	cmd->nc_sqid = 0;
3385 	cmd->nc_callback = nvme_wakeup_cmd;
3386 	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
3387 	cmd->nc_sqe.sqe_nsid = ioc->nid_common.nioc_nsid;
3388 
3389 	dw10.b.id_cns = bitx32(ioc->nid_cns, 7, 0);
3390 	dw10.b.id_cntid = bitx32(ioc->nid_ctrlid, 15, 0);
3391 
3392 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3393 
3394 	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
3395 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
3396 		dev_err(nvme->n_dip, CE_WARN,
3397 		    "!nvme_zalloc_dma failed for IDENTIFY");
3398 		ret = nvme_ioctl_error(&ioc->nid_common,
3399 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
3400 		goto fail;
3401 	}
3402 
3403 	if (cmd->nc_dma->nd_ncookie > 2) {
3404 		dev_err(nvme->n_dip, CE_WARN,
3405 		    "!too many DMA cookies for IDENTIFY");
3406 		NVME_BUMP_STAT(nvme, too_many_cookies);
3407 		ret = nvme_ioctl_error(&ioc->nid_common,
3408 		    NVME_IOCTL_E_BAD_PRP, 0, 0);
3409 		goto fail;
3410 	}
3411 
3412 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
3413 	if (cmd->nc_dma->nd_ncookie > 1) {
3414 		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
3415 		    &cmd->nc_dma->nd_cookie);
3416 		cmd->nc_sqe.sqe_dptr.d_prp[1] =
3417 		    cmd->nc_dma->nd_cookie.dmac_laddress;
3418 	}
3419 
3420 	if (user)
3421 		cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
3422 
3423 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3424 
3425 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nid_common)) {
3426 		dev_err(nvme->n_dip, CE_WARN,
3427 		    "!IDENTIFY failed with sct = %x, sc = %x",
3428 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3429 		ret = B_FALSE;
3430 		goto fail;
3431 	}
3432 
3433 	*buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
3434 	bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
3435 	ret = B_TRUE;
3436 
3437 fail:
3438 	nvme_free_cmd(cmd);
3439 
3440 	return (ret);
3441 }
3442 
3443 static boolean_t
3444 nvme_identify_int(nvme_t *nvme, uint32_t nsid, uint8_t cns, void **buf)
3445 {
3446 	nvme_ioctl_identify_t id;
3447 
3448 	bzero(&id, sizeof (nvme_ioctl_identify_t));
3449 	id.nid_common.nioc_nsid = nsid;
3450 	id.nid_cns = cns;
3451 
3452 	return (nvme_identify(nvme, B_FALSE, &id, buf));
3453 }
3454 
3455 static int
3456 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
3457     uint32_t val, uint32_t *res)
3458 {
3459 	_NOTE(ARGUNUSED(nsid));
3460 	nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3461 	int ret = EINVAL;
3462 
3463 	ASSERT(res != NULL);
3464 
3465 	cmd->nc_sqid = 0;
3466 	cmd->nc_callback = nvme_wakeup_cmd;
3467 	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
3468 	cmd->nc_sqe.sqe_cdw10 = feature;
3469 	cmd->nc_sqe.sqe_cdw11 = val;
3470 
3471 	if (user)
3472 		cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
3473 
3474 	switch (feature) {
3475 	case NVME_FEAT_WRITE_CACHE:
3476 		if (!nvme->n_write_cache_present)
3477 			goto fail;
3478 		break;
3479 
3480 	case NVME_FEAT_NQUEUES:
3481 		break;
3482 
3483 	default:
3484 		goto fail;
3485 	}
3486 
3487 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3488 
3489 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3490 		dev_err(nvme->n_dip, CE_WARN,
3491 		    "!SET FEATURES %d failed with sct = %x, sc = %x",
3492 		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
3493 		    cmd->nc_cqe.cqe_sf.sf_sc);
3494 		goto fail;
3495 	}
3496 
3497 	*res = cmd->nc_cqe.cqe_dw0;
3498 
3499 fail:
3500 	nvme_free_cmd(cmd);
3501 	return (ret);
3502 }
3503 
3504 static int
3505 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
3506 {
3507 	nvme_write_cache_t nwc = { 0 };
3508 
3509 	if (enable)
3510 		nwc.b.wc_wce = 1;
3511 
3512 	/*
3513 	 * We've seen some cases where this fails due to us being told we've
3514 	 * specified an invalid namespace when operating against the Xen xcp-ng
3515 	 * qemu NVMe virtual device. As such, we generally ensure that trying to
3516 	 * enable this doesn't lead us to panic. It's not completely clear why
3517 	 * specifying namespace zero here fails, but not when we're setting the
3518 	 * number of queues below.
3519 	 */
3520 	return (nvme_set_features(nvme, B_TRUE, 0, NVME_FEAT_WRITE_CACHE,
3521 	    nwc.r, &nwc.r));
3522 }
3523 
3524 static int
3525 nvme_set_nqueues(nvme_t *nvme)
3526 {
3527 	nvme_nqueues_t nq = { 0 };
3528 	int ret;
3529 
3530 	/*
3531 	 * The default is to allocate one completion queue per vector.
3532 	 */
3533 	if (nvme->n_completion_queues == -1)
3534 		nvme->n_completion_queues = nvme->n_intr_cnt;
3535 
3536 	/*
3537 	 * There is no point in having more completion queues than
3538 	 * interrupt vectors.
3539 	 */
3540 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
3541 	    nvme->n_intr_cnt);
3542 
3543 	/*
3544 	 * The default is to use one submission queue per completion queue.
3545 	 */
3546 	if (nvme->n_submission_queues == -1)
3547 		nvme->n_submission_queues = nvme->n_completion_queues;
3548 
3549 	/*
3550 	 * There is no point in having more completion queues than
3551 	 * submission queues.
3552 	 */
3553 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
3554 	    nvme->n_submission_queues);
3555 
3556 	ASSERT(nvme->n_submission_queues > 0);
3557 	ASSERT(nvme->n_completion_queues > 0);
3558 
3559 	nq.b.nq_nsq = nvme->n_submission_queues - 1;
3560 	nq.b.nq_ncq = nvme->n_completion_queues - 1;
3561 
3562 	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
3563 	    &nq.r);
3564 
3565 	if (ret == 0) {
3566 		/*
3567 		 * Never use more than the requested number of queues.
3568 		 */
3569 		nvme->n_submission_queues = MIN(nvme->n_submission_queues,
3570 		    nq.b.nq_nsq + 1);
3571 		nvme->n_completion_queues = MIN(nvme->n_completion_queues,
3572 		    nq.b.nq_ncq + 1);
3573 	}
3574 
3575 	return (ret);
3576 }
3577 
3578 static int
3579 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
3580 {
3581 	nvme_cmd_t *cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3582 	nvme_create_queue_dw10_t dw10 = { 0 };
3583 	nvme_create_cq_dw11_t c_dw11 = { 0 };
3584 	int ret;
3585 
3586 	dw10.b.q_qid = cq->ncq_id;
3587 	dw10.b.q_qsize = cq->ncq_nentry - 1;
3588 
3589 	c_dw11.b.cq_pc = 1;
3590 	c_dw11.b.cq_ien = 1;
3591 	c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
3592 
3593 	cmd->nc_sqid = 0;
3594 	cmd->nc_callback = nvme_wakeup_cmd;
3595 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
3596 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3597 	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
3598 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
3599 
3600 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3601 
3602 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3603 		dev_err(nvme->n_dip, CE_WARN,
3604 		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
3605 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3606 	}
3607 
3608 	nvme_free_cmd(cmd);
3609 
3610 	return (ret);
3611 }
3612 
3613 static int
3614 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
3615 {
3616 	nvme_cq_t *cq = qp->nq_cq;
3617 	nvme_cmd_t *cmd;
3618 	nvme_create_queue_dw10_t dw10 = { 0 };
3619 	nvme_create_sq_dw11_t s_dw11 = { 0 };
3620 	int ret;
3621 
3622 	/*
3623 	 * It is possible to have more qpairs than completion queues,
3624 	 * and when the idx > ncq_id, that completion queue is shared
3625 	 * and has already been created.
3626 	 */
3627 	if (idx <= cq->ncq_id &&
3628 	    nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
3629 		return (DDI_FAILURE);
3630 
3631 	dw10.b.q_qid = idx;
3632 	dw10.b.q_qsize = qp->nq_nentry - 1;
3633 
3634 	s_dw11.b.sq_pc = 1;
3635 	s_dw11.b.sq_cqid = cq->ncq_id;
3636 
3637 	cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
3638 	cmd->nc_sqid = 0;
3639 	cmd->nc_callback = nvme_wakeup_cmd;
3640 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
3641 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3642 	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
3643 	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
3644 
3645 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3646 
3647 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3648 		dev_err(nvme->n_dip, CE_WARN,
3649 		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
3650 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3651 	}
3652 
3653 	nvme_free_cmd(cmd);
3654 
3655 	return (ret);
3656 }
3657 
3658 static boolean_t
3659 nvme_reset(nvme_t *nvme, boolean_t quiesce)
3660 {
3661 	nvme_reg_csts_t csts;
3662 	int i;
3663 
3664 	/*
3665 	 * If the device is gone, do not try to interact with it.  We define
3666 	 * that resetting such a device is impossible, and always fails.
3667 	 */
3668 	if (nvme_ctrl_is_gone(nvme)) {
3669 		return (B_FALSE);
3670 	}
3671 
3672 	nvme_put32(nvme, NVME_REG_CC, 0);
3673 
3674 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3675 	if (csts.b.csts_rdy == 1) {
3676 		nvme_put32(nvme, NVME_REG_CC, 0);
3677 
3678 		/*
3679 		 * The timeout value is from the Controller Capabilities
3680 		 * register (CAP.TO, section 3.1.1). This is the worst case
3681 		 * time to wait for CSTS.RDY to transition from 1 to 0 after
3682 		 * CC.EN transitions from 1 to 0.
3683 		 *
3684 		 * The timeout units are in 500 ms units, and we are delaying
3685 		 * in 50ms chunks, hence counting to n_timeout * 10.
3686 		 */
3687 		for (i = 0; i < nvme->n_timeout * 10; i++) {
3688 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3689 			if (csts.b.csts_rdy == 0)
3690 				break;
3691 
3692 			/*
3693 			 * Quiescing drivers should not use locks or timeouts,
3694 			 * so if this is the quiesce path, use a quiesce-safe
3695 			 * delay.
3696 			 */
3697 			if (quiesce) {
3698 				drv_usecwait(50000);
3699 			} else {
3700 				delay(drv_usectohz(50000));
3701 			}
3702 		}
3703 	}
3704 
3705 	nvme_put32(nvme, NVME_REG_AQA, 0);
3706 	nvme_put32(nvme, NVME_REG_ASQ, 0);
3707 	nvme_put32(nvme, NVME_REG_ACQ, 0);
3708 
3709 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3710 	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
3711 }
3712 
3713 static void
3714 nvme_shutdown(nvme_t *nvme, boolean_t quiesce)
3715 {
3716 	nvme_reg_cc_t cc;
3717 	nvme_reg_csts_t csts;
3718 	int i;
3719 
3720 	/*
3721 	 * Do not try to interact with the device if it is gone.  Since it is
3722 	 * not there, in some sense it must already be shut down anyway.
3723 	 */
3724 	if (nvme_ctrl_is_gone(nvme)) {
3725 		return;
3726 	}
3727 
3728 	cc.r = nvme_get32(nvme, NVME_REG_CC);
3729 	cc.b.cc_shn = NVME_CC_SHN_NORMAL;
3730 	nvme_put32(nvme, NVME_REG_CC, cc.r);
3731 
3732 	for (i = 0; i < 10; i++) {
3733 		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3734 		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
3735 			break;
3736 
3737 		if (quiesce) {
3738 			drv_usecwait(100000);
3739 		} else {
3740 			delay(drv_usectohz(100000));
3741 		}
3742 	}
3743 }
3744 
3745 /*
3746  * Return length of string without trailing spaces.
3747  */
3748 static size_t
3749 nvme_strlen(const char *str, size_t len)
3750 {
3751 	if (len <= 0)
3752 		return (0);
3753 
3754 	while (str[--len] == ' ')
3755 		;
3756 
3757 	return (++len);
3758 }
3759 
3760 static void
3761 nvme_config_min_block_size(nvme_t *nvme, char *model, char *val)
3762 {
3763 	ulong_t bsize = 0;
3764 	char *msg = "";
3765 
3766 	if (ddi_strtoul(val, NULL, 0, &bsize) != 0)
3767 		goto err;
3768 
3769 	if (!ISP2(bsize)) {
3770 		msg = ": not a power of 2";
3771 		goto err;
3772 	}
3773 
3774 	if (bsize < NVME_DEFAULT_MIN_BLOCK_SIZE) {
3775 		msg = ": too low";
3776 		goto err;
3777 	}
3778 
3779 	nvme->n_min_block_size = bsize;
3780 	return;
3781 
3782 err:
3783 	dev_err(nvme->n_dip, CE_WARN,
3784 	    "!nvme-config-list: ignoring invalid min-phys-block-size '%s' "
3785 	    "for model '%s'%s", val, model, msg);
3786 
3787 	nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3788 }
3789 
3790 static void
3791 nvme_config_boolean(nvme_t *nvme, char *model, char *name, char *val,
3792     boolean_t *b)
3793 {
3794 	if (strcmp(val, "on") == 0 ||
3795 	    strcmp(val, "true") == 0)
3796 		*b = B_TRUE;
3797 	else if (strcmp(val, "off") == 0 ||
3798 	    strcmp(val, "false") == 0)
3799 		*b = B_FALSE;
3800 	else
3801 		dev_err(nvme->n_dip, CE_WARN,
3802 		    "!nvme-config-list: invalid value for %s '%s'"
3803 		    " for model '%s', ignoring", name, val, model);
3804 }
3805 
3806 static void
3807 nvme_config_list(nvme_t *nvme)
3808 {
3809 	char	**config_list;
3810 	uint_t	nelem;
3811 	int	rv;
3812 
3813 	/*
3814 	 * We're following the pattern of 'sd-config-list' here, but extend it.
3815 	 * Instead of two we have three separate strings for "model", "fwrev",
3816 	 * and "name-value-list".
3817 	 */
3818 	rv = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, nvme->n_dip,
3819 	    DDI_PROP_DONTPASS, "nvme-config-list", &config_list, &nelem);
3820 
3821 	if (rv != DDI_PROP_SUCCESS) {
3822 		if (rv == DDI_PROP_CANNOT_DECODE) {
3823 			dev_err(nvme->n_dip, CE_WARN,
3824 			    "!nvme-config-list: cannot be decoded");
3825 		}
3826 
3827 		return;
3828 	}
3829 
3830 	if ((nelem % 3) != 0) {
3831 		dev_err(nvme->n_dip, CE_WARN, "!nvme-config-list: must be "
3832 		    "triplets of <model>/<fwrev>/<name-value-list> strings ");
3833 		goto out;
3834 	}
3835 
3836 	for (uint_t i = 0; i < nelem; i += 3) {
3837 		char	*model = config_list[i];
3838 		char	*fwrev = config_list[i + 1];
3839 		char	*nvp, *save_nv;
3840 		size_t	id_model_len, id_fwrev_len;
3841 
3842 		id_model_len = nvme_strlen(nvme->n_idctl->id_model,
3843 		    sizeof (nvme->n_idctl->id_model));
3844 
3845 		if (strlen(model) != id_model_len)
3846 			continue;
3847 
3848 		if (strncmp(model, nvme->n_idctl->id_model, id_model_len) != 0)
3849 			continue;
3850 
3851 		id_fwrev_len = nvme_strlen(nvme->n_idctl->id_fwrev,
3852 		    sizeof (nvme->n_idctl->id_fwrev));
3853 
3854 		if (strlen(fwrev) != 0) {
3855 			boolean_t match = B_FALSE;
3856 			char *fwr, *last_fw;
3857 
3858 			for (fwr = strtok_r(fwrev, ",", &last_fw);
3859 			    fwr != NULL;
3860 			    fwr = strtok_r(NULL, ",", &last_fw)) {
3861 				if (strlen(fwr) != id_fwrev_len)
3862 					continue;
3863 
3864 				if (strncmp(fwr, nvme->n_idctl->id_fwrev,
3865 				    id_fwrev_len) == 0)
3866 					match = B_TRUE;
3867 			}
3868 
3869 			if (!match)
3870 				continue;
3871 		}
3872 
3873 		/*
3874 		 * We should now have a comma-separated list of name:value
3875 		 * pairs.
3876 		 */
3877 		for (nvp = strtok_r(config_list[i + 2], ",", &save_nv);
3878 		    nvp != NULL; nvp = strtok_r(NULL, ",", &save_nv)) {
3879 			char	*name = nvp;
3880 			char	*val = strchr(nvp, ':');
3881 
3882 			if (val == NULL || name == val) {
3883 				dev_err(nvme->n_dip, CE_WARN,
3884 				    "!nvme-config-list: <name-value-list> "
3885 				    "for model '%s' is malformed", model);
3886 				goto out;
3887 			}
3888 
3889 			/*
3890 			 * Null-terminate 'name', move 'val' past ':' sep.
3891 			 */
3892 			*val++ = '\0';
3893 
3894 			/*
3895 			 * Process the name:val pairs that we know about.
3896 			 */
3897 			if (strcmp(name, "ignore-unknown-vendor-status") == 0) {
3898 				nvme_config_boolean(nvme, model, name, val,
3899 				    &nvme->n_ignore_unknown_vendor_status);
3900 			} else if (strcmp(name, "min-phys-block-size") == 0) {
3901 				nvme_config_min_block_size(nvme, model, val);
3902 			} else if (strcmp(name, "volatile-write-cache") == 0) {
3903 				nvme_config_boolean(nvme, model, name, val,
3904 				    &nvme->n_write_cache_enabled);
3905 			} else {
3906 				/*
3907 				 * Unknown 'name'.
3908 				 */
3909 				dev_err(nvme->n_dip, CE_WARN,
3910 				    "!nvme-config-list: unknown config '%s' "
3911 				    "for model '%s', ignoring", name, model);
3912 			}
3913 		}
3914 	}
3915 
3916 out:
3917 	ddi_prop_free(config_list);
3918 }
3919 
3920 static void
3921 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
3922 {
3923 	/*
3924 	 * Section 7.7 of the spec describes how to get a unique ID for
3925 	 * the controller: the vendor ID, the model name and the serial
3926 	 * number shall be unique when combined.
3927 	 *
3928 	 * If a namespace has no EUI64 we use the above and add the hex
3929 	 * namespace ID to get a unique ID for the namespace.
3930 	 */
3931 	char model[sizeof (nvme->n_idctl->id_model) + 1];
3932 	char serial[sizeof (nvme->n_idctl->id_serial) + 1];
3933 
3934 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
3935 	bcopy(nvme->n_idctl->id_serial, serial,
3936 	    sizeof (nvme->n_idctl->id_serial));
3937 
3938 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
3939 	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
3940 
3941 	nvme_nsid2ns(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X",
3942 	    nvme->n_idctl->id_vid, model, serial, nsid);
3943 }
3944 
3945 static nvme_identify_nsid_list_t *
3946 nvme_update_nsid_list(nvme_t *nvme, int cns)
3947 {
3948 	nvme_identify_nsid_list_t *nslist;
3949 
3950 	/*
3951 	 * We currently don't handle cases where there are more than
3952 	 * 1024 active namespaces, requiring several IDENTIFY commands.
3953 	 */
3954 	if (nvme_identify_int(nvme, 0, cns, (void **)&nslist))
3955 		return (nslist);
3956 
3957 	return (NULL);
3958 }
3959 
3960 nvme_namespace_t *
3961 nvme_nsid2ns(nvme_t *nvme, uint32_t nsid)
3962 {
3963 	ASSERT3U(nsid, !=, 0);
3964 	ASSERT3U(nsid, <=, nvme->n_namespace_count);
3965 	return (&nvme->n_ns[nsid - 1]);
3966 }
3967 
3968 static boolean_t
3969 nvme_allocated_ns(nvme_namespace_t *ns)
3970 {
3971 	nvme_t *nvme = ns->ns_nvme;
3972 	uint32_t i;
3973 
3974 	ASSERT(nvme_mgmt_lock_held(nvme));
3975 
3976 	/*
3977 	 * If supported, update the list of allocated namespace IDs.
3978 	 */
3979 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2) &&
3980 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
3981 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
3982 		    NVME_IDENTIFY_NSID_ALLOC_LIST);
3983 		boolean_t found = B_FALSE;
3984 
3985 		/*
3986 		 * When namespace management is supported, this really shouldn't
3987 		 * be NULL. Treat all namespaces as allocated if it is.
3988 		 */
3989 		if (nslist == NULL)
3990 			return (B_TRUE);
3991 
3992 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
3993 			if (ns->ns_id == 0)
3994 				break;
3995 
3996 			if (ns->ns_id == nslist->nl_nsid[i])
3997 				found = B_TRUE;
3998 		}
3999 
4000 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
4001 		return (found);
4002 	} else {
4003 		/*
4004 		 * If namespace management isn't supported, report all
4005 		 * namespaces as allocated.
4006 		 */
4007 		return (B_TRUE);
4008 	}
4009 }
4010 
4011 static boolean_t
4012 nvme_active_ns(nvme_namespace_t *ns)
4013 {
4014 	nvme_t *nvme = ns->ns_nvme;
4015 	uint64_t *ptr;
4016 	uint32_t i;
4017 
4018 	ASSERT(nvme_mgmt_lock_held(nvme));
4019 
4020 	/*
4021 	 * If supported, update the list of active namespace IDs.
4022 	 */
4023 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) {
4024 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
4025 		    NVME_IDENTIFY_NSID_LIST);
4026 		boolean_t found = B_FALSE;
4027 
4028 		/*
4029 		 * When namespace management is supported, this really shouldn't
4030 		 * be NULL. Treat all namespaces as allocated if it is.
4031 		 */
4032 		if (nslist == NULL)
4033 			return (B_TRUE);
4034 
4035 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
4036 			if (ns->ns_id == 0)
4037 				break;
4038 
4039 			if (ns->ns_id == nslist->nl_nsid[i])
4040 				found = B_TRUE;
4041 		}
4042 
4043 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
4044 		return (found);
4045 	}
4046 
4047 	/*
4048 	 * Workaround for revision 1.0:
4049 	 * Check whether the IDENTIFY NAMESPACE data is zero-filled.
4050 	 */
4051 	for (ptr = (uint64_t *)ns->ns_idns;
4052 	    ptr != (uint64_t *)(ns->ns_idns + 1);
4053 	    ptr++) {
4054 		if (*ptr != 0) {
4055 			return (B_TRUE);
4056 		}
4057 	}
4058 
4059 	return (B_FALSE);
4060 }
4061 
4062 static int
4063 nvme_init_ns(nvme_t *nvme, uint32_t nsid)
4064 {
4065 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
4066 	nvme_identify_nsid_t *idns;
4067 	nvme_ns_state_t orig_state;
4068 
4069 	ns->ns_nvme = nvme;
4070 
4071 	ASSERT(nvme_mgmt_lock_held(nvme));
4072 
4073 	/*
4074 	 * Because we might rescan a namespace and this will fail after boot
4075 	 * that'd leave us in a bad spot. We need to do something about this
4076 	 * longer term, but it's not clear how exactly we would recover right
4077 	 * now.
4078 	 */
4079 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
4080 	    (void **)&idns)) {
4081 		dev_err(nvme->n_dip, CE_WARN,
4082 		    "!failed to identify namespace %d", nsid);
4083 		return (DDI_FAILURE);
4084 	}
4085 
4086 	if (ns->ns_idns != NULL)
4087 		kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t));
4088 
4089 	ns->ns_idns = idns;
4090 	ns->ns_id = nsid;
4091 
4092 	/*
4093 	 * Save the current state so we can tell what changed. Look at the
4094 	 * current state of the device. We will flag active devices that should
4095 	 * be ignored after this.
4096 	 */
4097 	orig_state = ns->ns_state;
4098 	if (nvme_active_ns(ns)) {
4099 		/*
4100 		 * If the device previously had blkdev active, then that is its
4101 		 * current state. Otherwise, we consider this an upgrade and
4102 		 * just set it to not ignored.
4103 		 */
4104 		if (orig_state == NVME_NS_STATE_ATTACHED) {
4105 			ns->ns_state = NVME_NS_STATE_ATTACHED;
4106 		} else {
4107 			ns->ns_state = NVME_NS_STATE_NOT_IGNORED;
4108 		}
4109 	} else if (nvme_allocated_ns(ns)) {
4110 		ns->ns_state = NVME_NS_STATE_ALLOCATED;
4111 	} else {
4112 		ns->ns_state = NVME_NS_STATE_UNALLOCATED;
4113 	}
4114 
4115 	ns->ns_block_count = idns->id_nsize;
4116 	ns->ns_block_size =
4117 	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
4118 	ns->ns_best_block_size = ns->ns_block_size;
4119 
4120 	/*
4121 	 * Get the EUI64 if present.
4122 	 */
4123 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
4124 		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
4125 
4126 	/*
4127 	 * Get the NGUID if present.
4128 	 */
4129 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2))
4130 		bcopy(idns->id_nguid, ns->ns_nguid, sizeof (ns->ns_nguid));
4131 
4132 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
4133 	if (*(uint64_t *)ns->ns_eui64 == 0)
4134 		nvme_prepare_devid(nvme, ns->ns_id);
4135 
4136 	(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%u", ns->ns_id);
4137 
4138 	/*
4139 	 * Find the LBA format with no metadata and the best relative
4140 	 * performance. A value of 3 means "degraded", 0 is best.
4141 	 */
4142 	for (uint32_t j = 0, last_rp = 3; j <= idns->id_nlbaf; j++) {
4143 		if (idns->id_lbaf[j].lbaf_lbads == 0)
4144 			break;
4145 		if (idns->id_lbaf[j].lbaf_ms != 0)
4146 			continue;
4147 		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
4148 			continue;
4149 		last_rp = idns->id_lbaf[j].lbaf_rp;
4150 		ns->ns_best_block_size =
4151 		    1 << idns->id_lbaf[j].lbaf_lbads;
4152 	}
4153 
4154 	if (ns->ns_best_block_size < nvme->n_min_block_size)
4155 		ns->ns_best_block_size = nvme->n_min_block_size;
4156 
4157 	/*
4158 	 * We currently don't support namespaces that are inactive, or use
4159 	 * either:
4160 	 * - protection information
4161 	 * - illegal block size (< 512)
4162 	 */
4163 	if (ns->ns_state >= NVME_NS_STATE_NOT_IGNORED) {
4164 		if (idns->id_dps.dp_pinfo) {
4165 			dev_err(nvme->n_dip, CE_WARN,
4166 			    "!ignoring namespace %d, unsupported feature: "
4167 			    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
4168 			ns->ns_state = NVME_NS_STATE_ACTIVE;
4169 		}
4170 
4171 		if (ns->ns_block_size < 512) {
4172 			dev_err(nvme->n_dip, CE_WARN,
4173 			    "!ignoring namespace %d, unsupported block size "
4174 			    "%"PRIu64, nsid, (uint64_t)ns->ns_block_size);
4175 			ns->ns_state = NVME_NS_STATE_ACTIVE;
4176 		}
4177 	}
4178 
4179 	/*
4180 	 * If we were previously in a state where blkdev was active and suddenly
4181 	 * we think it should not be because ignore is set, then something has
4182 	 * gone behind our backs and this is not going to be recoverable.
4183 	 */
4184 	if (orig_state == NVME_NS_STATE_ATTACHED &&
4185 	    ns->ns_state != NVME_NS_STATE_ATTACHED) {
4186 		dev_err(nvme->n_dip, CE_PANIC, "namespace %u state "
4187 		    "unexpectedly changed and removed blkdev support!", nsid);
4188 	}
4189 
4190 	/*
4191 	 * Keep a count of namespaces which are attachable.
4192 	 * See comments in nvme_bd_driveinfo() to understand its effect.
4193 	 */
4194 	if (orig_state > NVME_NS_STATE_ACTIVE) {
4195 		/*
4196 		 * Wasn't attachable previously, but now needs to be.
4197 		 * Discount it.
4198 		 */
4199 		if (ns->ns_state < NVME_NS_STATE_NOT_IGNORED)
4200 			nvme->n_namespaces_attachable--;
4201 	} else if (ns->ns_state >= NVME_NS_STATE_NOT_IGNORED) {
4202 		/*
4203 		 * Previously ignored, but now not. Count it.
4204 		 */
4205 		nvme->n_namespaces_attachable++;
4206 	}
4207 
4208 	return (DDI_SUCCESS);
4209 }
4210 
4211 static boolean_t
4212 nvme_bd_attach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
4213 {
4214 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
4215 	int ret;
4216 
4217 	ASSERT(nvme_mgmt_lock_held(nvme));
4218 
4219 	if (!nvme_ns_state_check(ns, com, nvme_bd_attach_states)) {
4220 		return (B_FALSE);
4221 	}
4222 
4223 	if (ns->ns_bd_hdl == NULL) {
4224 		bd_ops_t ops = nvme_bd_ops;
4225 
4226 		if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
4227 			ops.o_free_space = NULL;
4228 
4229 		ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr,
4230 		    KM_SLEEP);
4231 
4232 		if (ns->ns_bd_hdl == NULL) {
4233 			dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev "
4234 			    "handle for namespace id %u", com->nioc_nsid);
4235 			return (nvme_ioctl_error(com,
4236 			    NVME_IOCTL_E_BLKDEV_ATTACH, 0, 0));
4237 		}
4238 	}
4239 
4240 	nvme_mgmt_bd_start(nvme);
4241 	ret = bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl);
4242 	nvme_mgmt_bd_end(nvme);
4243 	if (ret != DDI_SUCCESS) {
4244 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_ATTACH,
4245 		    0, 0));
4246 	}
4247 
4248 	ns->ns_state = NVME_NS_STATE_ATTACHED;
4249 
4250 	return (B_TRUE);
4251 }
4252 
4253 static boolean_t
4254 nvme_bd_detach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
4255 {
4256 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
4257 	int ret;
4258 
4259 	ASSERT(nvme_mgmt_lock_held(nvme));
4260 
4261 	if (!nvme_ns_state_check(ns, com, nvme_bd_detach_states)) {
4262 		return (B_FALSE);
4263 	}
4264 
4265 	nvme_mgmt_bd_start(nvme);
4266 	ASSERT3P(ns->ns_bd_hdl, !=, NULL);
4267 	ret = bd_detach_handle(ns->ns_bd_hdl);
4268 	nvme_mgmt_bd_end(nvme);
4269 
4270 	if (ret != DDI_SUCCESS) {
4271 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_DETACH, 0,
4272 		    0));
4273 	}
4274 
4275 	ns->ns_state = NVME_NS_STATE_NOT_IGNORED;
4276 	return (B_TRUE);
4277 
4278 }
4279 
4280 /*
4281  * Rescan the namespace information associated with the namespaces indicated by
4282  * ioc. They should not be attached to blkdev right now.
4283  */
4284 static void
4285 nvme_rescan_ns(nvme_t *nvme, uint32_t nsid)
4286 {
4287 	ASSERT(nvme_mgmt_lock_held(nvme));
4288 	ASSERT3U(nsid, !=, 0);
4289 
4290 	if (nsid != NVME_NSID_BCAST) {
4291 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
4292 
4293 		ASSERT3U(ns->ns_state, <, NVME_NS_STATE_ATTACHED);
4294 		(void) nvme_init_ns(nvme, nsid);
4295 		return;
4296 	}
4297 
4298 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4299 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
4300 
4301 		ASSERT3U(ns->ns_state, <, NVME_NS_STATE_ATTACHED);
4302 		(void) nvme_init_ns(nvme, i);
4303 	}
4304 }
4305 
4306 typedef struct nvme_quirk_table {
4307 	uint16_t nq_vendor_id;
4308 	uint16_t nq_device_id;
4309 	nvme_quirk_t nq_quirks;
4310 } nvme_quirk_table_t;
4311 
4312 static const nvme_quirk_table_t nvme_quirks[] = {
4313 	{ 0x1987, 0x5018, NVME_QUIRK_START_CID },	/* Phison E18 */
4314 };
4315 
4316 static void
4317 nvme_detect_quirks(nvme_t *nvme)
4318 {
4319 	for (uint_t i = 0; i < ARRAY_SIZE(nvme_quirks); i++) {
4320 		const nvme_quirk_table_t *nqt = &nvme_quirks[i];
4321 
4322 		if (nqt->nq_vendor_id == nvme->n_vendor_id &&
4323 		    nqt->nq_device_id == nvme->n_device_id) {
4324 			nvme->n_quirks = nqt->nq_quirks;
4325 			return;
4326 		}
4327 	}
4328 }
4329 
4330 /*
4331  * Indicate to the controller that we support various behaviors. These are
4332  * things the controller needs to be proactively told. We only will do this if
4333  * the controller indicates support for something that we care about, otherwise
4334  * there is no need to talk to the controller and there is no separate way to
4335  * know that this feature is otherwise supported. Support for most features is
4336  * indicated by setting it to 1.
4337  *
4338  * The current behaviors we enable are:
4339  *
4340  *  - Extended Telemetry Data Area 4: This enables additional telemetry to be
4341  *    possibly generated and depends on the DA4S bit in the log page attributes.
4342  */
4343 static void
4344 nvme_enable_host_behavior(nvme_t *nvme)
4345 {
4346 	nvme_host_behavior_t *hb;
4347 	nvme_ioc_cmd_args_t args = { NULL };
4348 	nvme_sqe_t sqe = {
4349 		.sqe_opc = NVME_OPC_SET_FEATURES,
4350 		.sqe_cdw10 = NVME_FEAT_HOST_BEHAVE,
4351 		.sqe_nsid = 0
4352 	};
4353 	nvme_ioctl_common_t err;
4354 
4355 	if (nvme->n_idctl->id_lpa.lp_da4s == 0)
4356 		return;
4357 
4358 	hb = kmem_zalloc(sizeof (nvme_host_behavior_t), KM_SLEEP);
4359 	hb->nhb_etdas = 1;
4360 
4361 	args.ica_sqe = &sqe;
4362 	args.ica_data = hb;
4363 	args.ica_data_len = sizeof (nvme_host_behavior_t);
4364 	args.ica_dma_flags = DDI_DMA_WRITE;
4365 	args.ica_copy_flags = FKIOCTL;
4366 	args.ica_timeout = nvme_admin_cmd_timeout;
4367 
4368 	if (!nvme_ioc_cmd(nvme, &err, &args)) {
4369 		dev_err(nvme->n_dip, CE_WARN, "failed to enable host behavior "
4370 		    "feature: 0x%x/0x%x/0x%x", err.nioc_drv_err,
4371 		    err.nioc_ctrl_sct, err.nioc_ctrl_sc);
4372 	}
4373 
4374 	kmem_free(hb, sizeof (nvme_host_behavior_t));
4375 }
4376 
4377 static int
4378 nvme_init(nvme_t *nvme)
4379 {
4380 	nvme_reg_cc_t cc = { 0 };
4381 	nvme_reg_aqa_t aqa = { 0 };
4382 	nvme_reg_asq_t asq = { 0 };
4383 	nvme_reg_acq_t acq = { 0 };
4384 	nvme_reg_cap_t cap;
4385 	nvme_reg_vs_t vs;
4386 	nvme_reg_csts_t csts;
4387 	int i = 0;
4388 	uint16_t nqueues;
4389 	uint_t tq_threads;
4390 	char model[sizeof (nvme->n_idctl->id_model) + 1];
4391 	char *vendor, *product;
4392 	uint32_t nsid;
4393 
4394 	/* Check controller version */
4395 	vs.r = nvme_get32(nvme, NVME_REG_VS);
4396 	nvme->n_version.v_major = vs.b.vs_mjr;
4397 	nvme->n_version.v_minor = vs.b.vs_mnr;
4398 	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d\n",
4399 	    nvme->n_version.v_major, nvme->n_version.v_minor);
4400 
4401 	if (nvme->n_version.v_major > nvme_version_major) {
4402 		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
4403 		    nvme_version_major);
4404 		if (nvme->n_strict_version)
4405 			goto fail;
4406 	}
4407 
4408 	/* retrieve controller configuration */
4409 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
4410 
4411 	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
4412 		dev_err(nvme->n_dip, CE_WARN,
4413 		    "!NVM command set not supported by hardware");
4414 		goto fail;
4415 	}
4416 
4417 	nvme->n_nssr_supported = cap.b.cap_nssrs;
4418 	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
4419 	nvme->n_timeout = cap.b.cap_to;
4420 	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
4421 	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
4422 	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
4423 
4424 	/*
4425 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
4426 	 * the base page size of 4k (1<<12), so add 12 here to get the real
4427 	 * page size value.
4428 	 */
4429 	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
4430 	    cap.b.cap_mpsmax + 12);
4431 	nvme->n_pagesize = 1UL << (nvme->n_pageshift);
4432 
4433 	/*
4434 	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
4435 	 */
4436 	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
4437 	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
4438 
4439 	/*
4440 	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
4441 	 * Maxxfer may be increased after we identified the controller limits.
4442 	 */
4443 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
4444 	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
4445 	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
4446 	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;
4447 
4448 	/*
4449 	 * Reset controller if it's still in ready state.
4450 	 */
4451 	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
4452 		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
4453 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
4454 		nvme->n_dead = B_TRUE;
4455 		goto fail;
4456 	}
4457 
4458 	/*
4459 	 * Create the cq array with one completion queue to be assigned
4460 	 * to the admin queue pair and a limited number of taskqs (4).
4461 	 */
4462 	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
4463 	    DDI_SUCCESS) {
4464 		dev_err(nvme->n_dip, CE_WARN,
4465 		    "!failed to pre-allocate admin completion queue");
4466 		goto fail;
4467 	}
4468 	/*
4469 	 * Create the admin queue pair.
4470 	 */
4471 	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
4472 	    != DDI_SUCCESS) {
4473 		dev_err(nvme->n_dip, CE_WARN,
4474 		    "!unable to allocate admin qpair");
4475 		goto fail;
4476 	}
4477 	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
4478 	nvme->n_ioq[0] = nvme->n_adminq;
4479 
4480 	if (nvme->n_quirks & NVME_QUIRK_START_CID)
4481 		nvme->n_adminq->nq_next_cmd++;
4482 
4483 	nvme->n_progress |= NVME_ADMIN_QUEUE;
4484 
4485 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
4486 	    "admin-queue-len", nvme->n_admin_queue_len);
4487 
4488 	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
4489 	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
4490 	acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
4491 
4492 	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
4493 	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
4494 
4495 	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
4496 	nvme_put64(nvme, NVME_REG_ASQ, asq);
4497 	nvme_put64(nvme, NVME_REG_ACQ, acq);
4498 
4499 	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
4500 	cc.b.cc_css = 0;	/* use NVM command set */
4501 	cc.b.cc_mps = nvme->n_pageshift - 12;
4502 	cc.b.cc_shn = 0;	/* no shutdown in progress */
4503 	cc.b.cc_en = 1;		/* enable controller */
4504 	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
4505 	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */
4506 
4507 	nvme_put32(nvme, NVME_REG_CC, cc.r);
4508 
4509 	/*
4510 	 * Wait for the controller to become ready.
4511 	 */
4512 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
4513 	if (csts.b.csts_rdy == 0) {
4514 		for (i = 0; i != nvme->n_timeout * 10; i++) {
4515 			delay(drv_usectohz(50000));
4516 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
4517 
4518 			if (csts.b.csts_cfs == 1) {
4519 				dev_err(nvme->n_dip, CE_WARN,
4520 				    "!controller fatal status at init");
4521 				ddi_fm_service_impact(nvme->n_dip,
4522 				    DDI_SERVICE_LOST);
4523 				nvme->n_dead = B_TRUE;
4524 				goto fail;
4525 			}
4526 
4527 			if (csts.b.csts_rdy == 1)
4528 				break;
4529 		}
4530 	}
4531 
4532 	if (csts.b.csts_rdy == 0) {
4533 		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
4534 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
4535 		nvme->n_dead = B_TRUE;
4536 		goto fail;
4537 	}
4538 
4539 	/*
4540 	 * Assume an abort command limit of 1. We'll destroy and re-init
4541 	 * that later when we know the true abort command limit.
4542 	 */
4543 	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
4544 
4545 	/*
4546 	 * Set up initial interrupt for admin queue.
4547 	 */
4548 	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
4549 	    != DDI_SUCCESS) &&
4550 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
4551 	    != DDI_SUCCESS) &&
4552 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
4553 	    != DDI_SUCCESS)) {
4554 		dev_err(nvme->n_dip, CE_WARN,
4555 		    "!failed to set up initial interrupt");
4556 		goto fail;
4557 	}
4558 
4559 	/*
4560 	 * Initialize the failure status we should use if we mark the controller
4561 	 * dead. Do this ahead of issuing any commands.
4562 	 */
4563 	nvme->n_dead_status = NVME_IOCTL_E_CTRL_DEAD;
4564 
4565 	/*
4566 	 * Identify Controller
4567 	 */
4568 	if (!nvme_identify_int(nvme, 0, NVME_IDENTIFY_CTRL,
4569 	    (void **)&nvme->n_idctl)) {
4570 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify controller");
4571 		goto fail;
4572 	}
4573 
4574 	/*
4575 	 * Process nvme-config-list (if present) in nvme.conf.
4576 	 */
4577 	nvme_config_list(nvme);
4578 
4579 	/*
4580 	 * Get Vendor & Product ID
4581 	 */
4582 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
4583 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
4584 	sata_split_model(model, &vendor, &product);
4585 
4586 	if (vendor == NULL)
4587 		nvme->n_vendor = strdup("NVMe");
4588 	else
4589 		nvme->n_vendor = strdup(vendor);
4590 
4591 	nvme->n_product = strdup(product);
4592 
4593 	/*
4594 	 * Get controller limits.
4595 	 */
4596 	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
4597 	    MIN(nvme->n_admin_queue_len / 10,
4598 	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
4599 
4600 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
4601 	    "async-event-limit", nvme->n_async_event_limit);
4602 
4603 	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
4604 
4605 	/*
4606 	 * Reinitialize the semaphore with the true abort command limit
4607 	 * supported by the hardware. It's not necessary to disable interrupts
4608 	 * as only command aborts use the semaphore, and no commands are
4609 	 * executed or aborted while we're here.
4610 	 */
4611 	sema_destroy(&nvme->n_abort_sema);
4612 	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
4613 	    SEMA_DRIVER, NULL);
4614 
4615 	nvme->n_progress |= NVME_CTRL_LIMITS;
4616 
4617 	if (nvme->n_idctl->id_mdts == 0)
4618 		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
4619 	else
4620 		nvme->n_max_data_transfer_size =
4621 		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
4622 
4623 	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
4624 
4625 	/*
4626 	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
4627 	 * Chained PRPs are currently unsupported.
4628 	 *
4629 	 * This is a no-op on hardware which doesn't support a transfer size
4630 	 * big enough to require chained PRPs.
4631 	 */
4632 	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
4633 	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
4634 
4635 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
4636 
4637 	/*
4638 	 * Make sure the minimum/maximum queue entry sizes are not
4639 	 * larger/smaller than the default.
4640 	 */
4641 
4642 	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
4643 	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
4644 	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
4645 	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
4646 		goto fail;
4647 
4648 	/*
4649 	 * Check for the presence of a Volatile Write Cache. If present,
4650 	 * enable or disable based on the value of the property
4651 	 * volatile-write-cache-enable (default is enabled).
4652 	 */
4653 	nvme->n_write_cache_present =
4654 	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
4655 
4656 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
4657 	    "volatile-write-cache-present",
4658 	    nvme->n_write_cache_present ? 1 : 0);
4659 
4660 	if (!nvme->n_write_cache_present) {
4661 		nvme->n_write_cache_enabled = B_FALSE;
4662 	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
4663 	    != 0) {
4664 		dev_err(nvme->n_dip, CE_WARN,
4665 		    "!failed to %sable volatile write cache",
4666 		    nvme->n_write_cache_enabled ? "en" : "dis");
4667 		/*
4668 		 * Assume the cache is (still) enabled.
4669 		 */
4670 		nvme->n_write_cache_enabled = B_TRUE;
4671 	}
4672 
4673 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
4674 	    "volatile-write-cache-enable",
4675 	    nvme->n_write_cache_enabled ? 1 : 0);
4676 
4677 	/*
4678 	 * Get number of supported namespaces and allocate namespace array.
4679 	 */
4680 	nvme->n_namespace_count = nvme->n_idctl->id_nn;
4681 
4682 	if (nvme->n_namespace_count == 0) {
4683 		dev_err(nvme->n_dip, CE_WARN,
4684 		    "!controllers without namespaces are not supported");
4685 		goto fail;
4686 	}
4687 
4688 	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
4689 	    nvme->n_namespace_count, KM_SLEEP);
4690 
4691 	/*
4692 	 * Get the common namespace information if available. If not, we use the
4693 	 * information for nsid 1.
4694 	 */
4695 	if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2) &&
4696 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
4697 		nsid = NVME_NSID_BCAST;
4698 	} else {
4699 		nsid = 1;
4700 	}
4701 
4702 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
4703 	    (void **)&nvme->n_idcomns)) {
4704 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify common "
4705 		    "namespace information");
4706 		goto fail;
4707 	}
4708 
4709 	/*
4710 	 * Try to set up MSI/MSI-X interrupts.
4711 	 */
4712 	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
4713 	    != 0) {
4714 		nvme_release_interrupts(nvme);
4715 
4716 		nqueues = MIN(UINT16_MAX, ncpus);
4717 
4718 		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
4719 		    nqueues) != DDI_SUCCESS) &&
4720 		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
4721 		    nqueues) != DDI_SUCCESS)) {
4722 			dev_err(nvme->n_dip, CE_WARN,
4723 			    "!failed to set up MSI/MSI-X interrupts");
4724 			goto fail;
4725 		}
4726 	}
4727 
4728 	/*
4729 	 * Create I/O queue pairs.
4730 	 */
4731 
4732 	if (nvme_set_nqueues(nvme) != 0) {
4733 		dev_err(nvme->n_dip, CE_WARN,
4734 		    "!failed to set number of I/O queues to %d",
4735 		    nvme->n_intr_cnt);
4736 		goto fail;
4737 	}
4738 
4739 	/*
4740 	 * Reallocate I/O queue array
4741 	 */
4742 	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
4743 	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
4744 	    (nvme->n_submission_queues + 1), KM_SLEEP);
4745 	nvme->n_ioq[0] = nvme->n_adminq;
4746 
4747 	/*
4748 	 * There should always be at least as many submission queues
4749 	 * as completion queues.
4750 	 */
4751 	ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
4752 
4753 	nvme->n_ioq_count = nvme->n_submission_queues;
4754 
4755 	nvme->n_io_squeue_len =
4756 	    MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
4757 
4758 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
4759 	    nvme->n_io_squeue_len);
4760 
4761 	/*
4762 	 * Pre-allocate completion queues.
4763 	 * When there are the same number of submission and completion
4764 	 * queues there is no value in having a larger completion
4765 	 * queue length.
4766 	 */
4767 	if (nvme->n_submission_queues == nvme->n_completion_queues)
4768 		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4769 		    nvme->n_io_squeue_len);
4770 
4771 	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4772 	    nvme->n_max_queue_entries);
4773 
4774 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
4775 	    nvme->n_io_cqueue_len);
4776 
4777 	/*
4778 	 * Assign the equal quantity of taskq threads to each completion
4779 	 * queue, capping the total number of threads to the number
4780 	 * of CPUs.
4781 	 */
4782 	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
4783 
4784 	/*
4785 	 * In case the calculation above is zero, we need at least one
4786 	 * thread per completion queue.
4787 	 */
4788 	tq_threads = MAX(1, tq_threads);
4789 
4790 	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
4791 	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
4792 		dev_err(nvme->n_dip, CE_WARN,
4793 		    "!failed to pre-allocate completion queues");
4794 		goto fail;
4795 	}
4796 
4797 	/*
4798 	 * If we use less completion queues than interrupt vectors return
4799 	 * some of the interrupt vectors back to the system.
4800 	 */
4801 	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
4802 		nvme_release_interrupts(nvme);
4803 
4804 		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
4805 		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
4806 			dev_err(nvme->n_dip, CE_WARN,
4807 			    "!failed to reduce number of interrupts");
4808 			goto fail;
4809 		}
4810 	}
4811 
4812 	/*
4813 	 * Alloc & register I/O queue pairs
4814 	 */
4815 
4816 	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
4817 		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
4818 		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
4819 			dev_err(nvme->n_dip, CE_WARN,
4820 			    "!unable to allocate I/O qpair %d", i);
4821 			goto fail;
4822 		}
4823 
4824 		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
4825 			dev_err(nvme->n_dip, CE_WARN,
4826 			    "!unable to create I/O qpair %d", i);
4827 			goto fail;
4828 		}
4829 	}
4830 
4831 	/*
4832 	 * Enable any host behavior features that make sense for us.
4833 	 */
4834 	nvme_enable_host_behavior(nvme);
4835 
4836 	return (DDI_SUCCESS);
4837 
4838 fail:
4839 	(void) nvme_reset(nvme, B_FALSE);
4840 	return (DDI_FAILURE);
4841 }
4842 
4843 static uint_t
4844 nvme_intr(caddr_t arg1, caddr_t arg2)
4845 {
4846 	nvme_t *nvme = (nvme_t *)arg1;
4847 	int inum = (int)(uintptr_t)arg2;
4848 	int ccnt = 0;
4849 	int qnum;
4850 
4851 	if (inum >= nvme->n_intr_cnt)
4852 		return (DDI_INTR_UNCLAIMED);
4853 
4854 	if (nvme->n_dead) {
4855 		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
4856 		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
4857 	}
4858 
4859 	/*
4860 	 * The interrupt vector a queue uses is calculated as queue_idx %
4861 	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
4862 	 * in steps of n_intr_cnt to process all queues using this vector.
4863 	 */
4864 	for (qnum = inum;
4865 	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
4866 	    qnum += nvme->n_intr_cnt) {
4867 		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
4868 	}
4869 
4870 	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
4871 }
4872 
4873 static void
4874 nvme_release_interrupts(nvme_t *nvme)
4875 {
4876 	int i;
4877 
4878 	for (i = 0; i < nvme->n_intr_cnt; i++) {
4879 		if (nvme->n_inth[i] == NULL)
4880 			break;
4881 
4882 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4883 			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
4884 		else
4885 			(void) ddi_intr_disable(nvme->n_inth[i]);
4886 
4887 		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
4888 		(void) ddi_intr_free(nvme->n_inth[i]);
4889 	}
4890 
4891 	kmem_free(nvme->n_inth, nvme->n_inth_sz);
4892 	nvme->n_inth = NULL;
4893 	nvme->n_inth_sz = 0;
4894 
4895 	nvme->n_progress &= ~NVME_INTERRUPTS;
4896 }
4897 
4898 static int
4899 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
4900 {
4901 	int nintrs, navail, count;
4902 	int ret;
4903 	int i;
4904 
4905 	if (nvme->n_intr_types == 0) {
4906 		ret = ddi_intr_get_supported_types(nvme->n_dip,
4907 		    &nvme->n_intr_types);
4908 		if (ret != DDI_SUCCESS) {
4909 			dev_err(nvme->n_dip, CE_WARN,
4910 			    "!%s: ddi_intr_get_supported types failed",
4911 			    __func__);
4912 			return (ret);
4913 		}
4914 #ifdef __x86
4915 		if (get_hwenv() == HW_VMWARE)
4916 			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
4917 #endif
4918 	}
4919 
4920 	if ((nvme->n_intr_types & intr_type) == 0)
4921 		return (DDI_FAILURE);
4922 
4923 	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
4924 	if (ret != DDI_SUCCESS) {
4925 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
4926 		    __func__);
4927 		return (ret);
4928 	}
4929 
4930 	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
4931 	if (ret != DDI_SUCCESS) {
4932 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
4933 		    __func__);
4934 		return (ret);
4935 	}
4936 
4937 	/* We want at most one interrupt per queue pair. */
4938 	if (navail > nqpairs)
4939 		navail = nqpairs;
4940 
4941 	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
4942 	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
4943 
4944 	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
4945 	    &count, 0);
4946 	if (ret != DDI_SUCCESS) {
4947 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
4948 		    __func__);
4949 		goto fail;
4950 	}
4951 
4952 	nvme->n_intr_cnt = count;
4953 
4954 	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
4955 	if (ret != DDI_SUCCESS) {
4956 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
4957 		    __func__);
4958 		goto fail;
4959 	}
4960 
4961 	for (i = 0; i < count; i++) {
4962 		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
4963 		    (void *)nvme, (void *)(uintptr_t)i);
4964 		if (ret != DDI_SUCCESS) {
4965 			dev_err(nvme->n_dip, CE_WARN,
4966 			    "!%s: ddi_intr_add_handler failed", __func__);
4967 			goto fail;
4968 		}
4969 	}
4970 
4971 	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
4972 
4973 	for (i = 0; i < count; i++) {
4974 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4975 			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
4976 		else
4977 			ret = ddi_intr_enable(nvme->n_inth[i]);
4978 
4979 		if (ret != DDI_SUCCESS) {
4980 			dev_err(nvme->n_dip, CE_WARN,
4981 			    "!%s: enabling interrupt %d failed", __func__, i);
4982 			goto fail;
4983 		}
4984 	}
4985 
4986 	nvme->n_intr_type = intr_type;
4987 
4988 	nvme->n_progress |= NVME_INTERRUPTS;
4989 
4990 	return (DDI_SUCCESS);
4991 
4992 fail:
4993 	nvme_release_interrupts(nvme);
4994 
4995 	return (ret);
4996 }
4997 
4998 static int
4999 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
5000 {
5001 	_NOTE(ARGUNUSED(arg));
5002 
5003 	pci_ereport_post(dip, fm_error, NULL);
5004 	return (fm_error->fme_status);
5005 }
5006 
5007 static void
5008 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
5009     void *b)
5010 {
5011 	nvme_t *nvme = a;
5012 
5013 	nvme_ctrl_mark_dead(nvme, B_TRUE);
5014 
5015 	/*
5016 	 * Fail all outstanding commands, including those in the admin queue
5017 	 * (queue 0).
5018 	 */
5019 	for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
5020 		nvme_qpair_t *qp = nvme->n_ioq[i];
5021 
5022 		mutex_enter(&qp->nq_mutex);
5023 		for (size_t j = 0; j < qp->nq_nentry; j++) {
5024 			nvme_cmd_t *cmd = qp->nq_cmd[j];
5025 			nvme_cmd_t *u_cmd;
5026 
5027 			if (cmd == NULL) {
5028 				continue;
5029 			}
5030 
5031 			/*
5032 			 * Since we have the queue lock held the entire time we
5033 			 * iterate over it, it's not possible for the queue to
5034 			 * change underneath us. Thus, we don't need to check
5035 			 * that the return value of nvme_unqueue_cmd matches the
5036 			 * requested cmd to unqueue.
5037 			 */
5038 			u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
5039 			taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
5040 			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
5041 
5042 			ASSERT3P(u_cmd, ==, cmd);
5043 		}
5044 		mutex_exit(&qp->nq_mutex);
5045 	}
5046 }
5047 
5048 /*
5049  * Open minor management
5050  */
5051 static int
5052 nvme_minor_comparator(const void *l, const void *r)
5053 {
5054 	const nvme_minor_t *lm = l;
5055 	const nvme_minor_t *rm = r;
5056 
5057 	if (lm->nm_minor > rm->nm_minor) {
5058 		return (1);
5059 	} else if (lm->nm_minor < rm->nm_minor) {
5060 		return (-1);
5061 	} else {
5062 		return (0);
5063 	}
5064 }
5065 
5066 static void
5067 nvme_minor_free(nvme_minor_t *minor)
5068 {
5069 	if (minor->nm_minor > 0) {
5070 		ASSERT3S(minor->nm_minor, >=, NVME_OPEN_MINOR_MIN);
5071 		id_free(nvme_open_minors, minor->nm_minor);
5072 		minor->nm_minor = 0;
5073 	}
5074 	VERIFY0(list_link_active(&minor->nm_ctrl_lock.nli_node));
5075 	VERIFY0(list_link_active(&minor->nm_ns_lock.nli_node));
5076 	cv_destroy(&minor->nm_cv);
5077 	kmem_free(minor, sizeof (nvme_minor_t));
5078 }
5079 
5080 static nvme_minor_t *
5081 nvme_minor_find_by_dev(dev_t dev)
5082 {
5083 	id_t id = (id_t)getminor(dev);
5084 	nvme_minor_t search = { .nm_minor = id };
5085 	nvme_minor_t *ret;
5086 
5087 	mutex_enter(&nvme_open_minors_mutex);
5088 	ret = avl_find(&nvme_open_minors_avl, &search, NULL);
5089 	mutex_exit(&nvme_open_minors_mutex);
5090 
5091 	return (ret);
5092 }
5093 
5094 static int
5095 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5096 {
5097 	nvme_t *nvme;
5098 	int instance;
5099 	int nregs;
5100 	off_t regsize;
5101 	char name[32];
5102 
5103 	if (cmd != DDI_ATTACH)
5104 		return (DDI_FAILURE);
5105 
5106 	instance = ddi_get_instance(dip);
5107 
5108 	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
5109 		return (DDI_FAILURE);
5110 
5111 	nvme = ddi_get_soft_state(nvme_state, instance);
5112 	ddi_set_driver_private(dip, nvme);
5113 	nvme->n_dip = dip;
5114 
5115 	/*
5116 	 * Map PCI config space
5117 	 */
5118 	if (pci_config_setup(dip, &nvme->n_pcicfg_handle) != DDI_SUCCESS) {
5119 		dev_err(dip, CE_WARN, "!failed to map PCI config space");
5120 		goto fail;
5121 	}
5122 	nvme->n_progress |= NVME_PCI_CONFIG;
5123 
5124 	/*
5125 	 * Get the various PCI IDs from config space
5126 	 */
5127 	nvme->n_vendor_id =
5128 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_VENID);
5129 	nvme->n_device_id =
5130 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_DEVID);
5131 	nvme->n_revision_id =
5132 	    pci_config_get8(nvme->n_pcicfg_handle, PCI_CONF_REVID);
5133 	nvme->n_subsystem_device_id =
5134 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBSYSID);
5135 	nvme->n_subsystem_vendor_id =
5136 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBVENID);
5137 
5138 	nvme_detect_quirks(nvme);
5139 
5140 	/*
5141 	 * Set up event handlers for hot removal. While npe(4D) supports the hot
5142 	 * removal event being injected for devices, the same is not true of all
5143 	 * of our possible parents (i.e. pci(4D) as of this writing). The most
5144 	 * common case this shows up is in some virtualization environments. We
5145 	 * should treat this as non-fatal so that way devices work but leave
5146 	 * this set up in such a way that if a nexus does grow support for this
5147 	 * we're good to go.
5148 	 */
5149 	if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
5150 	    &nvme->n_rm_cookie) == DDI_SUCCESS) {
5151 		if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
5152 		    nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
5153 		    DDI_SUCCESS) {
5154 			goto fail;
5155 		}
5156 	} else {
5157 		nvme->n_ev_rm_cb_id = NULL;
5158 	}
5159 
5160 	mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL);
5161 	nvme->n_progress |= NVME_MUTEX_INIT;
5162 
5163 	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5164 	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
5165 	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
5166 	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
5167 	    B_TRUE : B_FALSE;
5168 	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5169 	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
5170 	nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5171 	    DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
5172 	/*
5173 	 * Double up the default for completion queues in case of
5174 	 * queue sharing.
5175 	 */
5176 	nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5177 	    DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
5178 	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5179 	    DDI_PROP_DONTPASS, "async-event-limit",
5180 	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
5181 	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5182 	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
5183 	    B_TRUE : B_FALSE;
5184 	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5185 	    DDI_PROP_DONTPASS, "min-phys-block-size",
5186 	    NVME_DEFAULT_MIN_BLOCK_SIZE);
5187 	nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5188 	    DDI_PROP_DONTPASS, "max-submission-queues", -1);
5189 	nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
5190 	    DDI_PROP_DONTPASS, "max-completion-queues", -1);
5191 
5192 	if (!ISP2(nvme->n_min_block_size) ||
5193 	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
5194 		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
5195 		    "using default %d", ISP2(nvme->n_min_block_size) ?
5196 		    "too low" : "not a power of 2",
5197 		    NVME_DEFAULT_MIN_BLOCK_SIZE);
5198 		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
5199 	}
5200 
5201 	if (nvme->n_submission_queues != -1 &&
5202 	    (nvme->n_submission_queues < 1 ||
5203 	    nvme->n_submission_queues > UINT16_MAX)) {
5204 		dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
5205 		    "valid. Must be [1..%d]", nvme->n_submission_queues,
5206 		    UINT16_MAX);
5207 		nvme->n_submission_queues = -1;
5208 	}
5209 
5210 	if (nvme->n_completion_queues != -1 &&
5211 	    (nvme->n_completion_queues < 1 ||
5212 	    nvme->n_completion_queues > UINT16_MAX)) {
5213 		dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
5214 		    "valid. Must be [1..%d]", nvme->n_completion_queues,
5215 		    UINT16_MAX);
5216 		nvme->n_completion_queues = -1;
5217 	}
5218 
5219 	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
5220 		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
5221 	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
5222 		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
5223 
5224 	if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
5225 		nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
5226 	if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
5227 		nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
5228 
5229 	if (nvme->n_async_event_limit < 1)
5230 		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
5231 
5232 	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
5233 	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
5234 	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
5235 	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
5236 
5237 	/*
5238 	 * Set up FMA support.
5239 	 */
5240 	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
5241 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
5242 	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
5243 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
5244 
5245 	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
5246 
5247 	if (nvme->n_fm_cap) {
5248 		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
5249 			nvme->n_reg_acc_attr.devacc_attr_access =
5250 			    DDI_FLAGERR_ACC;
5251 
5252 		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
5253 			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
5254 			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
5255 		}
5256 
5257 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
5258 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
5259 			pci_ereport_setup(dip);
5260 
5261 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
5262 			ddi_fm_handler_register(dip, nvme_fm_errcb,
5263 			    (void *)nvme);
5264 	}
5265 
5266 	nvme->n_progress |= NVME_FMA_INIT;
5267 
5268 	/*
5269 	 * The spec defines several register sets. Only the controller
5270 	 * registers (set 1) are currently used.
5271 	 */
5272 	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
5273 	    nregs < 2 ||
5274 	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
5275 		goto fail;
5276 
5277 	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
5278 	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
5279 		dev_err(dip, CE_WARN, "!failed to map regset 1");
5280 		goto fail;
5281 	}
5282 
5283 	nvme->n_progress |= NVME_REGS_MAPPED;
5284 
5285 	/*
5286 	 * Set up kstats
5287 	 */
5288 	if (!nvme_stat_init(nvme)) {
5289 		dev_err(dip, CE_WARN, "!failed to create device kstats");
5290 		goto fail;
5291 	}
5292 	nvme->n_progress |= NVME_STAT_INIT;
5293 
5294 	/*
5295 	 * Create PRP DMA cache
5296 	 */
5297 	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
5298 	    ddi_driver_name(dip), ddi_get_instance(dip));
5299 	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
5300 	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
5301 	    NULL, (void *)nvme, NULL, 0);
5302 
5303 	if (nvme_init(nvme) != DDI_SUCCESS)
5304 		goto fail;
5305 
5306 	/*
5307 	 * Initialize the driver with the UFM subsystem
5308 	 */
5309 	if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
5310 	    &nvme->n_ufmh, nvme) != 0) {
5311 		dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
5312 		goto fail;
5313 	}
5314 	mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
5315 	ddi_ufm_update(nvme->n_ufmh);
5316 	nvme->n_progress |= NVME_UFM_INIT;
5317 
5318 	nvme_mgmt_lock_init(&nvme->n_mgmt);
5319 	nvme_lock_init(&nvme->n_lock);
5320 	nvme->n_progress |= NVME_MGMT_INIT;
5321 
5322 	/*
5323 	 * Identify namespaces.
5324 	 */
5325 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
5326 
5327 	boolean_t minor_logged = B_FALSE;
5328 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
5329 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
5330 
5331 		nvme_lock_init(&ns->ns_lock);
5332 		ns->ns_progress |= NVME_NS_LOCK;
5333 
5334 		/*
5335 		 * Namespaces start out in the active state. This is the
5336 		 * default state until we find out information about the
5337 		 * namespaces in more detail. nvme_init_ns() will go through and
5338 		 * determine what the proper state should be. It will also use
5339 		 * this state change to keep an accurate count of attachable
5340 		 * namespaces.
5341 		 */
5342 		ns->ns_state = NVME_NS_STATE_ACTIVE;
5343 		if (nvme_init_ns(nvme, i) != 0) {
5344 			nvme_mgmt_unlock(nvme);
5345 			goto fail;
5346 		}
5347 
5348 		/*
5349 		 * We only create compat minor nodes for the namespace for the
5350 		 * first NVME_MINOR_MAX namespaces. Those that are beyond this
5351 		 * can only be accessed through the primary controller node,
5352 		 * which is generally fine as that's what libnvme uses and is
5353 		 * our preferred path. Not having a minor is better than not
5354 		 * having the namespace!
5355 		 */
5356 		if (i > NVME_MINOR_MAX) {
5357 			if (!minor_logged) {
5358 				dev_err(dip, CE_WARN, "namespace minor "
5359 				    "creation limited to the first %u "
5360 				    "namespaces, device has %u",
5361 				    NVME_MINOR_MAX, nvme->n_namespace_count);
5362 				minor_logged = B_TRUE;
5363 			}
5364 			continue;
5365 		}
5366 
5367 		if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR,
5368 		    NVME_MINOR(ddi_get_instance(nvme->n_dip), i),
5369 		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
5370 			nvme_mgmt_unlock(nvme);
5371 			dev_err(dip, CE_WARN,
5372 			    "!failed to create minor node for namespace %d", i);
5373 			goto fail;
5374 		}
5375 		ns->ns_progress |= NVME_NS_MINOR;
5376 	}
5377 
5378 	/*
5379 	 * Indicate that namespace initialization is complete and therefore
5380 	 * marking the controller dead can evaluate every namespace lock.
5381 	 */
5382 	nvme->n_progress |= NVME_NS_INIT;
5383 
5384 	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
5385 	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) !=
5386 	    DDI_SUCCESS) {
5387 		nvme_mgmt_unlock(nvme);
5388 		dev_err(dip, CE_WARN, "nvme_attach: "
5389 		    "cannot create devctl minor node");
5390 		goto fail;
5391 	}
5392 
5393 	/*
5394 	 * Attempt to attach all namespaces that are in a reasonable state. This
5395 	 * should not fail attach.
5396 	 */
5397 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
5398 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
5399 		nvme_ioctl_common_t com = { .nioc_nsid = i };
5400 
5401 		if (ns->ns_state < NVME_NS_STATE_NOT_IGNORED)
5402 			continue;
5403 
5404 		if (!nvme_bd_attach_ns(nvme, &com) && com.nioc_drv_err !=
5405 		    NVME_IOCTL_E_UNSUP_ATTACH_NS) {
5406 			dev_err(nvme->n_dip, CE_WARN, "!failed to attach "
5407 			    "namespace %d due to blkdev error (0x%x)", i,
5408 			    com.nioc_drv_err);
5409 		}
5410 	}
5411 
5412 	nvme_mgmt_unlock(nvme);
5413 
5414 	/*
5415 	 * As the last thing that we do, we finally go ahead and enable
5416 	 * asynchronous event notifications. Currently we rely upon whatever
5417 	 * defaults the device has for the events that we will receive. If we
5418 	 * enable this earlier, it's possible that we'll get events that we
5419 	 * cannot handle yet because all of our data structures are not valid.
5420 	 * The device will queue all asynchronous events on a per-log page basis
5421 	 * until we submit this. If the device is totally broken, it will have
5422 	 * likely failed our commands already. If we add support for configuring
5423 	 * which asynchronous events we would like to receive via the SET
5424 	 * FEATURES command, then we should do that as one of the first commands
5425 	 * we send in nvme_init().
5426 	 *
5427 	 * We start by assuming asynchronous events are supported. However, not
5428 	 * all devices (e.g. some versions of QEMU) support this, so we end up
5429 	 * tracking whether or not we think these actually work.
5430 	 */
5431 	nvme->n_async_event_supported = B_TRUE;
5432 	for (uint16_t i = 0; i < nvme->n_async_event_limit; i++) {
5433 		nvme_async_event(nvme);
5434 	}
5435 
5436 
5437 	return (DDI_SUCCESS);
5438 
5439 fail:
5440 	/* attach successful anyway so that FMA can retire the device */
5441 	if (nvme->n_dead)
5442 		return (DDI_SUCCESS);
5443 
5444 	(void) nvme_detach(dip, DDI_DETACH);
5445 
5446 	return (DDI_FAILURE);
5447 }
5448 
5449 static int
5450 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5451 {
5452 	int instance;
5453 	nvme_t *nvme;
5454 
5455 	if (cmd != DDI_DETACH)
5456 		return (DDI_FAILURE);
5457 
5458 	instance = ddi_get_instance(dip);
5459 
5460 	nvme = ddi_get_soft_state(nvme_state, instance);
5461 
5462 	if (nvme == NULL)
5463 		return (DDI_FAILURE);
5464 
5465 	/*
5466 	 * Remove all minor nodes from the device regardless of the source in
5467 	 * one swoop.
5468 	 */
5469 	ddi_remove_minor_node(dip, NULL);
5470 
5471 	/*
5472 	 * We need to remove the event handler as one of the first things that
5473 	 * we do. If we proceed with other teardown without removing the event
5474 	 * handler, we could end up in a very unfortunate race with ourselves.
5475 	 * The DDI does not serialize these with detach (just like timeout(9F)
5476 	 * and others).
5477 	 */
5478 	if (nvme->n_ev_rm_cb_id != NULL) {
5479 		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
5480 	}
5481 	nvme->n_ev_rm_cb_id = NULL;
5482 
5483 	/*
5484 	 * If the controller was marked dead, there is a slight chance that we
5485 	 * are asynchronusly processing the removal taskq. Because we have
5486 	 * removed the callback handler above and all minor nodes and commands
5487 	 * are closed, there is no other way to get in here. As such, we wait on
5488 	 * the nvme_dead_taskq to complete so we can avoid tracking if it's
5489 	 * running or not.
5490 	 */
5491 	taskq_wait(nvme_dead_taskq);
5492 
5493 	if (nvme->n_ns) {
5494 		for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
5495 			nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
5496 
5497 			if (ns->ns_bd_hdl) {
5498 				(void) bd_detach_handle(ns->ns_bd_hdl);
5499 				bd_free_handle(ns->ns_bd_hdl);
5500 			}
5501 
5502 			if (ns->ns_idns)
5503 				kmem_free(ns->ns_idns,
5504 				    sizeof (nvme_identify_nsid_t));
5505 			if (ns->ns_devid)
5506 				strfree(ns->ns_devid);
5507 
5508 			if ((ns->ns_progress & NVME_NS_LOCK) != 0)
5509 				nvme_lock_fini(&ns->ns_lock);
5510 		}
5511 
5512 		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
5513 		    nvme->n_namespace_count);
5514 	}
5515 
5516 	if (nvme->n_progress & NVME_MGMT_INIT) {
5517 		nvme_lock_fini(&nvme->n_lock);
5518 		nvme_mgmt_lock_fini(&nvme->n_mgmt);
5519 	}
5520 
5521 	if (nvme->n_progress & NVME_UFM_INIT) {
5522 		ddi_ufm_fini(nvme->n_ufmh);
5523 		mutex_destroy(&nvme->n_fwslot_mutex);
5524 	}
5525 
5526 	if (nvme->n_progress & NVME_INTERRUPTS)
5527 		nvme_release_interrupts(nvme);
5528 
5529 	for (uint_t i = 0; i < nvme->n_cq_count; i++) {
5530 		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
5531 			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
5532 	}
5533 
5534 	if (nvme->n_progress & NVME_MUTEX_INIT) {
5535 		mutex_destroy(&nvme->n_minor_mutex);
5536 	}
5537 
5538 	if (nvme->n_ioq_count > 0) {
5539 		for (uint_t i = 1; i != nvme->n_ioq_count + 1; i++) {
5540 			if (nvme->n_ioq[i] != NULL) {
5541 				/* TODO: send destroy queue commands */
5542 				nvme_free_qpair(nvme->n_ioq[i]);
5543 			}
5544 		}
5545 
5546 		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
5547 		    (nvme->n_ioq_count + 1));
5548 	}
5549 
5550 	if (nvme->n_prp_cache != NULL) {
5551 		kmem_cache_destroy(nvme->n_prp_cache);
5552 	}
5553 
5554 	if (nvme->n_progress & NVME_REGS_MAPPED) {
5555 		nvme_shutdown(nvme, B_FALSE);
5556 		(void) nvme_reset(nvme, B_FALSE);
5557 	}
5558 
5559 	if (nvme->n_progress & NVME_CTRL_LIMITS)
5560 		sema_destroy(&nvme->n_abort_sema);
5561 
5562 	if (nvme->n_progress & NVME_ADMIN_QUEUE)
5563 		nvme_free_qpair(nvme->n_adminq);
5564 
5565 	if (nvme->n_cq_count > 0) {
5566 		nvme_destroy_cq_array(nvme, 0);
5567 		nvme->n_cq = NULL;
5568 		nvme->n_cq_count = 0;
5569 	}
5570 
5571 	if (nvme->n_idcomns)
5572 		kmem_free(nvme->n_idcomns, NVME_IDENTIFY_BUFSIZE);
5573 
5574 	if (nvme->n_idctl)
5575 		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
5576 
5577 	if (nvme->n_progress & NVME_REGS_MAPPED)
5578 		ddi_regs_map_free(&nvme->n_regh);
5579 
5580 	if (nvme->n_progress & NVME_STAT_INIT)
5581 		nvme_stat_cleanup(nvme);
5582 
5583 	if (nvme->n_progress & NVME_FMA_INIT) {
5584 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
5585 			ddi_fm_handler_unregister(nvme->n_dip);
5586 
5587 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
5588 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
5589 			pci_ereport_teardown(nvme->n_dip);
5590 
5591 		ddi_fm_fini(nvme->n_dip);
5592 	}
5593 
5594 	if (nvme->n_progress & NVME_PCI_CONFIG)
5595 		pci_config_teardown(&nvme->n_pcicfg_handle);
5596 
5597 	if (nvme->n_vendor != NULL)
5598 		strfree(nvme->n_vendor);
5599 
5600 	if (nvme->n_product != NULL)
5601 		strfree(nvme->n_product);
5602 
5603 	ddi_soft_state_free(nvme_state, instance);
5604 
5605 	return (DDI_SUCCESS);
5606 }
5607 
5608 static int
5609 nvme_quiesce(dev_info_t *dip)
5610 {
5611 	int instance;
5612 	nvme_t *nvme;
5613 
5614 	instance = ddi_get_instance(dip);
5615 
5616 	nvme = ddi_get_soft_state(nvme_state, instance);
5617 
5618 	if (nvme == NULL)
5619 		return (DDI_FAILURE);
5620 
5621 	nvme_shutdown(nvme, B_TRUE);
5622 
5623 	(void) nvme_reset(nvme, B_TRUE);
5624 
5625 	return (DDI_SUCCESS);
5626 }
5627 
5628 static int
5629 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
5630 {
5631 	nvme_t *nvme = cmd->nc_nvme;
5632 	uint_t nprp_per_page, nprp;
5633 	uint64_t *prp;
5634 	const ddi_dma_cookie_t *cookie;
5635 	uint_t idx;
5636 	uint_t ncookies = ddi_dma_ncookies(dma);
5637 
5638 	if (ncookies == 0)
5639 		return (DDI_FAILURE);
5640 
5641 	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
5642 		return (DDI_FAILURE);
5643 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;
5644 
5645 	if (ncookies == 1) {
5646 		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
5647 		return (DDI_SUCCESS);
5648 	} else if (ncookies == 2) {
5649 		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
5650 			return (DDI_FAILURE);
5651 		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
5652 		return (DDI_SUCCESS);
5653 	}
5654 
5655 	/*
5656 	 * At this point, we're always operating on cookies at
5657 	 * index >= 1 and writing the addresses of those cookies
5658 	 * into a new page. The address of that page is stored
5659 	 * as the second PRP entry.
5660 	 */
5661 	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
5662 	ASSERT(nprp_per_page > 0);
5663 
5664 	/*
5665 	 * We currently don't support chained PRPs and set up our DMA
5666 	 * attributes to reflect that. If we still get an I/O request
5667 	 * that needs a chained PRP something is very wrong. Account
5668 	 * for the first cookie here, which we've placed in d_prp[0].
5669 	 */
5670 	nprp = howmany(ncookies - 1, nprp_per_page);
5671 	VERIFY(nprp == 1);
5672 
5673 	/*
5674 	 * Allocate a page of pointers, in which we'll write the
5675 	 * addresses of cookies 1 to `ncookies`.
5676 	 */
5677 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
5678 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
5679 	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;
5680 
5681 	prp = (uint64_t *)cmd->nc_prp->nd_memp;
5682 	for (idx = 1; idx < ncookies; idx++) {
5683 		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
5684 			return (DDI_FAILURE);
5685 		*prp++ = cookie->dmac_laddress;
5686 	}
5687 
5688 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
5689 	    DDI_DMA_SYNC_FORDEV);
5690 	return (DDI_SUCCESS);
5691 }
5692 
5693 /*
5694  * The maximum number of requests supported for a deallocate request is
5695  * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
5696  * unchanged through at least 1.4a). The definition of nvme_range_t is also
5697  * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
5698  * a deallocate request will fit into the smallest supported namespace page
5699  * (4k).
5700  */
5701 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);
5702 
5703 static int
5704 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
5705     int allocflag)
5706 {
5707 	const dkioc_free_list_t *dfl = xfer->x_dfl;
5708 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
5709 	nvme_t *nvme = cmd->nc_nvme;
5710 	nvme_range_t *ranges = NULL;
5711 	uint_t i;
5712 
5713 	/*
5714 	 * The number of ranges in the request is 0s based (that is
5715 	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
5716 	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
5717 	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
5718 	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
5719 	 * in blkdev.
5720 	 */
5721 	VERIFY3U(dfl->dfl_num_exts, >, 0);
5722 	VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
5723 	cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
5724 
5725 	cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
5726 
5727 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
5728 	if (cmd->nc_prp == NULL)
5729 		return (DDI_FAILURE);
5730 
5731 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
5732 	ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;
5733 
5734 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
5735 	cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
5736 
5737 	for (i = 0; i < dfl->dfl_num_exts; i++) {
5738 		uint64_t lba, len;
5739 
5740 		lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
5741 		len = exts[i].dfle_length / blocksize;
5742 
5743 		VERIFY3U(len, <=, UINT32_MAX);
5744 
5745 		/* No context attributes for a deallocate request */
5746 		ranges[i].nr_ctxattr = 0;
5747 		ranges[i].nr_len = len;
5748 		ranges[i].nr_lba = lba;
5749 	}
5750 
5751 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
5752 	    DDI_DMA_SYNC_FORDEV);
5753 
5754 	return (DDI_SUCCESS);
5755 }
5756 
5757 static nvme_cmd_t *
5758 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
5759 {
5760 	nvme_t *nvme = ns->ns_nvme;
5761 	nvme_cmd_t *cmd;
5762 	int allocflag;
5763 
5764 	/*
5765 	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
5766 	 */
5767 	allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
5768 	cmd = nvme_alloc_cmd(nvme, allocflag);
5769 
5770 	if (cmd == NULL)
5771 		return (NULL);
5772 
5773 	cmd->nc_sqe.sqe_opc = opc;
5774 	cmd->nc_callback = nvme_bd_xfer_done;
5775 	cmd->nc_xfer = xfer;
5776 
5777 	switch (opc) {
5778 	case NVME_OPC_NVM_WRITE:
5779 	case NVME_OPC_NVM_READ:
5780 		VERIFY(xfer->x_nblks <= 0x10000);
5781 
5782 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5783 
5784 		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
5785 		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
5786 		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
5787 
5788 		if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
5789 			goto fail;
5790 		break;
5791 
5792 	case NVME_OPC_NVM_FLUSH:
5793 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5794 		break;
5795 
5796 	case NVME_OPC_NVM_DSET_MGMT:
5797 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5798 
5799 		if (nvme_fill_ranges(cmd, xfer,
5800 		    (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
5801 			goto fail;
5802 		break;
5803 
5804 	default:
5805 		goto fail;
5806 	}
5807 
5808 	return (cmd);
5809 
5810 fail:
5811 	nvme_free_cmd(cmd);
5812 	return (NULL);
5813 }
5814 
5815 static void
5816 nvme_bd_xfer_done(void *arg)
5817 {
5818 	nvme_cmd_t *cmd = arg;
5819 	bd_xfer_t *xfer = cmd->nc_xfer;
5820 	int error = 0;
5821 
5822 	error = nvme_check_cmd_status(cmd);
5823 	nvme_free_cmd(cmd);
5824 
5825 	bd_xfer_done(xfer, error);
5826 }
5827 
5828 static void
5829 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
5830 {
5831 	nvme_namespace_t *ns = arg;
5832 	nvme_t *nvme = ns->ns_nvme;
5833 	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
5834 
5835 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_BDRO);
5836 
5837 	/*
5838 	 * Set the blkdev qcount to the number of submission queues.
5839 	 * It will then create one waitq/runq pair for each submission
5840 	 * queue and spread I/O requests across the queues.
5841 	 */
5842 	drive->d_qcount = nvme->n_ioq_count;
5843 
5844 	/*
5845 	 * I/O activity to individual namespaces is distributed across
5846 	 * each of the d_qcount blkdev queues (which has been set to
5847 	 * the number of nvme submission queues). d_qsize is the number
5848 	 * of submitted and not completed I/Os within each queue that blkdev
5849 	 * will allow before it starts holding them in the waitq.
5850 	 *
5851 	 * Each namespace will create a child blkdev instance, for each one
5852 	 * we try and set the d_qsize so that each namespace gets an
5853 	 * equal portion of the submission queue.
5854 	 *
5855 	 * If post instantiation of the nvme drive, n_namespaces_attachable
5856 	 * changes and a namespace is attached it could calculate a
5857 	 * different d_qsize. It may even be that the sum of the d_qsizes is
5858 	 * now beyond the submission queue size. Should that be the case
5859 	 * and the I/O rate is such that blkdev attempts to submit more
5860 	 * I/Os than the size of the submission queue, the excess I/Os
5861 	 * will be held behind the semaphore nq_sema.
5862 	 */
5863 	drive->d_qsize = nvme->n_io_squeue_len / ns_count;
5864 
5865 	/*
5866 	 * Don't let the queue size drop below the minimum, though.
5867 	 */
5868 	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
5869 
5870 	/*
5871 	 * d_maxxfer is not set, which means the value is taken from the DMA
5872 	 * attributes specified to bd_alloc_handle.
5873 	 */
5874 
5875 	drive->d_removable = B_FALSE;
5876 	drive->d_hotpluggable = B_FALSE;
5877 
5878 	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
5879 	drive->d_target = ns->ns_id;
5880 	drive->d_lun = 0;
5881 
5882 	drive->d_model = nvme->n_idctl->id_model;
5883 	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
5884 	drive->d_vendor = nvme->n_vendor;
5885 	drive->d_vendor_len = strlen(nvme->n_vendor);
5886 	drive->d_product = nvme->n_product;
5887 	drive->d_product_len = strlen(nvme->n_product);
5888 	drive->d_serial = nvme->n_idctl->id_serial;
5889 	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
5890 	drive->d_revision = nvme->n_idctl->id_fwrev;
5891 	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
5892 
5893 	/*
5894 	 * If we support the dataset management command, the only restrictions
5895 	 * on a discard request are the maximum number of ranges (segments)
5896 	 * per single request.
5897 	 */
5898 	if (nvme->n_idctl->id_oncs.on_dset_mgmt)
5899 		drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
5900 
5901 	nvme_mgmt_unlock(nvme);
5902 }
5903 
5904 static int
5905 nvme_bd_mediainfo(void *arg, bd_media_t *media)
5906 {
5907 	nvme_namespace_t *ns = arg;
5908 	nvme_t *nvme = ns->ns_nvme;
5909 
5910 	if (nvme->n_dead) {
5911 		return (EIO);
5912 	}
5913 
5914 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_BDRO);
5915 
5916 	media->m_nblks = ns->ns_block_count;
5917 	media->m_blksize = ns->ns_block_size;
5918 	media->m_readonly = B_FALSE;
5919 	media->m_solidstate = B_TRUE;
5920 
5921 	media->m_pblksize = ns->ns_best_block_size;
5922 
5923 	nvme_mgmt_unlock(nvme);
5924 
5925 	return (0);
5926 }
5927 
5928 static int
5929 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
5930 {
5931 	nvme_t *nvme = ns->ns_nvme;
5932 	nvme_cmd_t *cmd;
5933 	nvme_qpair_t *ioq;
5934 	boolean_t poll;
5935 	int ret;
5936 
5937 	if (nvme->n_dead) {
5938 		return (EIO);
5939 	}
5940 
5941 	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
5942 	if (cmd == NULL)
5943 		return (ENOMEM);
5944 
5945 	cmd->nc_sqid = xfer->x_qnum + 1;
5946 	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
5947 	ioq = nvme->n_ioq[cmd->nc_sqid];
5948 
5949 	/*
5950 	 * Get the polling flag before submitting the command. The command may
5951 	 * complete immediately after it was submitted, which means we must
5952 	 * treat both cmd and xfer as if they have been freed already.
5953 	 */
5954 	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
5955 
5956 	ret = nvme_submit_io_cmd(ioq, cmd);
5957 
5958 	if (ret != 0)
5959 		return (ret);
5960 
5961 	if (!poll)
5962 		return (0);
5963 
5964 	do {
5965 		cmd = nvme_retrieve_cmd(nvme, ioq);
5966 		if (cmd != NULL) {
5967 			ASSERT0(cmd->nc_flags & NVME_CMD_F_USELOCK);
5968 			cmd->nc_callback(cmd);
5969 		} else {
5970 			drv_usecwait(10);
5971 		}
5972 	} while (ioq->nq_active_cmds != 0);
5973 
5974 	return (0);
5975 }
5976 
5977 static int
5978 nvme_bd_read(void *arg, bd_xfer_t *xfer)
5979 {
5980 	nvme_namespace_t *ns = arg;
5981 
5982 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
5983 }
5984 
5985 static int
5986 nvme_bd_write(void *arg, bd_xfer_t *xfer)
5987 {
5988 	nvme_namespace_t *ns = arg;
5989 
5990 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
5991 }
5992 
5993 static int
5994 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
5995 {
5996 	nvme_namespace_t *ns = arg;
5997 
5998 	if (ns->ns_nvme->n_dead)
5999 		return (EIO);
6000 
6001 	/*
6002 	 * If the volatile write cache is not present or not enabled the FLUSH
6003 	 * command is a no-op, so we can take a shortcut here.
6004 	 */
6005 	if (!ns->ns_nvme->n_write_cache_present) {
6006 		bd_xfer_done(xfer, ENOTSUP);
6007 		return (0);
6008 	}
6009 
6010 	if (!ns->ns_nvme->n_write_cache_enabled) {
6011 		bd_xfer_done(xfer, 0);
6012 		return (0);
6013 	}
6014 
6015 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
6016 }
6017 
6018 static int
6019 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
6020 {
6021 	nvme_namespace_t *ns = arg;
6022 	nvme_t *nvme = ns->ns_nvme;
6023 
6024 	if (nvme->n_dead) {
6025 		return (EIO);
6026 	}
6027 
6028 	if (*(uint64_t *)ns->ns_nguid != 0 ||
6029 	    *(uint64_t *)(ns->ns_nguid + 8) != 0) {
6030 		return (ddi_devid_init(devinfo, DEVID_NVME_NGUID,
6031 		    sizeof (ns->ns_nguid), ns->ns_nguid, devid));
6032 	} else if (*(uint64_t *)ns->ns_eui64 != 0) {
6033 		return (ddi_devid_init(devinfo, DEVID_NVME_EUI64,
6034 		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
6035 	} else {
6036 		return (ddi_devid_init(devinfo, DEVID_NVME_NSID,
6037 		    strlen(ns->ns_devid), ns->ns_devid, devid));
6038 	}
6039 }
6040 
6041 static int
6042 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
6043 {
6044 	nvme_namespace_t *ns = arg;
6045 
6046 	if (xfer->x_dfl == NULL)
6047 		return (EINVAL);
6048 
6049 	if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
6050 		return (ENOTSUP);
6051 
6052 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
6053 }
6054 
6055 static int
6056 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
6057 {
6058 #ifndef __lock_lint
6059 	_NOTE(ARGUNUSED(cred_p));
6060 #endif
6061 	nvme_t *nvme;
6062 	nvme_minor_t *minor = NULL;
6063 	uint32_t nsid;
6064 	minor_t m = getminor(*devp);
6065 	int rv = 0;
6066 
6067 	if (otyp != OTYP_CHR)
6068 		return (EINVAL);
6069 
6070 	if (m >= NVME_OPEN_MINOR_MIN)
6071 		return (ENXIO);
6072 
6073 	nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(m));
6074 	nsid = NVME_MINOR_NSID(m);
6075 
6076 	if (nvme == NULL)
6077 		return (ENXIO);
6078 
6079 	if (nsid > MIN(nvme->n_namespace_count, NVME_MINOR_MAX))
6080 		return (ENXIO);
6081 
6082 	if (nvme->n_dead)
6083 		return (EIO);
6084 
6085 	/*
6086 	 * At this point, we're going to allow an open to proceed on this
6087 	 * device. We need to allocate a new instance for this (presuming one is
6088 	 * available).
6089 	 */
6090 	minor = kmem_zalloc(sizeof (nvme_minor_t), KM_NOSLEEP_LAZY);
6091 	if (minor == NULL) {
6092 		return (ENOMEM);
6093 	}
6094 
6095 	cv_init(&minor->nm_cv, NULL, CV_DRIVER, NULL);
6096 	list_link_init(&minor->nm_ctrl_lock.nli_node);
6097 	minor->nm_ctrl_lock.nli_nvme = nvme;
6098 	minor->nm_ctrl_lock.nli_minor = minor;
6099 	list_link_init(&minor->nm_ns_lock.nli_node);
6100 	minor->nm_ns_lock.nli_nvme = nvme;
6101 	minor->nm_ns_lock.nli_minor = minor;
6102 	minor->nm_minor = id_alloc_nosleep(nvme_open_minors);
6103 	if (minor->nm_minor == -1) {
6104 		nvme_minor_free(minor);
6105 		return (ENOSPC);
6106 	}
6107 
6108 	minor->nm_ctrl = nvme;
6109 	if (nsid != 0) {
6110 		minor->nm_ns = nvme_nsid2ns(nvme, nsid);
6111 	}
6112 
6113 	/*
6114 	 * Before we check for exclusive access and attempt a lock if requested,
6115 	 * ensure that this minor is persisted.
6116 	 */
6117 	mutex_enter(&nvme_open_minors_mutex);
6118 	avl_add(&nvme_open_minors_avl, minor);
6119 	mutex_exit(&nvme_open_minors_mutex);
6120 
6121 	/*
6122 	 * A request for opening this FEXCL, is translated into a non-blocking
6123 	 * write lock of the appropriate entity. This honors the original
6124 	 * semantics here. In the future, we should see if we can remove this
6125 	 * and turn a request for FEXCL at open into ENOTSUP.
6126 	 */
6127 	mutex_enter(&nvme->n_minor_mutex);
6128 	if ((flag & FEXCL) != 0) {
6129 		nvme_ioctl_lock_t lock = {
6130 			.nil_level = NVME_LOCK_L_WRITE,
6131 			.nil_flags = NVME_LOCK_F_DONT_BLOCK
6132 		};
6133 
6134 		if (minor->nm_ns != NULL) {
6135 			lock.nil_ent = NVME_LOCK_E_NS;
6136 			lock.nil_common.nioc_nsid = nsid;
6137 		} else {
6138 			lock.nil_ent = NVME_LOCK_E_CTRL;
6139 		}
6140 		nvme_rwlock(minor, &lock);
6141 		if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
6142 			mutex_exit(&nvme->n_minor_mutex);
6143 
6144 			mutex_enter(&nvme_open_minors_mutex);
6145 			avl_remove(&nvme_open_minors_avl, minor);
6146 			mutex_exit(&nvme_open_minors_mutex);
6147 
6148 			nvme_minor_free(minor);
6149 			return (EBUSY);
6150 		}
6151 	}
6152 	mutex_exit(&nvme->n_minor_mutex);
6153 
6154 	*devp = makedevice(getmajor(*devp), (minor_t)minor->nm_minor);
6155 	return (rv);
6156 
6157 }
6158 
6159 static int
6160 nvme_close(dev_t dev, int flag __unused, int otyp, cred_t *cred_p __unused)
6161 {
6162 	nvme_minor_t *minor;
6163 	nvme_t *nvme;
6164 
6165 	if (otyp != OTYP_CHR) {
6166 		return (ENXIO);
6167 	}
6168 
6169 	minor = nvme_minor_find_by_dev(dev);
6170 	if (minor == NULL) {
6171 		return (ENXIO);
6172 	}
6173 
6174 	mutex_enter(&nvme_open_minors_mutex);
6175 	avl_remove(&nvme_open_minors_avl, minor);
6176 	mutex_exit(&nvme_open_minors_mutex);
6177 
6178 	/*
6179 	 * When this device is being closed, we must ensure that any locks held
6180 	 * by this are dealt with.
6181 	 */
6182 	nvme = minor->nm_ctrl;
6183 	mutex_enter(&nvme->n_minor_mutex);
6184 	ASSERT3U(minor->nm_ctrl_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
6185 	ASSERT3U(minor->nm_ns_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
6186 
6187 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
6188 		VERIFY3P(minor->nm_ctrl_lock.nli_lock, !=, NULL);
6189 		nvme_rwunlock(&minor->nm_ctrl_lock,
6190 		    minor->nm_ctrl_lock.nli_lock);
6191 	}
6192 
6193 	if (minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
6194 		VERIFY3P(minor->nm_ns_lock.nli_lock, !=, NULL);
6195 		nvme_rwunlock(&minor->nm_ns_lock, minor->nm_ns_lock.nli_lock);
6196 	}
6197 	mutex_exit(&nvme->n_minor_mutex);
6198 
6199 	nvme_minor_free(minor);
6200 
6201 	return (0);
6202 }
6203 
6204 void
6205 nvme_ioctl_success(nvme_ioctl_common_t *ioc)
6206 {
6207 	ioc->nioc_drv_err = NVME_IOCTL_E_OK;
6208 	ioc->nioc_ctrl_sc = NVME_CQE_SC_GEN_SUCCESS;
6209 	ioc->nioc_ctrl_sct = NVME_CQE_SCT_GENERIC;
6210 }
6211 
6212 boolean_t
6213 nvme_ioctl_error(nvme_ioctl_common_t *ioc, nvme_ioctl_errno_t err, uint32_t sct,
6214     uint32_t sc)
6215 {
6216 	ioc->nioc_drv_err = err;
6217 	ioc->nioc_ctrl_sct = sct;
6218 	ioc->nioc_ctrl_sc = sc;
6219 
6220 	return (B_FALSE);
6221 }
6222 
6223 static int
6224 nvme_ioctl_copyout_error(nvme_ioctl_errno_t err, intptr_t uaddr, int mode)
6225 {
6226 	nvme_ioctl_common_t ioc;
6227 
6228 	ASSERT3U(err, !=, NVME_IOCTL_E_CTRL_ERROR);
6229 	bzero(&ioc, sizeof (ioc));
6230 	if (ddi_copyout(&ioc, (void *)uaddr, sizeof (nvme_ioctl_common_t),
6231 	    mode & FKIOCTL) != 0) {
6232 		return (EFAULT);
6233 	}
6234 	return (0);
6235 }
6236 
6237 /*
6238  * The companion to the namespace checking. This occurs after any rewriting
6239  * occurs. This is the primary point that we attempt to enforce any operation's
6240  * exclusivity. Note, it is theoretically possible for an operation to be
6241  * ongoing and to have someone with an exclusive lock ask to unlock it for some
6242  * reason. This does not maintain the number of such events that are going on.
6243  * While perhaps this is leaving too much up to the user, by the same token we
6244  * don't try to stop them from issuing two different format NVM commands
6245  * targeting the whole device at the same time either, even though the
6246  * controller would really rather that didn't happen.
6247  */
6248 static boolean_t
6249 nvme_ioctl_excl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
6250     const nvme_ioctl_check_t *check)
6251 {
6252 	nvme_t *const nvme = minor->nm_ctrl;
6253 	nvme_namespace_t *ns;
6254 	boolean_t have_ctrl, have_ns, ctrl_is_excl, ns_is_excl;
6255 
6256 	/*
6257 	 * If the command doesn't require anything, then we're done.
6258 	 */
6259 	if (check->nck_excl == NVME_IOCTL_EXCL_SKIP) {
6260 		return (B_TRUE);
6261 	}
6262 
6263 	if (ioc->nioc_nsid == 0 || ioc->nioc_nsid == NVME_NSID_BCAST) {
6264 		ns = NULL;
6265 	} else {
6266 		ns = nvme_nsid2ns(nvme, ioc->nioc_nsid);
6267 	}
6268 
6269 	mutex_enter(&nvme->n_minor_mutex);
6270 	ctrl_is_excl = nvme->n_lock.nl_writer != NULL;
6271 	have_ctrl = nvme->n_lock.nl_writer == &minor->nm_ctrl_lock;
6272 	if (ns != NULL) {
6273 		/*
6274 		 * We explicitly test the namespace lock's writer versus asking
6275 		 * the minor because the minor's namespace lock may apply to a
6276 		 * different namespace.
6277 		 */
6278 		ns_is_excl = ns->ns_lock.nl_writer != NULL;
6279 		have_ns = ns->ns_lock.nl_writer == &minor->nm_ns_lock;
6280 		ASSERT0(have_ctrl && have_ns);
6281 #ifdef	DEBUG
6282 		if (have_ns) {
6283 			ASSERT3P(minor->nm_ns_lock.nli_ns, ==, ns);
6284 		}
6285 #endif
6286 	} else {
6287 		ns_is_excl = B_FALSE;
6288 		have_ns = B_FALSE;
6289 	}
6290 	ASSERT0(ctrl_is_excl && ns_is_excl);
6291 	mutex_exit(&nvme->n_minor_mutex);
6292 
6293 	if (check->nck_excl == NVME_IOCTL_EXCL_CTRL) {
6294 		if (have_ctrl) {
6295 			return (B_TRUE);
6296 		}
6297 
6298 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NEED_CTRL_WRLOCK,
6299 		    0, 0));
6300 	}
6301 
6302 	if (check->nck_excl == NVME_IOCTL_EXCL_WRITE) {
6303 		if (ns == NULL) {
6304 			if (have_ctrl) {
6305 				return (B_TRUE);
6306 			}
6307 			return (nvme_ioctl_error(ioc,
6308 			    NVME_IOCTL_E_NEED_CTRL_WRLOCK, 0, 0));
6309 		} else {
6310 			if (have_ctrl || have_ns) {
6311 				return (B_TRUE);
6312 			}
6313 			return (nvme_ioctl_error(ioc,
6314 			    NVME_IOCTL_E_NEED_NS_WRLOCK, 0, 0));
6315 		}
6316 	}
6317 
6318 	/*
6319 	 * Now we have an operation that does not require exclusive access. We
6320 	 * can proceed as long as no one else has it or if someone does it is
6321 	 * us. Regardless of what we target, a controller lock will stop us.
6322 	 */
6323 	if (ctrl_is_excl && !have_ctrl) {
6324 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_LOCKED, 0, 0));
6325 	}
6326 
6327 	/*
6328 	 * Only check namespace exclusivity if we are targeting one.
6329 	 */
6330 	if (ns != NULL && ns_is_excl && !have_ns) {
6331 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_LOCKED, 0, 0));
6332 	}
6333 
6334 	return (B_TRUE);
6335 }
6336 
6337 /*
6338  * Perform common checking as to whether or not an ioctl operation may proceed.
6339  * We check in this function various aspects of the namespace attributes that
6340  * it's calling on. Once the namespace attributes and any possible rewriting
6341  * have been performed, then we proceed to check whether or not the requisite
6342  * exclusive access is present in nvme_ioctl_excl_check().
6343  */
6344 static boolean_t
6345 nvme_ioctl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
6346     const nvme_ioctl_check_t *check)
6347 {
6348 	/*
6349 	 * If the minor has a namespace pointer, then it is constrained to that
6350 	 * namespace. If a namespace is allowed, then there are only two valid
6351 	 * values that we can find. The first is matching the minor. The second
6352 	 * is our value zero, which will be transformed to the current
6353 	 * namespace.
6354 	 */
6355 	if (minor->nm_ns != NULL) {
6356 		if (!check->nck_ns_ok || !check->nck_ns_minor_ok) {
6357 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NOT_CTRL, 0,
6358 			    0));
6359 		}
6360 
6361 		if (ioc->nioc_nsid == 0) {
6362 			ioc->nioc_nsid = minor->nm_ns->ns_id;
6363 		} else if (ioc->nioc_nsid != minor->nm_ns->ns_id) {
6364 			return (nvme_ioctl_error(ioc,
6365 			    NVME_IOCTL_E_MINOR_WRONG_NS, 0, 0));
6366 		}
6367 
6368 		return (nvme_ioctl_excl_check(minor, ioc, check));
6369 	}
6370 
6371 	/*
6372 	 * If we've been told to skip checking the controller, here's where we
6373 	 * do that. This should really only be for commands which use the
6374 	 * namespace ID for listing purposes and therefore can have
6375 	 * traditionally illegal values here.
6376 	 */
6377 	if (check->nck_skip_ctrl) {
6378 		return (nvme_ioctl_excl_check(minor, ioc, check));
6379 	}
6380 
6381 	/*
6382 	 * At this point, we know that we're on the controller's node. We first
6383 	 * deal with the simple case, is a namespace allowed at all or not. If
6384 	 * it is not allowed, then the only acceptable value is zero.
6385 	 */
6386 	if (!check->nck_ns_ok) {
6387 		if (ioc->nioc_nsid != 0) {
6388 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_UNUSE, 0,
6389 			    0));
6390 		}
6391 
6392 		return (nvme_ioctl_excl_check(minor, ioc, check));
6393 	}
6394 
6395 	/*
6396 	 * At this point, we know that a controller is allowed to use a
6397 	 * namespace. If we haven't been given zero or the broadcast namespace,
6398 	 * check to see if it's actually a valid namespace ID. If is outside of
6399 	 * range, then it is an error. Next, if we have been requested to
6400 	 * rewrite 0 (the this controller indicator) as the broadcast namespace,
6401 	 * do so.
6402 	 *
6403 	 * While we validate that this namespace is within the valid range, we
6404 	 * do not check if it is active or inactive. That is left to our callers
6405 	 * to determine.
6406 	 */
6407 	if (ioc->nioc_nsid > minor->nm_ctrl->n_namespace_count &&
6408 	    ioc->nioc_nsid != NVME_NSID_BCAST) {
6409 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_RANGE, 0, 0));
6410 	}
6411 
6412 	if (ioc->nioc_nsid == 0 && check->nck_ctrl_rewrite) {
6413 		ioc->nioc_nsid = NVME_NSID_BCAST;
6414 	}
6415 
6416 	/*
6417 	 * Finally, see if we have ended up with a broadcast namespace ID
6418 	 * whether through specification or rewriting. If that is not allowed,
6419 	 * then that is an error.
6420 	 */
6421 	if (!check->nck_bcast_ok && ioc->nioc_nsid == NVME_NSID_BCAST) {
6422 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_BCAST_NS, 0, 0));
6423 	}
6424 
6425 	return (nvme_ioctl_excl_check(minor, ioc, check));
6426 }
6427 
6428 static int
6429 nvme_ioctl_ctrl_info(nvme_minor_t *minor, intptr_t arg, int mode,
6430     cred_t *cred_p)
6431 {
6432 	nvme_t *const nvme = minor->nm_ctrl;
6433 	nvme_ioctl_ctrl_info_t *info;
6434 	nvme_reg_cap_t cap = { 0 };
6435 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_CTRL };
6436 	void *idbuf;
6437 
6438 	if ((mode & FREAD) == 0)
6439 		return (EBADF);
6440 
6441 	info = kmem_alloc(sizeof (nvme_ioctl_ctrl_info_t), KM_NOSLEEP_LAZY);
6442 	if (info == NULL) {
6443 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
6444 		    mode));
6445 	}
6446 
6447 	if (ddi_copyin((void *)arg, info, sizeof (nvme_ioctl_ctrl_info_t),
6448 	    mode & FKIOCTL) != 0) {
6449 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
6450 		return (EFAULT);
6451 	}
6452 
6453 	if (!nvme_ioctl_check(minor, &info->nci_common,
6454 	    &nvme_check_ctrl_info)) {
6455 		goto copyout;
6456 	}
6457 
6458 	/*
6459 	 * We explicitly do not use the identify controller copy in the kernel
6460 	 * right now so that way we can get a snapshot of the controller's
6461 	 * current capacity and values. While it's tempting to try to use this
6462 	 * to refresh the kernel's version we don't just to simplify the rest of
6463 	 * the driver right now.
6464 	 */
6465 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
6466 		info->nci_common = id.nid_common;
6467 		goto copyout;
6468 	}
6469 	bcopy(idbuf, &info->nci_ctrl_id, sizeof (nvme_identify_ctrl_t));
6470 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
6471 
6472 	/*
6473 	 * Use the kernel's cached common namespace information for this.
6474 	 */
6475 	bcopy(nvme->n_idcomns, &info->nci_common_ns,
6476 	    sizeof (nvme_identify_nsid_t));
6477 
6478 	info->nci_vers = nvme->n_version;
6479 
6480 	/*
6481 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
6482 	 * specify the base page size of 4k (1<<12), so add 12 here to
6483 	 * get the real page size value.
6484 	 */
6485 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
6486 	info->nci_caps.cap_mpsmax = 1 << (12 + cap.b.cap_mpsmax);
6487 	info->nci_caps.cap_mpsmin = 1 << (12 + cap.b.cap_mpsmin);
6488 
6489 	info->nci_nintrs = (uint32_t)nvme->n_intr_cnt;
6490 
6491 copyout:
6492 	if (ddi_copyout(info, (void *)arg, sizeof (nvme_ioctl_ctrl_info_t),
6493 	    mode & FKIOCTL) != 0) {
6494 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
6495 		return (EFAULT);
6496 	}
6497 
6498 	kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
6499 	return (0);
6500 }
6501 
6502 static int
6503 nvme_ioctl_ns_info(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6504 {
6505 	nvme_t *const nvme = minor->nm_ctrl;
6506 	nvme_ioctl_ns_info_t *ns_info;
6507 	nvme_namespace_t *ns;
6508 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_NSID };
6509 	void *idbuf;
6510 
6511 	if ((mode & FREAD) == 0)
6512 		return (EBADF);
6513 
6514 	ns_info = kmem_zalloc(sizeof (nvme_ioctl_ns_info_t), KM_NOSLEEP_LAZY);
6515 	if (ns_info == NULL) {
6516 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
6517 		    mode));
6518 	}
6519 
6520 	if (ddi_copyin((void *)arg, ns_info, sizeof (nvme_ioctl_ns_info_t),
6521 	    mode & FKIOCTL) != 0) {
6522 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
6523 		return (EFAULT);
6524 	}
6525 
6526 	if (!nvme_ioctl_check(minor, &ns_info->nni_common,
6527 	    &nvme_check_ns_info)) {
6528 		goto copyout;
6529 	}
6530 
6531 	ASSERT3U(ns_info->nni_common.nioc_nsid, >, 0);
6532 	ns = nvme_nsid2ns(nvme, ns_info->nni_common.nioc_nsid);
6533 
6534 	/*
6535 	 * First fetch a fresh copy of the namespace information. Most callers
6536 	 * are using this because they will want a mostly accurate snapshot of
6537 	 * capacity and utilization.
6538 	 */
6539 	id.nid_common.nioc_nsid = ns_info->nni_common.nioc_nsid;
6540 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
6541 		ns_info->nni_common = id.nid_common;
6542 		goto copyout;
6543 	}
6544 	bcopy(idbuf, &ns_info->nni_id, sizeof (nvme_identify_nsid_t));
6545 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
6546 
6547 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
6548 	ns_info->nni_state = ns->ns_state;
6549 	if (ns->ns_state >= NVME_NS_STATE_ATTACHED) {
6550 		const char *addr;
6551 
6552 		ns_info->nni_state = NVME_NS_STATE_ATTACHED;
6553 		addr = bd_address(ns->ns_bd_hdl);
6554 		if (strlcpy(ns_info->nni_addr, addr,
6555 		    sizeof (ns_info->nni_addr)) >= sizeof (ns_info->nni_addr)) {
6556 			nvme_mgmt_unlock(nvme);
6557 			(void) nvme_ioctl_error(&ns_info->nni_common,
6558 			    NVME_IOCTL_E_BD_ADDR_OVER, 0, 0);
6559 			goto copyout;
6560 		}
6561 	}
6562 	nvme_mgmt_unlock(nvme);
6563 
6564 copyout:
6565 	if (ddi_copyout(ns_info, (void *)arg, sizeof (nvme_ioctl_ns_info_t),
6566 	    mode & FKIOCTL) != 0) {
6567 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
6568 		return (EFAULT);
6569 	}
6570 
6571 	kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
6572 	return (0);
6573 }
6574 
6575 static int
6576 nvme_ioctl_identify(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6577 {
6578 	_NOTE(ARGUNUSED(cred_p));
6579 	nvme_t *const nvme = minor->nm_ctrl;
6580 	void *idctl;
6581 	uint_t model;
6582 	nvme_ioctl_identify_t id;
6583 #ifdef	_MULTI_DATAMODEL
6584 	nvme_ioctl_identify32_t id32;
6585 #endif
6586 	boolean_t ns_minor;
6587 
6588 	if ((mode & FREAD) == 0)
6589 		return (EBADF);
6590 
6591 	model = ddi_model_convert_from(mode);
6592 	switch (model) {
6593 #ifdef	_MULTI_DATAMODEL
6594 	case DDI_MODEL_ILP32:
6595 		bzero(&id, sizeof (id));
6596 		if (ddi_copyin((void *)arg, &id32, sizeof (id32),
6597 		    mode & FKIOCTL) != 0) {
6598 			return (EFAULT);
6599 		}
6600 		id.nid_common.nioc_nsid = id32.nid_common.nioc_nsid;
6601 		id.nid_cns = id32.nid_cns;
6602 		id.nid_ctrlid = id32.nid_ctrlid;
6603 		id.nid_data = id32.nid_data;
6604 		break;
6605 #endif	/* _MULTI_DATAMODEL */
6606 	case DDI_MODEL_NONE:
6607 		if (ddi_copyin((void *)arg, &id, sizeof (id),
6608 		    mode & FKIOCTL) != 0) {
6609 			return (EFAULT);
6610 		}
6611 		break;
6612 	default:
6613 		return (ENOTSUP);
6614 	}
6615 
6616 	if (!nvme_ioctl_check(minor, &id.nid_common, &nvme_check_identify)) {
6617 		goto copyout;
6618 	}
6619 
6620 	ns_minor = minor->nm_ns != NULL;
6621 	if (!nvme_validate_identify(nvme, &id, ns_minor)) {
6622 		goto copyout;
6623 	}
6624 
6625 	if (nvme_identify(nvme, B_TRUE, &id, &idctl)) {
6626 		int ret = ddi_copyout(idctl, (void *)id.nid_data,
6627 		    NVME_IDENTIFY_BUFSIZE, mode & FKIOCTL);
6628 		kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
6629 		if (ret != 0) {
6630 			(void) nvme_ioctl_error(&id.nid_common,
6631 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6632 			goto copyout;
6633 		}
6634 
6635 		nvme_ioctl_success(&id.nid_common);
6636 	}
6637 
6638 copyout:
6639 	switch (model) {
6640 #ifdef	_MULTI_DATAMODEL
6641 	case DDI_MODEL_ILP32:
6642 		id32.nid_common = id.nid_common;
6643 
6644 		if (ddi_copyout(&id32, (void *)arg, sizeof (id32),
6645 		    mode & FKIOCTL) != 0) {
6646 			return (EFAULT);
6647 		}
6648 		break;
6649 #endif	/* _MULTI_DATAMODEL */
6650 	case DDI_MODEL_NONE:
6651 		if (ddi_copyout(&id, (void *)arg, sizeof (id),
6652 		    mode & FKIOCTL) != 0) {
6653 			return (EFAULT);
6654 		}
6655 		break;
6656 	default:
6657 		return (ENOTSUP);
6658 	}
6659 
6660 	return (0);
6661 }
6662 
6663 /*
6664  * Execute commands on behalf of the various ioctls.
6665  *
6666  * If this returns true then the command completed successfully. Otherwise error
6667  * information is returned in the nvme_ioctl_common_t arguments.
6668  */
6669 static boolean_t
6670 nvme_ioc_cmd(nvme_t *nvme, nvme_ioctl_common_t *ioc, nvme_ioc_cmd_args_t *args)
6671 {
6672 	nvme_cmd_t *cmd;
6673 	boolean_t ret = B_FALSE;
6674 
6675 	cmd = nvme_alloc_admin_cmd(nvme, KM_SLEEP);
6676 	cmd->nc_sqid = 0;
6677 
6678 	/*
6679 	 * This function is used to facilitate requests from
6680 	 * userspace, so don't panic if the command fails. This
6681 	 * is especially true for admin passthru commands, where
6682 	 * the actual command data structure is entirely defined
6683 	 * by userspace.
6684 	 */
6685 	cmd->nc_flags |= NVME_CMD_F_DONTPANIC;
6686 
6687 	cmd->nc_callback = nvme_wakeup_cmd;
6688 	cmd->nc_sqe = *args->ica_sqe;
6689 
6690 	if ((args->ica_dma_flags & DDI_DMA_RDWR) != 0) {
6691 		if (args->ica_data == NULL) {
6692 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_DMA_MEM,
6693 			    0, 0);
6694 			goto free_cmd;
6695 		}
6696 
6697 		if (nvme_zalloc_dma(nvme, args->ica_data_len,
6698 		    args->ica_dma_flags, &nvme->n_prp_dma_attr, &cmd->nc_dma) !=
6699 		    DDI_SUCCESS) {
6700 			dev_err(nvme->n_dip, CE_WARN,
6701 			    "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
6702 			ret = nvme_ioctl_error(ioc,
6703 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
6704 			goto free_cmd;
6705 		}
6706 
6707 		if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
6708 			ret = nvme_ioctl_error(ioc,
6709 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
6710 			goto free_cmd;
6711 		}
6712 
6713 		if ((args->ica_dma_flags & DDI_DMA_WRITE) != 0 &&
6714 		    ddi_copyin(args->ica_data, cmd->nc_dma->nd_memp,
6715 		    args->ica_data_len, args->ica_copy_flags) != 0) {
6716 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA,
6717 			    0, 0);
6718 			goto free_cmd;
6719 		}
6720 	}
6721 
6722 	nvme_admin_cmd(cmd, args->ica_timeout);
6723 
6724 	if (!nvme_check_cmd_status_ioctl(cmd, ioc)) {
6725 		ret = B_FALSE;
6726 		goto free_cmd;
6727 	}
6728 
6729 	args->ica_cdw0 = cmd->nc_cqe.cqe_dw0;
6730 
6731 	if ((args->ica_dma_flags & DDI_DMA_READ) != 0 &&
6732 	    ddi_copyout(cmd->nc_dma->nd_memp, args->ica_data,
6733 	    args->ica_data_len, args->ica_copy_flags) != 0) {
6734 		ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6735 		goto free_cmd;
6736 	}
6737 
6738 	ret = B_TRUE;
6739 	nvme_ioctl_success(ioc);
6740 
6741 free_cmd:
6742 	nvme_free_cmd(cmd);
6743 
6744 	return (ret);
6745 }
6746 
6747 static int
6748 nvme_ioctl_get_logpage(nvme_minor_t *minor, intptr_t arg, int mode,
6749     cred_t *cred_p)
6750 {
6751 	nvme_t *const nvme = minor->nm_ctrl;
6752 	void *buf;
6753 	nvme_ioctl_get_logpage_t log;
6754 	uint_t model;
6755 #ifdef	_MULTI_DATAMODEL
6756 	nvme_ioctl_get_logpage32_t log32;
6757 #endif
6758 
6759 	if ((mode & FREAD) == 0) {
6760 		return (EBADF);
6761 	}
6762 
6763 	model = ddi_model_convert_from(mode);
6764 	switch (model) {
6765 #ifdef	_MULTI_DATAMODEL
6766 	case DDI_MODEL_ILP32:
6767 		bzero(&log, sizeof (log));
6768 		if (ddi_copyin((void *)arg, &log32, sizeof (log32),
6769 		    mode & FKIOCTL) != 0) {
6770 			return (EFAULT);
6771 		}
6772 
6773 		log.nigl_common.nioc_nsid = log32.nigl_common.nioc_nsid;
6774 		log.nigl_csi = log32.nigl_csi;
6775 		log.nigl_lid = log32.nigl_lid;
6776 		log.nigl_lsp = log32.nigl_lsp;
6777 		log.nigl_len = log32.nigl_len;
6778 		log.nigl_offset = log32.nigl_offset;
6779 		log.nigl_data = log32.nigl_data;
6780 		break;
6781 #endif	/* _MULTI_DATAMODEL */
6782 	case DDI_MODEL_NONE:
6783 		if (ddi_copyin((void *)arg, &log, sizeof (log),
6784 		    mode & FKIOCTL) != 0) {
6785 			return (EFAULT);
6786 		}
6787 		break;
6788 	default:
6789 		return (ENOTSUP);
6790 	}
6791 
6792 	/*
6793 	 * Eventually we'd like to do a soft lock on the namespaces from
6794 	 * changing out from us during this operation in the future. But we
6795 	 * haven't implemented that yet.
6796 	 */
6797 	if (!nvme_ioctl_check(minor, &log.nigl_common,
6798 	    &nvme_check_get_logpage)) {
6799 		goto copyout;
6800 	}
6801 
6802 	if (!nvme_validate_logpage(nvme, &log)) {
6803 		goto copyout;
6804 	}
6805 
6806 	if (nvme_get_logpage(nvme, B_TRUE, &log, &buf)) {
6807 		int copy;
6808 
6809 		copy = ddi_copyout(buf, (void *)log.nigl_data, log.nigl_len,
6810 		    mode & FKIOCTL);
6811 		kmem_free(buf, log.nigl_len);
6812 		if (copy != 0) {
6813 			(void) nvme_ioctl_error(&log.nigl_common,
6814 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6815 			goto copyout;
6816 		}
6817 
6818 		nvme_ioctl_success(&log.nigl_common);
6819 	}
6820 
6821 copyout:
6822 	switch (model) {
6823 #ifdef	_MULTI_DATAMODEL
6824 	case DDI_MODEL_ILP32:
6825 		bzero(&log32, sizeof (log32));
6826 
6827 		log32.nigl_common = log.nigl_common;
6828 		log32.nigl_csi = log.nigl_csi;
6829 		log32.nigl_lid = log.nigl_lid;
6830 		log32.nigl_lsp = log.nigl_lsp;
6831 		log32.nigl_len = log.nigl_len;
6832 		log32.nigl_offset = log.nigl_offset;
6833 		log32.nigl_data = log.nigl_data;
6834 		if (ddi_copyout(&log32, (void *)arg, sizeof (log32),
6835 		    mode & FKIOCTL) != 0) {
6836 			return (EFAULT);
6837 		}
6838 		break;
6839 #endif	/* _MULTI_DATAMODEL */
6840 	case DDI_MODEL_NONE:
6841 		if (ddi_copyout(&log, (void *)arg, sizeof (log),
6842 		    mode & FKIOCTL) != 0) {
6843 			return (EFAULT);
6844 		}
6845 		break;
6846 	default:
6847 		return (ENOTSUP);
6848 	}
6849 
6850 	return (0);
6851 }
6852 
6853 static int
6854 nvme_ioctl_get_feature(nvme_minor_t *minor, intptr_t arg, int mode,
6855     cred_t *cred_p)
6856 {
6857 	nvme_t *const nvme = minor->nm_ctrl;
6858 	nvme_ioctl_get_feature_t feat;
6859 	uint_t model;
6860 #ifdef	_MULTI_DATAMODEL
6861 	nvme_ioctl_get_feature32_t feat32;
6862 #endif
6863 	nvme_get_features_dw10_t gf_dw10 = { 0 };
6864 	nvme_ioc_cmd_args_t args = { NULL };
6865 	nvme_sqe_t sqe = {
6866 	    .sqe_opc	= NVME_OPC_GET_FEATURES
6867 	};
6868 
6869 	if ((mode & FREAD) == 0) {
6870 		return (EBADF);
6871 	}
6872 
6873 	model = ddi_model_convert_from(mode);
6874 	switch (model) {
6875 #ifdef	_MULTI_DATAMODEL
6876 	case DDI_MODEL_ILP32:
6877 		bzero(&feat, sizeof (feat));
6878 		if (ddi_copyin((void *)arg, &feat32, sizeof (feat32),
6879 		    mode & FKIOCTL) != 0) {
6880 			return (EFAULT);
6881 		}
6882 
6883 		feat.nigf_common.nioc_nsid = feat32.nigf_common.nioc_nsid;
6884 		feat.nigf_fid = feat32.nigf_fid;
6885 		feat.nigf_sel = feat32.nigf_sel;
6886 		feat.nigf_cdw11 = feat32.nigf_cdw11;
6887 		feat.nigf_data = feat32.nigf_data;
6888 		feat.nigf_len = feat32.nigf_len;
6889 		break;
6890 #endif	/* _MULTI_DATAMODEL */
6891 	case DDI_MODEL_NONE:
6892 		if (ddi_copyin((void *)arg, &feat, sizeof (feat),
6893 		    mode & FKIOCTL) != 0) {
6894 			return (EFAULT);
6895 		}
6896 		break;
6897 	default:
6898 		return (ENOTSUP);
6899 	}
6900 
6901 	if (!nvme_ioctl_check(minor, &feat.nigf_common,
6902 	    &nvme_check_get_feature)) {
6903 		goto copyout;
6904 	}
6905 
6906 	if (!nvme_validate_get_feature(nvme, &feat)) {
6907 		goto copyout;
6908 	}
6909 
6910 	gf_dw10.b.gt_fid = bitx32(feat.nigf_fid, 7, 0);
6911 	gf_dw10.b.gt_sel = bitx32(feat.nigf_sel, 2, 0);
6912 	sqe.sqe_cdw10 = gf_dw10.r;
6913 	sqe.sqe_cdw11 = feat.nigf_cdw11;
6914 	sqe.sqe_nsid = feat.nigf_common.nioc_nsid;
6915 
6916 	args.ica_sqe = &sqe;
6917 	if (feat.nigf_len != 0) {
6918 		args.ica_data = (void *)feat.nigf_data;
6919 		args.ica_data_len = feat.nigf_len;
6920 		args.ica_dma_flags = DDI_DMA_READ;
6921 	}
6922 	args.ica_copy_flags = mode;
6923 	args.ica_timeout = nvme_admin_cmd_timeout;
6924 
6925 	if (!nvme_ioc_cmd(nvme, &feat.nigf_common, &args)) {
6926 		goto copyout;
6927 	}
6928 
6929 	feat.nigf_cdw0 = args.ica_cdw0;
6930 
6931 copyout:
6932 	switch (model) {
6933 #ifdef	_MULTI_DATAMODEL
6934 	case DDI_MODEL_ILP32:
6935 		bzero(&feat32, sizeof (feat32));
6936 
6937 		feat32.nigf_common = feat.nigf_common;
6938 		feat32.nigf_fid = feat.nigf_fid;
6939 		feat32.nigf_sel = feat.nigf_sel;
6940 		feat32.nigf_cdw11 = feat.nigf_cdw11;
6941 		feat32.nigf_data = feat.nigf_data;
6942 		feat32.nigf_len = feat.nigf_len;
6943 		feat32.nigf_cdw0 = feat.nigf_cdw0;
6944 		if (ddi_copyout(&feat32, (void *)arg, sizeof (feat32),
6945 		    mode & FKIOCTL) != 0) {
6946 			return (EFAULT);
6947 		}
6948 		break;
6949 #endif	/* _MULTI_DATAMODEL */
6950 	case DDI_MODEL_NONE:
6951 		if (ddi_copyout(&feat, (void *)arg, sizeof (feat),
6952 		    mode & FKIOCTL) != 0) {
6953 			return (EFAULT);
6954 		}
6955 		break;
6956 	default:
6957 		return (ENOTSUP);
6958 	}
6959 
6960 	return (0);
6961 }
6962 
6963 static int
6964 nvme_ioctl_format(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6965 {
6966 	nvme_t *const nvme = minor->nm_ctrl;
6967 	nvme_ioctl_format_t ioc;
6968 
6969 	if ((mode & FWRITE) == 0)
6970 		return (EBADF);
6971 
6972 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6973 		return (EPERM);
6974 
6975 	if (ddi_copyin((void *)(uintptr_t)arg, &ioc,
6976 	    sizeof (nvme_ioctl_format_t), mode & FKIOCTL) != 0)
6977 		return (EFAULT);
6978 
6979 	if (!nvme_ioctl_check(minor, &ioc.nif_common, &nvme_check_format)) {
6980 		goto copyout;
6981 	}
6982 
6983 	if (!nvme_validate_format(nvme, &ioc)) {
6984 		goto copyout;
6985 	}
6986 
6987 	/*
6988 	 * The broadcast namespace can format all namespaces attached to the
6989 	 * controller, meaning active namespaces. However, a targeted format can
6990 	 * impact any allocated namespace, even one not attached. As such, we
6991 	 * need different checks for each situation.
6992 	 */
6993 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
6994 	if (ioc.nif_common.nioc_nsid == NVME_NSID_BCAST) {
6995 		if (!nvme_no_blkdev_attached(nvme, ioc.nif_common.nioc_nsid)) {
6996 			nvme_mgmt_unlock(nvme);
6997 			(void) nvme_ioctl_error(&ioc.nif_common,
6998 			    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6999 			goto copyout;
7000 		}
7001 	} else {
7002 		nvme_namespace_t *ns = nvme_nsid2ns(nvme,
7003 		    ioc.nif_common.nioc_nsid);
7004 
7005 		if (!nvme_ns_state_check(ns, &ioc.nif_common,
7006 		    nvme_format_nvm_states)) {
7007 			nvme_mgmt_unlock(nvme);
7008 			goto copyout;
7009 		}
7010 	}
7011 
7012 	if (nvme_format_nvm(nvme, &ioc)) {
7013 		nvme_ioctl_success(&ioc.nif_common);
7014 		nvme_rescan_ns(nvme, ioc.nif_common.nioc_nsid);
7015 	}
7016 	nvme_mgmt_unlock(nvme);
7017 
7018 copyout:
7019 	if (ddi_copyout(&ioc, (void *)(uintptr_t)arg, sizeof (ioc),
7020 	    mode & FKIOCTL) != 0) {
7021 		return (EFAULT);
7022 	}
7023 
7024 	return (0);
7025 }
7026 
7027 static int
7028 nvme_ioctl_bd_detach(nvme_minor_t *minor, intptr_t arg, int mode,
7029     cred_t *cred_p)
7030 {
7031 	nvme_t *const nvme = minor->nm_ctrl;
7032 	nvme_ioctl_common_t com;
7033 
7034 	if ((mode & FWRITE) == 0)
7035 		return (EBADF);
7036 
7037 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7038 		return (EPERM);
7039 
7040 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
7041 	    mode & FKIOCTL) != 0) {
7042 		return (EFAULT);
7043 	}
7044 
7045 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
7046 		goto copyout;
7047 	}
7048 
7049 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7050 	if (nvme_bd_detach_ns(nvme, &com)) {
7051 		nvme_ioctl_success(&com);
7052 	}
7053 	nvme_mgmt_unlock(nvme);
7054 
7055 copyout:
7056 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
7057 	    mode & FKIOCTL) != 0) {
7058 		return (EFAULT);
7059 	}
7060 
7061 	return (0);
7062 }
7063 
7064 static int
7065 nvme_ioctl_bd_attach(nvme_minor_t *minor, intptr_t arg, int mode,
7066     cred_t *cred_p)
7067 {
7068 	nvme_t *const nvme = minor->nm_ctrl;
7069 	nvme_ioctl_common_t com;
7070 	nvme_namespace_t *ns;
7071 
7072 	if ((mode & FWRITE) == 0)
7073 		return (EBADF);
7074 
7075 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7076 		return (EPERM);
7077 
7078 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
7079 	    mode & FKIOCTL) != 0) {
7080 		return (EFAULT);
7081 	}
7082 
7083 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
7084 		goto copyout;
7085 	}
7086 
7087 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7088 	ns = nvme_nsid2ns(nvme, com.nioc_nsid);
7089 
7090 	/*
7091 	 * Strictly speaking we shouldn't need to call nvme_init_ns() here as
7092 	 * we should be properly refreshing the internal state when we are
7093 	 * issuing commands that change things. However, we opt to still do so
7094 	 * as a bit of a safety check lest we give the kernel something bad or a
7095 	 * vendor unique command somehow did something behind our backs.
7096 	 */
7097 	if (ns->ns_state < NVME_NS_STATE_ATTACHED) {
7098 		nvme_rescan_ns(nvme, com.nioc_nsid);
7099 	}
7100 
7101 	if (nvme_bd_attach_ns(nvme, &com)) {
7102 		nvme_ioctl_success(&com);
7103 	}
7104 	nvme_mgmt_unlock(nvme);
7105 
7106 copyout:
7107 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
7108 	    mode & FKIOCTL) != 0) {
7109 		return (EFAULT);
7110 	}
7111 
7112 	return (0);
7113 }
7114 
7115 /*
7116  * Attach or detach a controller from the specified namespace. While this in
7117  * theory allows for multiple controllers to be specified, currently we only
7118  * support using the controller that we've issued this ioctl on. In the future
7119  * when we have better ways to test dual-attached controllers then this should
7120  * be extended to take the controller list from userland.
7121  */
7122 static boolean_t
7123 nvme_ctrl_attach_detach_ns(nvme_t *nvme, nvme_namespace_t *ns,
7124     nvme_ioctl_common_t *ioc, boolean_t attach)
7125 {
7126 	nvme_ioc_cmd_args_t args = { NULL };
7127 	nvme_sqe_t sqe;
7128 	nvme_ns_mgmt_dw10_t dw10;
7129 	uint16_t ctrlids[2];
7130 
7131 	ASSERT(nvme_mgmt_lock_held(nvme));
7132 
7133 	bzero(&sqe, sizeof (sqe));
7134 	sqe.sqe_nsid = ioc->nioc_nsid;
7135 	sqe.sqe_opc = NVME_OPC_NS_ATTACH;
7136 
7137 	dw10.r = 0;
7138 	dw10.b.nsm_sel = attach ? NVME_NS_ATTACH_CTRL_ATTACH :
7139 	    NVME_NS_ATTACH_CTRL_DETACH;
7140 	sqe.sqe_cdw10 = dw10.r;
7141 
7142 	/*
7143 	 * As we only support sending our current controller's id along, we can
7144 	 * simplify this and don't need both allocating a full
7145 	 * nvme_identify_ctrl_list_t for two items.
7146 	 */
7147 	ctrlids[0] = 1;
7148 	ctrlids[1] = nvme->n_idctl->id_cntlid;
7149 
7150 	args.ica_sqe = &sqe;
7151 	args.ica_data = ctrlids;
7152 	args.ica_data_len = sizeof (ctrlids);
7153 	args.ica_dma_flags = DDI_DMA_WRITE;
7154 	args.ica_copy_flags = FKIOCTL;
7155 	args.ica_timeout = nvme_admin_cmd_timeout;
7156 
7157 	return (nvme_ioc_cmd(nvme, ioc, &args));
7158 }
7159 
7160 static int
7161 nvme_ioctl_ctrl_detach(nvme_minor_t *minor, intptr_t arg, int mode,
7162     cred_t *cred_p)
7163 {
7164 	nvme_t *const nvme = minor->nm_ctrl;
7165 	nvme_ioctl_common_t com;
7166 	nvme_namespace_t *ns;
7167 
7168 	if ((mode & FWRITE) == 0)
7169 		return (EBADF);
7170 
7171 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7172 		return (EPERM);
7173 
7174 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
7175 	    mode & FKIOCTL) != 0) {
7176 		return (EFAULT);
7177 	}
7178 
7179 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
7180 		goto copyout;
7181 	}
7182 
7183 	if (!nvme_validate_ctrl_attach_detach_ns(nvme, &com)) {
7184 		goto copyout;
7185 	}
7186 
7187 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7188 	ns = nvme_nsid2ns(nvme, com.nioc_nsid);
7189 
7190 	if (nvme_ns_state_check(ns, &com, nvme_ctrl_detach_states)) {
7191 		if (nvme_ctrl_attach_detach_ns(nvme, ns, &com, B_FALSE)) {
7192 			nvme_rescan_ns(nvme, com.nioc_nsid);
7193 			nvme_ioctl_success(&com);
7194 		}
7195 	}
7196 	nvme_mgmt_unlock(nvme);
7197 
7198 copyout:
7199 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
7200 	    mode & FKIOCTL) != 0) {
7201 		return (EFAULT);
7202 	}
7203 
7204 	return (0);
7205 }
7206 
7207 static int
7208 nvme_ioctl_ns_create(nvme_minor_t *minor, intptr_t arg, int mode,
7209     cred_t *cred_p)
7210 {
7211 	nvme_t *const nvme = minor->nm_ctrl;
7212 	nvme_ioctl_ns_create_t create;
7213 
7214 	if ((mode & FWRITE) == 0)
7215 		return (EBADF);
7216 
7217 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7218 		return (EPERM);
7219 
7220 	if (ddi_copyin((void *)(uintptr_t)arg, &create, sizeof (create),
7221 	    mode & FKIOCTL) != 0) {
7222 		return (EFAULT);
7223 	}
7224 
7225 	if (!nvme_ioctl_check(minor, &create.nnc_common,
7226 	    &nvme_check_ns_create)) {
7227 		goto copyout;
7228 	}
7229 
7230 	if (!nvme_validate_ns_create(nvme, &create)) {
7231 		goto copyout;
7232 	}
7233 
7234 	/*
7235 	 * Now that we've validated this, proceed to build up the actual data
7236 	 * request. We need to fill out the relevant identify namespace data
7237 	 * structure fields.
7238 	 */
7239 	nvme_identify_nsid_t *idns = kmem_zalloc(sizeof (nvme_identify_nsid_t),
7240 	    KM_NOSLEEP_LAZY);
7241 	if (idns == NULL) {
7242 		(void) nvme_ioctl_error(&create.nnc_common,
7243 		    NVME_IOCTL_E_NO_KERN_MEM, 0, 0);
7244 		goto copyout;
7245 	}
7246 
7247 	idns->id_nsize = create.nnc_nsze;
7248 	idns->id_ncap = create.nnc_ncap;
7249 	idns->id_flbas.lba_format = create.nnc_flbas;
7250 	idns->id_nmic.nm_shared = bitx32(create.nnc_nmic, 0, 0);
7251 
7252 	nvme_ioc_cmd_args_t args = { NULL };
7253 	nvme_sqe_t sqe;
7254 	nvme_ns_mgmt_dw10_t dw10;
7255 	nvme_ns_mgmt_dw11_t dw11;
7256 
7257 	bzero(&sqe, sizeof (sqe));
7258 	sqe.sqe_nsid = create.nnc_common.nioc_nsid;
7259 	sqe.sqe_opc = NVME_OPC_NS_MGMT;
7260 
7261 	dw10.r = 0;
7262 	dw10.b.nsm_sel = NVME_NS_MGMT_NS_CREATE;
7263 	sqe.sqe_cdw10 = dw10.r;
7264 
7265 	dw11.r = 0;
7266 	dw11.b.nsm_csi = create.nnc_csi;
7267 	sqe.sqe_cdw11 = dw11.r;
7268 
7269 	args.ica_sqe = &sqe;
7270 	args.ica_data = idns;
7271 	args.ica_data_len = sizeof (nvme_identify_nsid_t);
7272 	args.ica_dma_flags = DDI_DMA_WRITE;
7273 	args.ica_copy_flags = FKIOCTL;
7274 	args.ica_timeout = nvme_format_cmd_timeout;
7275 
7276 	/*
7277 	 * This command manipulates our understanding of a namespace's state.
7278 	 * While we don't need to check anything before we proceed, we still
7279 	 * logically require the lock.
7280 	 */
7281 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7282 	if (nvme_ioc_cmd(nvme, &create.nnc_common, &args)) {
7283 		create.nnc_nsid = args.ica_cdw0;
7284 		nvme_rescan_ns(nvme, create.nnc_nsid);
7285 		nvme_ioctl_success(&create.nnc_common);
7286 	}
7287 	nvme_mgmt_unlock(nvme);
7288 	kmem_free(idns, sizeof (nvme_identify_nsid_t));
7289 
7290 copyout:
7291 	if (ddi_copyout(&create, (void *)(uintptr_t)arg, sizeof (create),
7292 	    mode & FKIOCTL) != 0) {
7293 		return (EFAULT);
7294 	}
7295 
7296 	return (0);
7297 
7298 }
7299 
7300 static int
7301 nvme_ioctl_ns_delete(nvme_minor_t *minor, intptr_t arg, int mode,
7302     cred_t *cred_p)
7303 {
7304 	nvme_t *const nvme = minor->nm_ctrl;
7305 	nvme_ioctl_common_t com;
7306 
7307 	if ((mode & FWRITE) == 0)
7308 		return (EBADF);
7309 
7310 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7311 		return (EPERM);
7312 
7313 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
7314 	    mode & FKIOCTL) != 0) {
7315 		return (EFAULT);
7316 	}
7317 
7318 	if (!nvme_ioctl_check(minor, &com, &nvme_check_ns_delete)) {
7319 		goto copyout;
7320 	}
7321 
7322 	if (!nvme_validate_ns_delete(nvme, &com)) {
7323 		goto copyout;
7324 	}
7325 
7326 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7327 	if (com.nioc_nsid == NVME_NSID_BCAST) {
7328 		if (!nvme_no_blkdev_attached(nvme, com.nioc_nsid)) {
7329 			nvme_mgmt_unlock(nvme);
7330 			(void) nvme_ioctl_error(&com,
7331 			    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
7332 			goto copyout;
7333 		}
7334 	} else {
7335 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, com.nioc_nsid);
7336 
7337 		if (!nvme_ns_state_check(ns, &com, nvme_ns_delete_states)) {
7338 			nvme_mgmt_unlock(nvme);
7339 			goto copyout;
7340 		}
7341 	}
7342 
7343 	nvme_ioc_cmd_args_t args = { NULL };
7344 	nvme_sqe_t sqe;
7345 	nvme_ns_mgmt_dw10_t dw10;
7346 
7347 	bzero(&sqe, sizeof (sqe));
7348 	sqe.sqe_nsid = com.nioc_nsid;
7349 	sqe.sqe_opc = NVME_OPC_NS_MGMT;
7350 
7351 	dw10.r = 0;
7352 	dw10.b.nsm_sel = NVME_NS_MGMT_NS_DELETE;
7353 	sqe.sqe_cdw10 = dw10.r;
7354 
7355 	args.ica_sqe = &sqe;
7356 	args.ica_data = NULL;
7357 	args.ica_data_len = 0;
7358 	args.ica_dma_flags = 0;
7359 	args.ica_copy_flags = 0;
7360 	args.ica_timeout = nvme_format_cmd_timeout;
7361 
7362 	if (nvme_ioc_cmd(nvme, &com, &args)) {
7363 		nvme_rescan_ns(nvme, com.nioc_nsid);
7364 		nvme_ioctl_success(&com);
7365 	}
7366 	nvme_mgmt_unlock(nvme);
7367 
7368 copyout:
7369 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
7370 	    mode & FKIOCTL) != 0) {
7371 		return (EFAULT);
7372 	}
7373 
7374 	return (0);
7375 }
7376 
7377 static int
7378 nvme_ioctl_ctrl_attach(nvme_minor_t *minor, intptr_t arg, int mode,
7379     cred_t *cred_p)
7380 {
7381 	nvme_t *const nvme = minor->nm_ctrl;
7382 	nvme_ioctl_common_t com;
7383 	nvme_namespace_t *ns;
7384 
7385 	if ((mode & FWRITE) == 0)
7386 		return (EBADF);
7387 
7388 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7389 		return (EPERM);
7390 
7391 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
7392 	    mode & FKIOCTL) != 0) {
7393 		return (EFAULT);
7394 	}
7395 
7396 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
7397 		goto copyout;
7398 	}
7399 
7400 	if (!nvme_validate_ctrl_attach_detach_ns(nvme, &com)) {
7401 		goto copyout;
7402 	}
7403 
7404 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7405 	ns = nvme_nsid2ns(nvme, com.nioc_nsid);
7406 
7407 	if (nvme_ns_state_check(ns, &com, nvme_ctrl_attach_states)) {
7408 		if (nvme_ctrl_attach_detach_ns(nvme, ns, &com, B_TRUE)) {
7409 			nvme_rescan_ns(nvme, com.nioc_nsid);
7410 			nvme_ioctl_success(&com);
7411 		}
7412 	}
7413 	nvme_mgmt_unlock(nvme);
7414 
7415 copyout:
7416 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
7417 	    mode & FKIOCTL) != 0) {
7418 		return (EFAULT);
7419 	}
7420 
7421 	return (0);
7422 }
7423 
7424 static void
7425 nvme_ufm_update(nvme_t *nvme)
7426 {
7427 	mutex_enter(&nvme->n_fwslot_mutex);
7428 	ddi_ufm_update(nvme->n_ufmh);
7429 	if (nvme->n_fwslot != NULL) {
7430 		kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
7431 		nvme->n_fwslot = NULL;
7432 	}
7433 	mutex_exit(&nvme->n_fwslot_mutex);
7434 }
7435 
7436 /*
7437  * Download new firmware to the device's internal staging area. We do not call
7438  * nvme_ufm_update() here because after a firmware download, there has been no
7439  * change to any of the actual persistent firmware data. That requires a
7440  * subsequent ioctl (NVME_IOC_FIRMWARE_COMMIT) to commit the firmware to a slot
7441  * or to activate a slot.
7442  */
7443 static int
7444 nvme_ioctl_firmware_download(nvme_minor_t *minor, intptr_t arg, int mode,
7445     cred_t *cred_p)
7446 {
7447 	nvme_t *const nvme = minor->nm_ctrl;
7448 	nvme_ioctl_fw_load_t fw;
7449 	uint64_t len, maxcopy;
7450 	offset_t offset;
7451 	uint32_t gran;
7452 	nvme_valid_ctrl_data_t data;
7453 	uintptr_t buf;
7454 	nvme_sqe_t sqe = {
7455 	    .sqe_opc	= NVME_OPC_FW_IMAGE_LOAD
7456 	};
7457 
7458 	if ((mode & FWRITE) == 0)
7459 		return (EBADF);
7460 
7461 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7462 		return (EPERM);
7463 
7464 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
7465 	    mode & FKIOCTL) != 0) {
7466 		return (EFAULT);
7467 	}
7468 
7469 	if (!nvme_ioctl_check(minor, &fw.fwl_common, &nvme_check_firmware)) {
7470 		goto copyout;
7471 	}
7472 
7473 	if (!nvme_validate_fw_load(nvme, &fw)) {
7474 		goto copyout;
7475 	}
7476 
7477 	len = fw.fwl_len;
7478 	offset = fw.fwl_off;
7479 	buf = fw.fwl_buf;
7480 
7481 	/*
7482 	 * We need to determine the minimum and maximum amount of data that we
7483 	 * will send to the device in a given go. Starting in NMVe 1.3 this must
7484 	 * be a multiple of the firmware update granularity (FWUG), but must not
7485 	 * exceed the maximum data transfer that we've set. Many devices don't
7486 	 * report something here, which means we'll end up getting our default
7487 	 * value. Our policy is a little simple, but it's basically if the
7488 	 * maximum data transfer is evenly divided by the granularity, then use
7489 	 * it. Otherwise we use the granularity itself. The granularity is
7490 	 * always in page sized units, so trying to find another optimum point
7491 	 * isn't worth it. If we encounter a contradiction, then we will have to
7492 	 * error out.
7493 	 */
7494 	data.vcd_vers = &nvme->n_version;
7495 	data.vcd_id = nvme->n_idctl;
7496 	gran = nvme_fw_load_granularity(&data);
7497 
7498 	if ((nvme->n_max_data_transfer_size % gran) == 0) {
7499 		maxcopy = nvme->n_max_data_transfer_size;
7500 	} else if (gran <= nvme->n_max_data_transfer_size) {
7501 		maxcopy = gran;
7502 	} else {
7503 		(void) nvme_ioctl_error(&fw.fwl_common,
7504 		    NVME_IOCTL_E_FW_LOAD_IMPOS_GRAN, 0, 0);
7505 		goto copyout;
7506 	}
7507 
7508 	while (len > 0) {
7509 		nvme_ioc_cmd_args_t args = { NULL };
7510 		uint64_t copylen = MIN(maxcopy, len);
7511 
7512 		sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
7513 		sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
7514 
7515 		args.ica_sqe = &sqe;
7516 		args.ica_data = (void *)buf;
7517 		args.ica_data_len = copylen;
7518 		args.ica_dma_flags = DDI_DMA_WRITE;
7519 		args.ica_copy_flags = mode;
7520 		args.ica_timeout = nvme_admin_cmd_timeout;
7521 
7522 		if (!nvme_ioc_cmd(nvme, &fw.fwl_common, &args)) {
7523 			break;
7524 		}
7525 
7526 		buf += copylen;
7527 		offset += copylen;
7528 		len -= copylen;
7529 	}
7530 
7531 copyout:
7532 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
7533 	    mode & FKIOCTL) != 0) {
7534 		return (EFAULT);
7535 	}
7536 
7537 	return (0);
7538 }
7539 
7540 static int
7541 nvme_ioctl_firmware_commit(nvme_minor_t *minor, intptr_t arg, int mode,
7542     cred_t *cred_p)
7543 {
7544 	nvme_t *const nvme = minor->nm_ctrl;
7545 	nvme_ioctl_fw_commit_t fw;
7546 	nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
7547 	nvme_ioc_cmd_args_t args = { NULL };
7548 	nvme_sqe_t sqe = {
7549 	    .sqe_opc	= NVME_OPC_FW_ACTIVATE
7550 	};
7551 
7552 	if ((mode & FWRITE) == 0)
7553 		return (EBADF);
7554 
7555 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7556 		return (EPERM);
7557 
7558 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
7559 	    mode & FKIOCTL) != 0) {
7560 		return (EFAULT);
7561 	}
7562 
7563 	if (!nvme_ioctl_check(minor, &fw.fwc_common, &nvme_check_firmware)) {
7564 		goto copyout;
7565 	}
7566 
7567 	if (!nvme_validate_fw_commit(nvme, &fw)) {
7568 		goto copyout;
7569 	}
7570 
7571 	fc_dw10.b.fc_slot = fw.fwc_slot;
7572 	fc_dw10.b.fc_action = fw.fwc_action;
7573 	sqe.sqe_cdw10 = fc_dw10.r;
7574 
7575 	args.ica_sqe = &sqe;
7576 	args.ica_timeout = nvme_commit_save_cmd_timeout;
7577 
7578 	/*
7579 	 * There are no conditional actions to take based on this succeeding or
7580 	 * failing. A failure is recorded in the ioctl structure returned to the
7581 	 * user.
7582 	 */
7583 	(void) nvme_ioc_cmd(nvme, &fw.fwc_common, &args);
7584 
7585 	/*
7586 	 * Let the DDI UFM subsystem know that the firmware information for
7587 	 * this device has changed. We perform this unconditionally as an
7588 	 * invalidation doesn't particularly hurt us.
7589 	 */
7590 	nvme_ufm_update(nvme);
7591 
7592 copyout:
7593 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
7594 	    mode & FKIOCTL) != 0) {
7595 		return (EFAULT);
7596 	}
7597 
7598 	return (0);
7599 }
7600 
7601 /*
7602  * Helper to copy in a passthru command from userspace, handling
7603  * different data models.
7604  */
7605 static int
7606 nvme_passthru_copyin_cmd(const void *buf, nvme_ioctl_passthru_t *cmd, int mode)
7607 {
7608 	switch (ddi_model_convert_from(mode & FMODELS)) {
7609 #ifdef _MULTI_DATAMODEL
7610 	case DDI_MODEL_ILP32: {
7611 		nvme_ioctl_passthru32_t cmd32;
7612 
7613 		if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0)
7614 			return (EFAULT);
7615 
7616 		bzero(cmd, sizeof (nvme_ioctl_passthru_t));
7617 
7618 		cmd->npc_common.nioc_nsid = cmd32.npc_common.nioc_nsid;
7619 		cmd->npc_opcode = cmd32.npc_opcode;
7620 		cmd->npc_timeout = cmd32.npc_timeout;
7621 		cmd->npc_flags = cmd32.npc_flags;
7622 		cmd->npc_impact = cmd32.npc_impact;
7623 		cmd->npc_cdw12 = cmd32.npc_cdw12;
7624 		cmd->npc_cdw13 = cmd32.npc_cdw13;
7625 		cmd->npc_cdw14 = cmd32.npc_cdw14;
7626 		cmd->npc_cdw15 = cmd32.npc_cdw15;
7627 		cmd->npc_buflen = cmd32.npc_buflen;
7628 		cmd->npc_buf = cmd32.npc_buf;
7629 		break;
7630 	}
7631 #endif	/* _MULTI_DATAMODEL */
7632 	case DDI_MODEL_NONE:
7633 		if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_ioctl_passthru_t),
7634 		    mode) != 0) {
7635 			return (EFAULT);
7636 		}
7637 		break;
7638 	default:
7639 		return (ENOTSUP);
7640 	}
7641 
7642 	return (0);
7643 }
7644 
7645 /*
7646  * Helper to copy out a passthru command result to userspace, handling
7647  * different data models.
7648  */
7649 static int
7650 nvme_passthru_copyout_cmd(const nvme_ioctl_passthru_t *cmd, void *buf, int mode)
7651 {
7652 	switch (ddi_model_convert_from(mode & FMODELS)) {
7653 #ifdef _MULTI_DATAMODEL
7654 	case DDI_MODEL_ILP32: {
7655 		nvme_ioctl_passthru32_t cmd32;
7656 
7657 		bzero(&cmd32, sizeof (nvme_ioctl_passthru32_t));
7658 
7659 		cmd32.npc_common = cmd->npc_common;
7660 		cmd32.npc_opcode = cmd->npc_opcode;
7661 		cmd32.npc_timeout = cmd->npc_timeout;
7662 		cmd32.npc_flags = cmd->npc_flags;
7663 		cmd32.npc_impact = cmd->npc_impact;
7664 		cmd32.npc_cdw0 = cmd->npc_cdw0;
7665 		cmd32.npc_cdw12 = cmd->npc_cdw12;
7666 		cmd32.npc_cdw13 = cmd->npc_cdw13;
7667 		cmd32.npc_cdw14 = cmd->npc_cdw14;
7668 		cmd32.npc_cdw15 = cmd->npc_cdw15;
7669 		cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
7670 		cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;
7671 		if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
7672 			return (EFAULT);
7673 		break;
7674 	}
7675 #endif	/* _MULTI_DATAMODEL */
7676 	case DDI_MODEL_NONE:
7677 		if (ddi_copyout(cmd, buf, sizeof (nvme_ioctl_passthru_t),
7678 		    mode) != 0) {
7679 			return (EFAULT);
7680 		}
7681 		break;
7682 	default:
7683 		return (ENOTSUP);
7684 	}
7685 	return (0);
7686 }
7687 
7688 /*
7689  * Run an arbitrary vendor-specific admin command on the device.
7690  */
7691 static int
7692 nvme_ioctl_passthru(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
7693 {
7694 	nvme_t *const nvme = minor->nm_ctrl;
7695 	int rv;
7696 	nvme_ioctl_passthru_t pass;
7697 	nvme_sqe_t sqe;
7698 	nvme_ioc_cmd_args_t args = { NULL };
7699 
7700 	/*
7701 	 * Basic checks: permissions, data model, argument size.
7702 	 */
7703 	if ((mode & FWRITE) == 0)
7704 		return (EBADF);
7705 
7706 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7707 		return (EPERM);
7708 
7709 	if ((rv = nvme_passthru_copyin_cmd((void *)(uintptr_t)arg, &pass,
7710 	    mode)) != 0) {
7711 		return (rv);
7712 	}
7713 
7714 	if (!nvme_ioctl_check(minor, &pass.npc_common, &nvme_check_passthru)) {
7715 		goto copyout;
7716 	}
7717 
7718 	if (!nvme_validate_vuc(nvme, &pass)) {
7719 		goto copyout;
7720 	}
7721 
7722 	nvme_mgmt_lock(nvme, NVME_MGMT_LOCK_NVME);
7723 	if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
7724 		/*
7725 		 * We've been told this has ns impact. Right now force that to
7726 		 * be every ns until we have more use cases and reason to trust
7727 		 * the nsid field.
7728 		 */
7729 		if (!nvme_no_blkdev_attached(nvme, NVME_NSID_BCAST)) {
7730 			nvme_mgmt_unlock(nvme);
7731 			(void) nvme_ioctl_error(&pass.npc_common,
7732 			    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
7733 			goto copyout;
7734 		}
7735 	}
7736 
7737 	bzero(&sqe, sizeof (sqe));
7738 
7739 	sqe.sqe_opc = pass.npc_opcode;
7740 	sqe.sqe_nsid = pass.npc_common.nioc_nsid;
7741 	sqe.sqe_cdw10 = (uint32_t)(pass.npc_buflen >> NVME_DWORD_SHIFT);
7742 	sqe.sqe_cdw12 = pass.npc_cdw12;
7743 	sqe.sqe_cdw13 = pass.npc_cdw13;
7744 	sqe.sqe_cdw14 = pass.npc_cdw14;
7745 	sqe.sqe_cdw15 = pass.npc_cdw15;
7746 
7747 	args.ica_sqe = &sqe;
7748 	args.ica_data = (void *)pass.npc_buf;
7749 	args.ica_data_len = pass.npc_buflen;
7750 	args.ica_copy_flags = mode;
7751 	args.ica_timeout = pass.npc_timeout;
7752 
7753 	if ((pass.npc_flags & NVME_PASSTHRU_READ) != 0)
7754 		args.ica_dma_flags |= DDI_DMA_READ;
7755 	else if ((pass.npc_flags & NVME_PASSTHRU_WRITE) != 0)
7756 		args.ica_dma_flags |= DDI_DMA_WRITE;
7757 
7758 	if (nvme_ioc_cmd(nvme, &pass.npc_common, &args)) {
7759 		pass.npc_cdw0 = args.ica_cdw0;
7760 		if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
7761 			nvme_rescan_ns(nvme, NVME_NSID_BCAST);
7762 		}
7763 	}
7764 	nvme_mgmt_unlock(nvme);
7765 
7766 copyout:
7767 	rv = nvme_passthru_copyout_cmd(&pass, (void *)(uintptr_t)arg,
7768 	    mode);
7769 
7770 	return (rv);
7771 }
7772 
7773 static int
7774 nvme_ioctl_lock(nvme_minor_t *minor, intptr_t arg, int mode,
7775     cred_t *cred_p)
7776 {
7777 	nvme_ioctl_lock_t lock;
7778 	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;
7779 	nvme_t *nvme = minor->nm_ctrl;
7780 
7781 	if ((mode & FWRITE) == 0)
7782 		return (EBADF);
7783 
7784 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
7785 		return (EPERM);
7786 
7787 	if (ddi_copyin((void *)(uintptr_t)arg, &lock, sizeof (lock),
7788 	    mode & FKIOCTL) != 0) {
7789 		return (EFAULT);
7790 	}
7791 
7792 	if (lock.nil_ent != NVME_LOCK_E_CTRL &&
7793 	    lock.nil_ent != NVME_LOCK_E_NS) {
7794 		(void) nvme_ioctl_error(&lock.nil_common,
7795 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
7796 		goto copyout;
7797 	}
7798 
7799 	if (lock.nil_level != NVME_LOCK_L_READ &&
7800 	    lock.nil_level != NVME_LOCK_L_WRITE) {
7801 		(void) nvme_ioctl_error(&lock.nil_common,
7802 		    NVME_IOCTL_E_BAD_LOCK_LEVEL, 0, 0);
7803 		goto copyout;
7804 	}
7805 
7806 	if ((lock.nil_flags & ~all_flags) != 0) {
7807 		(void) nvme_ioctl_error(&lock.nil_common,
7808 		    NVME_IOCTL_E_BAD_LOCK_FLAGS, 0, 0);
7809 		goto copyout;
7810 	}
7811 
7812 	if (!nvme_ioctl_check(minor, &lock.nil_common, &nvme_check_locking)) {
7813 		goto copyout;
7814 	}
7815 
7816 	/*
7817 	 * If we're on a namespace, confirm that we're not asking for the
7818 	 * controller.
7819 	 */
7820 	if (lock.nil_common.nioc_nsid != 0 &&
7821 	    lock.nil_ent == NVME_LOCK_E_CTRL) {
7822 		(void) nvme_ioctl_error(&lock.nil_common,
7823 		    NVME_IOCTL_E_NS_CANNOT_LOCK_CTRL, 0, 0);
7824 		goto copyout;
7825 	}
7826 
7827 	/*
7828 	 * We've reached the point where we can no longer actually check things
7829 	 * without serializing state. First, we need to check to make sure that
7830 	 * none of our invariants are being broken for locking:
7831 	 *
7832 	 * 1) The caller isn't already blocking for a lock operation to
7833 	 * complete.
7834 	 *
7835 	 * 2) The caller is attempting to grab a lock that they already have.
7836 	 * While there are other rule violations that this might create, we opt
7837 	 * to check this ahead of it so we can have slightly better error
7838 	 * messages for our callers.
7839 	 *
7840 	 * 3) The caller is trying to grab a controller lock, while holding a
7841 	 * namespace lock.
7842 	 *
7843 	 * 4) The caller has a controller write lock and is trying to get a
7844 	 * namespace lock. For now, we disallow this case. Holding a controller
7845 	 * read lock is allowed, but the write lock allows you to operate on all
7846 	 * namespaces anyways. In addition, this simplifies the locking logic;
7847 	 * however, this constraint may be loosened in the future.
7848 	 *
7849 	 * 5) The caller is trying to acquire a second namespace lock when they
7850 	 * already have one.
7851 	 */
7852 	mutex_enter(&nvme->n_minor_mutex);
7853 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_BLOCKED ||
7854 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_BLOCKED) {
7855 		(void) nvme_ioctl_error(&lock.nil_common,
7856 		    NVME_IOCTL_E_LOCK_PENDING, 0, 0);
7857 		mutex_exit(&nvme->n_minor_mutex);
7858 		goto copyout;
7859 	}
7860 
7861 	if ((lock.nil_ent == NVME_LOCK_E_CTRL &&
7862 	    minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) ||
7863 	    (lock.nil_ent == NVME_LOCK_E_NS &&
7864 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
7865 	    minor->nm_ns_lock.nli_ns->ns_id == lock.nil_common.nioc_nsid)) {
7866 		(void) nvme_ioctl_error(&lock.nil_common,
7867 		    NVME_IOCTL_E_LOCK_ALREADY_HELD, 0, 0);
7868 		mutex_exit(&nvme->n_minor_mutex);
7869 		goto copyout;
7870 	}
7871 
7872 	if (lock.nil_ent == NVME_LOCK_E_CTRL &&
7873 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
7874 		(void) nvme_ioctl_error(&lock.nil_common,
7875 		    NVME_IOCTL_E_LOCK_NO_CTRL_WITH_NS, 0, 0);
7876 		mutex_exit(&nvme->n_minor_mutex);
7877 		goto copyout;
7878 	}
7879 
7880 	if (lock.nil_ent == NVME_LOCK_E_NS &&
7881 	    (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
7882 	    minor->nm_ctrl_lock.nli_curlevel == NVME_LOCK_L_WRITE)) {
7883 		(void) nvme_ioctl_error(&lock.nil_common,
7884 		    NVME_IOCTL_LOCK_NO_NS_WITH_CTRL_WRLOCK, 0, 0);
7885 		mutex_exit(&nvme->n_minor_mutex);
7886 		goto copyout;
7887 	}
7888 
7889 	if (lock.nil_ent == NVME_LOCK_E_NS &&
7890 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
7891 		(void) nvme_ioctl_error(&lock.nil_common,
7892 		    NVME_IOCTL_E_LOCK_NO_2ND_NS, 0, 0);
7893 		mutex_exit(&nvme->n_minor_mutex);
7894 		goto copyout;
7895 	}
7896 
7897 #ifdef	DEBUG
7898 	/*
7899 	 * This is a big block of sanity checks to make sure that we haven't
7900 	 * allowed anything bad to happen.
7901 	 */
7902 	if (lock.nil_ent == NVME_LOCK_E_NS) {
7903 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
7904 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
7905 		    NVME_LOCK_STATE_UNLOCKED);
7906 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
7907 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
7908 
7909 		if (minor->nm_ns != NULL) {
7910 			ASSERT3U(minor->nm_ns->ns_id, ==,
7911 			    lock.nil_common.nioc_nsid);
7912 		}
7913 
7914 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
7915 	} else {
7916 		ASSERT3P(minor->nm_ctrl_lock.nli_lock, ==, NULL);
7917 		ASSERT3U(minor->nm_ctrl_lock.nli_state, ==,
7918 		    NVME_LOCK_STATE_UNLOCKED);
7919 		ASSERT3U(minor->nm_ctrl_lock.nli_curlevel, ==, 0);
7920 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
7921 		ASSERT0(list_link_active(&minor->nm_ctrl_lock.nli_node));
7922 
7923 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
7924 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
7925 		    NVME_LOCK_STATE_UNLOCKED);
7926 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
7927 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
7928 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
7929 	}
7930 #endif	/* DEBUG */
7931 
7932 	/*
7933 	 * At this point we should actually attempt a locking operation.
7934 	 */
7935 	nvme_rwlock(minor, &lock);
7936 	mutex_exit(&nvme->n_minor_mutex);
7937 
7938 copyout:
7939 	if (ddi_copyout(&lock, (void *)(uintptr_t)arg, sizeof (lock),
7940 	    mode & FKIOCTL) != 0) {
7941 		return (EFAULT);
7942 	}
7943 
7944 	return (0);
7945 }
7946 
7947 static int
7948 nvme_ioctl_unlock(nvme_minor_t *minor, intptr_t arg, int mode,
7949     cred_t *cred_p)
7950 {
7951 	nvme_ioctl_unlock_t unlock;
7952 	nvme_t *const nvme = minor->nm_ctrl;
7953 	boolean_t is_ctrl;
7954 	nvme_lock_t *lock;
7955 	nvme_minor_lock_info_t *info;
7956 
7957 	/*
7958 	 * Note, we explicitly don't check for privileges for unlock. The idea
7959 	 * being that if you have the lock, that's what matters. If you don't
7960 	 * have the lock, it doesn't matter what privileges that you have at
7961 	 * all.
7962 	 */
7963 	if ((mode & FWRITE) == 0)
7964 		return (EBADF);
7965 
7966 	if (ddi_copyin((void *)(uintptr_t)arg, &unlock, sizeof (unlock),
7967 	    mode & FKIOCTL) != 0) {
7968 		return (EFAULT);
7969 	}
7970 
7971 	if (unlock.niu_ent != NVME_LOCK_E_CTRL &&
7972 	    unlock.niu_ent != NVME_LOCK_E_NS) {
7973 		(void) nvme_ioctl_error(&unlock.niu_common,
7974 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
7975 		goto copyout;
7976 	}
7977 
7978 	if (!nvme_ioctl_check(minor, &unlock.niu_common, &nvme_check_locking)) {
7979 		goto copyout;
7980 	}
7981 
7982 	/*
7983 	 * If we're on a namespace, confirm that we're not asking for the
7984 	 * controller.
7985 	 */
7986 	if (unlock.niu_common.nioc_nsid != 0 &&
7987 	    unlock.niu_ent == NVME_LOCK_E_CTRL) {
7988 		(void) nvme_ioctl_error(&unlock.niu_common,
7989 		    NVME_IOCTL_E_NS_CANNOT_UNLOCK_CTRL, 0, 0);
7990 		goto copyout;
7991 	}
7992 
7993 	mutex_enter(&nvme->n_minor_mutex);
7994 	if (unlock.niu_ent == NVME_LOCK_E_CTRL) {
7995 		if (minor->nm_ctrl_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
7996 			mutex_exit(&nvme->n_minor_mutex);
7997 			(void) nvme_ioctl_error(&unlock.niu_common,
7998 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
7999 			goto copyout;
8000 		}
8001 	} else {
8002 		if (minor->nm_ns_lock.nli_ns == NULL) {
8003 			mutex_exit(&nvme->n_minor_mutex);
8004 			(void) nvme_ioctl_error(&unlock.niu_common,
8005 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
8006 			goto copyout;
8007 		}
8008 
8009 		/*
8010 		 * Check that our unlock request corresponds to the namespace ID
8011 		 * that is currently locked. This could happen if we're using
8012 		 * the controller node and it specified a valid, but not locked,
8013 		 * namespace ID.
8014 		 */
8015 		if (minor->nm_ns_lock.nli_ns->ns_id !=
8016 		    unlock.niu_common.nioc_nsid) {
8017 			mutex_exit(&nvme->n_minor_mutex);
8018 			ASSERT3P(minor->nm_ns, ==, NULL);
8019 			(void) nvme_ioctl_error(&unlock.niu_common,
8020 			    NVME_IOCTL_E_LOCK_WRONG_NS, 0, 0);
8021 			goto copyout;
8022 		}
8023 
8024 		if (minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
8025 			mutex_exit(&nvme->n_minor_mutex);
8026 			(void) nvme_ioctl_error(&unlock.niu_common,
8027 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
8028 			goto copyout;
8029 		}
8030 	}
8031 
8032 	/*
8033 	 * Finally, perform the unlock.
8034 	 */
8035 	is_ctrl = unlock.niu_ent == NVME_LOCK_E_CTRL;
8036 	if (is_ctrl) {
8037 		lock = &nvme->n_lock;
8038 		info = &minor->nm_ctrl_lock;
8039 	} else {
8040 		nvme_namespace_t *ns;
8041 		const uint32_t nsid = unlock.niu_common.nioc_nsid;
8042 
8043 		ns = nvme_nsid2ns(nvme, nsid);
8044 		lock = &ns->ns_lock;
8045 		info = &minor->nm_ns_lock;
8046 		VERIFY3P(ns, ==, info->nli_ns);
8047 	}
8048 	nvme_rwunlock(info, lock);
8049 	mutex_exit(&nvme->n_minor_mutex);
8050 	nvme_ioctl_success(&unlock.niu_common);
8051 
8052 copyout:
8053 	if (ddi_copyout(&unlock, (void *)(uintptr_t)arg, sizeof (unlock),
8054 	    mode & FKIOCTL) != 0) {
8055 		return (EFAULT);
8056 	}
8057 
8058 	return (0);
8059 }
8060 
8061 static int
8062 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
8063     int *rval_p)
8064 {
8065 #ifndef __lock_lint
8066 	_NOTE(ARGUNUSED(rval_p));
8067 #endif
8068 	int ret;
8069 	nvme_minor_t *minor;
8070 	nvme_t *nvme;
8071 
8072 	minor = nvme_minor_find_by_dev(dev);
8073 	if (minor == NULL) {
8074 		return (ENXIO);
8075 	}
8076 
8077 	nvme = minor->nm_ctrl;
8078 	if (nvme == NULL)
8079 		return (ENXIO);
8080 
8081 	if (IS_DEVCTL(cmd))
8082 		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));
8083 
8084 	if (nvme->n_dead && (cmd != NVME_IOC_BD_DETACH && cmd !=
8085 	    NVME_IOC_UNLOCK)) {
8086 		if (IS_NVME_IOC(cmd) == 0) {
8087 			return (EIO);
8088 		}
8089 
8090 		return (nvme_ioctl_copyout_error(nvme->n_dead_status, arg,
8091 		    mode));
8092 	}
8093 
8094 	/*
8095 	 * ioctls that are no longer using the original ioctl structure.
8096 	 */
8097 	switch (cmd) {
8098 	case NVME_IOC_CTRL_INFO:
8099 		ret = nvme_ioctl_ctrl_info(minor, arg, mode, cred_p);
8100 		break;
8101 	case NVME_IOC_IDENTIFY:
8102 		ret = nvme_ioctl_identify(minor, arg, mode, cred_p);
8103 		break;
8104 	case NVME_IOC_GET_LOGPAGE:
8105 		ret = nvme_ioctl_get_logpage(minor, arg, mode, cred_p);
8106 		break;
8107 	case NVME_IOC_GET_FEATURE:
8108 		ret = nvme_ioctl_get_feature(minor, arg, mode, cred_p);
8109 		break;
8110 	case NVME_IOC_BD_DETACH:
8111 		ret = nvme_ioctl_bd_detach(minor, arg, mode, cred_p);
8112 		break;
8113 	case NVME_IOC_BD_ATTACH:
8114 		ret = nvme_ioctl_bd_attach(minor, arg, mode, cred_p);
8115 		break;
8116 	case NVME_IOC_FORMAT:
8117 		ret = nvme_ioctl_format(minor, arg, mode, cred_p);
8118 		break;
8119 	case NVME_IOC_FIRMWARE_DOWNLOAD:
8120 		ret = nvme_ioctl_firmware_download(minor, arg, mode, cred_p);
8121 		break;
8122 	case NVME_IOC_FIRMWARE_COMMIT:
8123 		ret = nvme_ioctl_firmware_commit(minor, arg, mode, cred_p);
8124 		break;
8125 	case NVME_IOC_NS_INFO:
8126 		ret = nvme_ioctl_ns_info(minor, arg, mode, cred_p);
8127 		break;
8128 	case NVME_IOC_PASSTHRU:
8129 		ret = nvme_ioctl_passthru(minor, arg, mode, cred_p);
8130 		break;
8131 	case NVME_IOC_LOCK:
8132 		ret = nvme_ioctl_lock(minor, arg, mode, cred_p);
8133 		break;
8134 	case NVME_IOC_UNLOCK:
8135 		ret = nvme_ioctl_unlock(minor, arg, mode, cred_p);
8136 		break;
8137 	case NVME_IOC_CTRL_DETACH:
8138 		ret = nvme_ioctl_ctrl_detach(minor, arg, mode, cred_p);
8139 		break;
8140 	case NVME_IOC_CTRL_ATTACH:
8141 		ret = nvme_ioctl_ctrl_attach(minor, arg, mode, cred_p);
8142 		break;
8143 	case NVME_IOC_NS_CREATE:
8144 		ret = nvme_ioctl_ns_create(minor, arg, mode, cred_p);
8145 		break;
8146 	case NVME_IOC_NS_DELETE:
8147 		ret = nvme_ioctl_ns_delete(minor, arg, mode, cred_p);
8148 		break;
8149 	default:
8150 		ret = ENOTTY;
8151 		break;
8152 	}
8153 
8154 	ASSERT(!nvme_mgmt_lock_held(nvme));
8155 	return (ret);
8156 }
8157 
8158 /*
8159  * DDI UFM Callbacks
8160  */
8161 static int
8162 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
8163     ddi_ufm_image_t *img)
8164 {
8165 	nvme_t *nvme = arg;
8166 
8167 	if (imgno != 0)
8168 		return (EINVAL);
8169 
8170 	ddi_ufm_image_set_desc(img, "Firmware");
8171 	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);
8172 
8173 	return (0);
8174 }
8175 
8176 /*
8177  * Fill out firmware slot information for the requested slot.  The firmware
8178  * slot information is gathered by requesting the Firmware Slot Information log
8179  * page.  The format of the page is described in section 5.10.1.3.
8180  *
8181  * We lazily cache the log page on the first call and then invalidate the cache
8182  * data after a successful firmware download or firmware commit command.
8183  * The cached data is protected by a mutex as the state can change
8184  * asynchronous to this callback.
8185  */
8186 static int
8187 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
8188     uint_t slotno, ddi_ufm_slot_t *slot)
8189 {
8190 	nvme_t *nvme = arg;
8191 	void *log = NULL;
8192 	size_t bufsize;
8193 	ddi_ufm_attr_t attr = 0;
8194 	char fw_ver[NVME_FWVER_SZ + 1];
8195 
8196 	if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
8197 		return (EINVAL);
8198 
8199 	mutex_enter(&nvme->n_fwslot_mutex);
8200 	if (nvme->n_fwslot == NULL) {
8201 		if (!nvme_get_logpage_int(nvme, B_TRUE, &log, &bufsize,
8202 		    NVME_LOGPAGE_FWSLOT) ||
8203 		    bufsize != sizeof (nvme_fwslot_log_t)) {
8204 			if (log != NULL)
8205 				kmem_free(log, bufsize);
8206 			mutex_exit(&nvme->n_fwslot_mutex);
8207 			return (EIO);
8208 		}
8209 		nvme->n_fwslot = (nvme_fwslot_log_t *)log;
8210 	}
8211 
8212 	/*
8213 	 * NVMe numbers firmware slots starting at 1
8214 	 */
8215 	if (slotno == (nvme->n_fwslot->fw_afi - 1))
8216 		attr |= DDI_UFM_ATTR_ACTIVE;
8217 
8218 	if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
8219 		attr |= DDI_UFM_ATTR_WRITEABLE;
8220 
8221 	if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
8222 		attr |= DDI_UFM_ATTR_EMPTY;
8223 	} else {
8224 		(void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
8225 		    NVME_FWVER_SZ);
8226 		fw_ver[NVME_FWVER_SZ] = '\0';
8227 		ddi_ufm_slot_set_version(slot, fw_ver);
8228 	}
8229 	mutex_exit(&nvme->n_fwslot_mutex);
8230 
8231 	ddi_ufm_slot_set_attrs(slot, attr);
8232 
8233 	return (0);
8234 }
8235 
8236 static int
8237 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
8238 {
8239 	*caps = DDI_UFM_CAP_REPORT;
8240 	return (0);
8241 }
8242 
8243 boolean_t
8244 nvme_ctrl_atleast(nvme_t *nvme, const nvme_version_t *min)
8245 {
8246 	return (nvme_vers_atleast(&nvme->n_version, min) ? B_TRUE : B_FALSE);
8247 }
8248