xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme.c (revision 5113ed77779e8e6bace1231ea82f9fd924e9d849)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
14  * Copyright 2019 Unix Software Ltd.
15  * Copyright 2020 Joyent, Inc.
16  * Copyright 2020 Racktop Systems.
17  * Copyright 2024 Oxide Computer Company.
18  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
19  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
20  */
21 
22 /*
23  * blkdev driver for NVMe compliant storage devices
24  *
25  * This driver targets and is designed to support all NVMe 1.x and NVMe 2.x
26  * devices. Features are added to the driver as we encounter devices that
27  * require them and our needs, so some commands or log pages may not take
28  * advantage of newer features that devices support at this time. When you
29  * encounter such a case, it is generally fine to add that support to the driver
30  * as long as you take care to ensure that the requisite device version is met
31  * before using it.
32  *
33  * The driver has only been tested on x86 systems and will not work on big-
34  * endian systems without changes to the code accessing registers and data
35  * structures used by the hardware.
36  *
37  *
38  * Interrupt Usage:
39  *
40  * The driver will use a single interrupt while configuring the device as the
41  * specification requires, but contrary to the specification it will try to use
42  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
43  * will switch to multiple-message MSI(-X) if supported. The driver wants to
44  * have one interrupt vector per CPU, but it will work correctly if less are
45  * available. Interrupts can be shared by queues, the interrupt handler will
46  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
47  * the admin queue will share an interrupt with one I/O queue. The interrupt
48  * handler will retrieve completed commands from all queues sharing an interrupt
49  * vector and will post them to a taskq for completion processing.
50  *
51  *
52  * Command Processing:
53  *
54  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
55  * to 65536 I/O commands. The driver will configure one I/O queue pair per
56  * available interrupt vector, with the queue length usually much smaller than
57  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
58  * interrupt vectors will be used.
59  *
60  * Additionally the hardware provides a single special admin queue pair that can
61  * hold up to 4096 admin commands.
62  *
63  * From the hardware perspective both queues of a queue pair are independent,
64  * but they share some driver state: the command array (holding pointers to
65  * commands currently being processed by the hardware) and the active command
66  * counter. Access to a submission queue and the shared state is protected by
67  * nq_mutex; completion queue is protected by ncq_mutex.
68  *
69  * When a command is submitted to a queue pair the active command counter is
70  * incremented and a pointer to the command is stored in the command array. The
71  * array index is used as command identifier (CID) in the submission queue
72  * entry. Some commands may take a very long time to complete, and if the queue
73  * wraps around in that time a submission may find the next array slot to still
74  * be used by a long-running command. In this case the array is sequentially
75  * searched for the next free slot. The length of the command array is the same
76  * as the configured queue length. Queue overrun is prevented by the semaphore,
77  * so a command submission may block if the queue is full.
78  *
79  *
80  * Polled I/O Support:
81  *
82  * For kernel core dump support the driver can do polled I/O. As interrupts are
83  * turned off while dumping the driver will just submit a command in the regular
84  * way, and then repeatedly attempt a command retrieval until it gets the
85  * command back.
86  *
87  *
88  * Namespace Support:
89  *
90  * NVMe devices can have multiple namespaces, each being an independent data
91  * store. The driver supports multiple namespaces and creates a blkdev interface
92  * for each namespace found. Namespaces can have various attributes to support
93  * protection information. This driver does not support any of this and ignores
94  * namespaces that have these attributes.
95  *
96  * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
97  * (EUI64), and NVMe 1.2 introduced an additional 128bit Namespace Globally
98  * Unique Identifier (NGUID). This driver uses either the NGUID or the EUI64
99  * if present to generate the devid, and passes the EUI64 to blkdev to use it
100  * in the device node names.
101  *
102  * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
103  * single controller. This is an artificial limit imposed by the driver to be
104  * able to address a reasonable number of controllers and namespaces while
105  * fitting within the constraints of MAXMIN32, aka a 32-bit device number which
106  * only has 18-bits for the minor number. See the minor node section for more
107  * information.
108  *
109  *
110  * Minor nodes:
111  *
112  * For each NVMe device the driver exposes one minor node for the controller and
113  * one minor node for each namespace. The only operations supported by those
114  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
115  * primary control interface for the devices. The character device is a private
116  * interface and we attempt stability through libnvme and more so nvmeadm.
117  *
118  * The controller minor node is much more flexible than the namespace minor node
119  * and should be preferred. The controller node allows one to target any
120  * namespace that the device has, while the namespace is limited in what it can
121  * acquire. While the namespace minor exists, it should not be relied upon and
122  * is not by libnvme.
123  *
124  * The minor number space is split in two. We use the lower part to support the
125  * controller and namespaces as described above in the 'Namespace Support'
126  * section. The second set is used for cloning opens. We set aside one million
127  * minors for this purpose. We utilize a cloning open so that way we can have
128  * per-file_t state. This is how we end up implementing and tracking locking
129  * state and related.
130  *
131  * When we have this cloned open, then we allocate a new nvme_minor_t which gets
132  * its minor number from the nvme_open_minors id_space_t and is stored in the
133  * nvme_open_minors_avl. While someone calls open on a controller or namespace
134  * minor, everything else occurs in the context of one of these ephemeral
135  * minors.
136  *
137  *
138  * ioctls, Errors, and Exclusive Access:
139  *
140  * All of the logical commands that one can issue are driven through the
141  * ioctl(9E) interface. All of our ioctls have a similar shape where they
142  * all include the 'nvme_ioctl_common_t' as their first member.
143  *
144  * This common ioctl structure is used to communicate the namespace that should
145  * be targeted. When the namespace is left as 0, then that indicates that it
146  * should target whatever the default is of the minor node. For a namespace
147  * minor, that will be transparently rewritten to the namespace's namespace id.
148  *
149  * In addition, the nvme_ioctl_common_t structure also has a standard error
150  * return. Our goal in our ioctl path is to ensure that we have useful semantic
151  * errors as much as possible. EINVAL, EIO, etc. are all overloaded. Instead as
152  * long as we can copy in our structure, then we will set a semantic error. If
153  * we have an error from the controller, then that will be included there.
154  *
155  * Each command has a specific policy that controls whether or not it is allowed
156  * on the namespace or controller minor, whether the broadcast namespace is
157  * allowed, various settings around what kind of exclusive access is allowed,
158  * and more. Each of these is wrapped up in a bit of policy described by the
159  * 'nvme_ioctl_check_t' structure.
160  *
161  * The device provides a form of exclusion in the form of both a
162  * controller-level and namespace-level read and write lock. Most operations do
163  * not require a lock (e.g. get log page, identify, etc.), but a few do (e.g.
164  * format nvm, firmware related activity, etc.). A read lock guarantees that you
165  * can complete your operation without interference, but read locks are not
166  * required. If you don't take a read lock and someone comes in with a write
167  * lock, then subsequent operations will fail with a semantic error indicating
168  * that you were blocked due to this.
169  *
170  * Here are some of the rules that govern our locks:
171  *
172  * 1. Writers starve readers. Any readers are allowed to finish when there is a
173  *    pending writer; however, all subsequent readers will be blocked upon that
174  *    writer.
175  * 2. A controller write lock takes priority over all other locks. Put
176  *    differently a controller writer not only starves subsequent controller
177  *    readers, but also all namespace read and write locks.
178  * 3. Each namespace lock is independent.
179  * 4. At most a single namespace lock may be owned.
180  * 5. If you own a namespace lock, you may not take a controller lock (to help
181  *    with lock ordering).
182  * 6. In a similar spirit, if you own a controller write lock, you may not take
183  *    any namespace lock. Someone with the controller write lock can perform any
184  *    operations that they need to. However, if you have a controller read lock
185  *    you may take any namespace lock.
186  * 7. There is no ability to upgrade a read lock to a write lock.
187  * 8. There is no recursive locking.
188  *
189  * While there's a lot there to keep track of, the goals of these are to
190  * constrain things so as to avoid deadlock. This is more complex than the
191  * original implementation in the driver which only allowed for an exclusive
192  * open that was tied to the thread. The first issue with tying this to the
193  * thread was that that didn't work well for software that utilized thread
194  * pools, like complex daemons. The second issue is that we want the ability for
195  * daemons, such as a FRU monitor, to be able to retain a file descriptor to the
196  * device without blocking others from taking action except during critical
197  * periods.
198  *
199  * In particular to enable something like libnvme, we didn't want someone to
200  * have to open and close the file descriptor to change what kind of exclusive
201  * access they desired.
202  *
203  * There are two different sets of data structures that we employ for tracking
204  * locking information:
205  *
206  * 1) The nvme_lock_t structure is contained in both the nvme_t and the
207  * nvme_namespace_t and tracks the current writer, readers, and pending writers
208  * and readers. Each of these lists or the writer pointer all refer to our
209  * second data structure.
210  *
211  * When a lock is owned by a single writer, then the nl_writer field is set to a
212  * specific minor's lock data structure. If instead readers are present, then
213  * the nl_readers list_t is not empty. An invariant of the system is that if
214  * nl_writer is non-NULL, nl_readers must be empty and conversely, if nl_readers
215  * is not empty, nl_writer must be NULL.
216  *
217  * 2) The nvme_minor_lock_info_t exists in the nvme_minor_t. There is one
218  * information structure which represents the minor's controller lock and a
219  * second one that represents the minor's namespace lock. The members of this
220  * are broken into tracking what the current lock is and what it targets. It
221  * also has several members that are intended for debugging (nli_last_change,
222  * nli_acq_kthread, etc.).
223  *
224  * While the minor has two different lock information structures, our rules
225  * ensure that only one of the two can be pending and that they shouldn't result
226  * in a deadlock. When a lock is pending, the caller is sleeping on the minor's
227  * nm_cv member.
228  *
229  * These relationships are represented in the following image which shows a
230  * controller write lock being held with pending readers on the controller
231  * lock and pending writers on one of the controller's namespaces.
232  *
233  *  +---------+
234  *  | nvme_t  |
235  *  |         |
236  *  | n_lock -|-------+
237  *  | n_ns -+ |       |                          +-----------------------------+
238  *  +-------|-+   +-----------------+            | nvme_minor_t                |
239  *          |     | nvme_lock_t     |            |                             |
240  *          |     |                 |            |  +------------------------+ |
241  *          |     | writer        --|-------------->| nvme_minor_lock_info_t | |
242  *          |     | reader list     |            |  | nm_ctrl_lock           | |
243  *          |     | pending writers |            |  +------------------------+ |
244  *          |     | pending readers |------+     |  +------------------------+ |
245  *          |     +-----------------+      |     |  | nvme_minor_lock_info_t | |
246  *          |                              |     |  | nm_ns_lock             | |
247  *          |                              |     |  +------------------------+ |
248  *          |                              |     +-----------------------------+
249  *  +------------------+                   |                 +-----------------+
250  *  | nvme_namespace_t |                   |                 | nvme_minor_t    |
251  *  |                  |                   |                 |                 |
252  *  | ns_lock ---+     |                   |                 | +-------------+ |
253  *  +------------|-----+                   +-----------------|>|nm_ctrl_lock | |
254  *               |                                           | +-------------+ |
255  *               v                                           +-----------------+
256  *     +------------------+                                         ...
257  *     | nvme_lock_t      |                                  +-----------------+
258  *     |                  |                                  | nvme_minor_t    |
259  *     | writer           |                                  |                 |
260  *     | reader list      |                                  | +-------------+ |
261  *     | pending writers -|-----------------+                | |nm_ctrl_lock | |
262  *     | pending readers  |                 |                | +-------------+ |
263  *     +------------------+                 |                +-----------------+
264  *         +-----------------------------+  |  +-----------------------------+
265  *         | nvme_minor_t                |  |  | nvme_minor_t                |
266  *         |                             |  |  |                             |
267  *         |  +------------------------+ |  |  |  +------------------------+ |
268  *         |  | nvme_minor_lock_info_t | |  |  |  | nvme_minor_lock_info_t | |
269  *         |  | nm_ctrl_lock           | |  |  |  | nm_ctrl_lock           | |
270  *         |  +------------------------+ |  |  |  +------------------------+ |
271  *         |  +------------------------+ |  v  |  +------------------------+ |
272  *         |  | nvme_minor_lock_info_t |-|-----|->| nvme_minor_lock_info_t | |
273  *         |  | nm_ns_lock             | |     |  | nm_ns_lock             | |
274  *         |  +------------------------+ |     |  +------------------------+ |
275  *         +-----------------------------+     +-----------------------------+
276  *
277  * Blkdev Interface:
278  *
279  * This driver uses blkdev to do all the heavy lifting involved with presenting
280  * a disk device to the system. As a result, the processing of I/O requests is
281  * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
282  * setup, and splitting of transfers into manageable chunks.
283  *
284  * I/O requests coming in from blkdev are turned into NVM commands and posted to
285  * an I/O queue. The queue is selected by taking the CPU id modulo the number of
286  * queues. There is currently no timeout handling of I/O commands.
287  *
288  * Blkdev also supports querying device/media information and generating a
289  * devid. The driver reports the best block size as determined by the namespace
290  * format back to blkdev as physical block size to support partition and block
291  * alignment. The devid is either based on the namespace GUID or EUI64, if
292  * present, or composed using the device vendor ID, model number, serial number,
293  * and the namespace ID.
294  *
295  *
296  * Error Handling:
297  *
298  * Error handling is currently limited to detecting fatal hardware errors,
299  * either by asynchronous events, or synchronously through command status or
300  * admin command timeouts. In case of severe errors the device is fenced off,
301  * all further requests will return EIO. FMA is then called to fault the device.
302  *
303  * The hardware has a limit for outstanding asynchronous event requests. Before
304  * this limit is known the driver assumes it is at least 1 and posts a single
305  * asynchronous request. Later when the limit is known more asynchronous event
306  * requests are posted to allow quicker reception of error information. When an
307  * asynchronous event is posted by the hardware the driver will parse the error
308  * status fields and log information or fault the device, depending on the
309  * severity of the asynchronous event. The asynchronous event request is then
310  * reused and posted to the admin queue again.
311  *
312  * On command completion the command status is checked for errors. In case of
313  * errors indicating a driver bug the driver panics. Almost all other error
314  * status values just cause EIO to be returned.
315  *
316  * Command timeouts are currently detected for all admin commands except
317  * asynchronous event requests. If a command times out and the hardware appears
318  * to be healthy the driver attempts to abort the command. The original command
319  * timeout is also applied to the abort command. If the abort times out too the
320  * driver assumes the device to be dead, fences it off, and calls FMA to retire
321  * it. In all other cases the aborted command should return immediately with a
322  * status indicating it was aborted, and the driver will wait indefinitely for
323  * that to happen. No timeout handling of normal I/O commands is presently done.
324  *
325  * Any command that times out due to the controller dropping dead will be put on
326  * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA
327  * memory being reused by the system and later be written to by a "dead" NVMe
328  * controller.
329  *
330  *
331  * Locking:
332  *
333  * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
334  * when accessing shared state and submission queue registers, ncq_mutex
335  * is held when accessing completion queue state and registers.
336  * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
337  * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
338  * mutexes themselves.
339  *
340  * Each command also has its own nc_mutex, which is associated with the
341  * condition variable nc_cv. It is only used on admin commands which are run
342  * synchronously. In that case it must be held across calls to
343  * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
344  * nvme_admin_cmd(). It must also be held whenever the completion state of the
345  * command is changed or while an admin command timeout is handled.
346  *
347  * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
348  * More than one nc_mutex may only be held when aborting commands. In this case,
349  * the nc_mutex of the command to be aborted must be held across the call to
350  * nvme_abort_cmd() to prevent the command from completing while the abort is in
351  * progress.
352  *
353  * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
354  * acquired first. More than one nq_mutex is never held by a single thread.
355  * The ncq_mutex is only held by nvme_retrieve_cmd() and
356  * nvme_process_iocq(). nvme_process_iocq() is only called from the
357  * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
358  * mutex is non-contentious but is required for implementation completeness
359  * and safety.
360  *
361  * There is one mutex n_minor_mutex which protects all open flags nm_open and
362  * exclusive-open thread pointers nm_oexcl of each minor node associated with a
363  * controller and its namespaces.
364  *
365  * In addition, there is one mutex n_mgmt_mutex which must be held whenever the
366  * driver state for any namespace is changed, especially across calls to
367  * nvme_init_ns(), nvme_attach_ns() and nvme_detach_ns(). Except when detaching
368  * nvme, it should also be held across calls that modify the blkdev handle of a
369  * namespace. Command and queue mutexes may be acquired and released while
370  * n_mgmt_mutex is held, n_minor_mutex should not.
371  *
372  *
373  * Quiesce / Fast Reboot:
374  *
375  * The driver currently does not support fast reboot. A quiesce(9E) entry point
376  * is still provided which is used to send a shutdown notification to the
377  * device.
378  *
379  *
380  * NVMe Hotplug:
381  *
382  * The driver supports hot removal. The driver uses the NDI event framework
383  * to register a callback, nvme_remove_callback, to clean up when a disk is
384  * removed. In particular, the driver will unqueue outstanding I/O commands and
385  * set n_dead on the softstate to true so that other operations, such as ioctls
386  * and command submissions, fail as well.
387  *
388  * While the callback registration relies on the NDI event framework, the
389  * removal event itself is kicked off in the PCIe hotplug framework, when the
390  * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
391  * device was removed from the slot.
392  *
393  * The NVMe driver instance itself will remain until the final close of the
394  * device.
395  *
396  *
397  * DDI UFM Support
398  *
399  * The driver supports the DDI UFM framework for reporting information about
400  * the device's firmware image and slot configuration. This data can be
401  * queried by userland software via ioctls to the ufm driver. For more
402  * information, see ddi_ufm(9E).
403  *
404  *
405  * Driver Configuration:
406  *
407  * The following driver properties can be changed to control some aspects of the
408  * driver's operation:
409  * - strict-version: can be set to 0 to allow devices conforming to newer
410  *   major versions to be used
411  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
412  *   specific command status as a fatal error leading to device faulting
413  * - admin-queue-len: the maximum length of the admin queue (16-4096)
414  * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
415  * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
416  * - async-event-limit: the maximum number of asynchronous event requests to be
417  *   posted by the driver
418  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
419  *   cache
420  * - min-phys-block-size: the minimum physical block size to report to blkdev,
421  *   which is among other things the basis for ZFS vdev ashift
422  * - max-submission-queues: the maximum number of I/O submission queues.
423  * - max-completion-queues: the maximum number of I/O completion queues,
424  *   can be less than max-submission-queues, in which case the completion
425  *   queues are shared.
426  *
427  * In addition to the above properties, some device-specific tunables can be
428  * configured using the nvme-config-list global property. The value of this
429  * property is a list of triplets. The formal syntax is:
430  *
431  *   nvme-config-list ::= <triplet> [, <triplet>]* ;
432  *   <triplet>        ::= "<model>" , "<rev-list>" , "<tuple-list>"
433  *   <rev-list>       ::= [ <fwrev> [, <fwrev>]*]
434  *   <tuple-list>     ::= <tunable> [, <tunable>]*
435  *   <tunable>        ::= <name> : <value>
436  *
437  * The <model> and <fwrev> are the strings in nvme_identify_ctrl_t`id_model and
438  * nvme_identify_ctrl_t`id_fwrev, respectively. The remainder of <tuple-list>
439  * contains one or more tunables to apply to all controllers that match the
440  * specified model number and optionally firmware revision. Each <tunable> is a
441  * <name> : <value> pair.  Supported tunables are:
442  *
443  * - ignore-unknown-vendor-status:  can be set to "on" to not handle any vendor
444  *   specific command status as a fatal error leading to device faulting
445  *
446  * - min-phys-block-size: the minimum physical block size to report to blkdev,
447  *   which is among other things the basis for ZFS vdev ashift
448  *
449  * - volatile-write-cache: can be set to "on" or "off" to enable or disable the
450  *   volatile write cache, if present
451  *
452  *
453  * TODO:
454  * - figure out sane default for I/O queue depth reported to blkdev
455  * - FMA handling of media errors
456  * - support for devices supporting very large I/O requests using chained PRPs
457  * - support for configuring hardware parameters like interrupt coalescing
458  * - support for media formatting and hard partitioning into namespaces
459  * - support for big-endian systems
460  * - support for fast reboot
461  * - support for NVMe Subsystem Reset (1.1)
462  * - support for Scatter/Gather lists (1.1)
463  * - support for Reservations (1.1)
464  * - support for power management
465  */
466 
467 #include <sys/byteorder.h>
468 #ifdef _BIG_ENDIAN
469 #error nvme driver needs porting for big-endian platforms
470 #endif
471 
472 #include <sys/modctl.h>
473 #include <sys/conf.h>
474 #include <sys/devops.h>
475 #include <sys/ddi.h>
476 #include <sys/ddi_ufm.h>
477 #include <sys/sunddi.h>
478 #include <sys/sunndi.h>
479 #include <sys/bitmap.h>
480 #include <sys/sysmacros.h>
481 #include <sys/param.h>
482 #include <sys/varargs.h>
483 #include <sys/cpuvar.h>
484 #include <sys/disp.h>
485 #include <sys/blkdev.h>
486 #include <sys/atomic.h>
487 #include <sys/archsystm.h>
488 #include <sys/sata/sata_hba.h>
489 #include <sys/stat.h>
490 #include <sys/policy.h>
491 #include <sys/list.h>
492 #include <sys/dkio.h>
493 #include <sys/pci.h>
494 #include <sys/mkdev.h>
495 
496 #include <sys/nvme.h>
497 
498 #ifdef __x86
499 #include <sys/x86_archext.h>
500 #endif
501 
502 #include "nvme_reg.h"
503 #include "nvme_var.h"
504 
505 /*
506  * Assertions to make sure that we've properly captured various aspects of the
507  * packed structures and haven't broken them during updates.
 *
 * For each structure the overall size is asserted and, where given, the byte
 * offsets of selected members as well, so that a definition whose layout has
 * shifted is caught at compile time rather than at run time.
508  */
509 CTASSERT(sizeof (nvme_identify_ctrl_t) == NVME_IDENTIFY_BUFSIZE);
510 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
511 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
512 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
513 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
514 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
515 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
516 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
517 
518 CTASSERT(sizeof (nvme_identify_nsid_t) == NVME_IDENTIFY_BUFSIZE);
519 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
520 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
521 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
522 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
523 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
524 
/* The two identify list structures are only checked for their total size. */
525 CTASSERT(sizeof (nvme_identify_nsid_list_t) == NVME_IDENTIFY_BUFSIZE);
526 CTASSERT(sizeof (nvme_identify_ctrl_list_t) == NVME_IDENTIFY_BUFSIZE);
527 
528 CTASSERT(sizeof (nvme_identify_primary_caps_t) == NVME_IDENTIFY_BUFSIZE);
529 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
530 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
531 
/* Checked against a literal size rather than NVME_IDENTIFY_BUFSIZE. */
532 CTASSERT(sizeof (nvme_nschange_list_t) == 4096);
533 
534 
535 /* NVMe spec major version supported */
536 static const int nvme_version_major = 2;
537 
/*
 * NOTE(review): unlike nvme_version_major, the tunables below are not static
 * and not const; presumably this is so they can be adjusted from outside this
 * file (e.g. at boot or with a debugger) -- confirm before changing linkage.
 */
538 /* tunable for admin command timeout in seconds, default is 1s */
539 uint32_t nvme_admin_cmd_timeout = 1;
540 
541 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
542 uint32_t nvme_format_cmd_timeout = 600;
543 
544 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
545 uint32_t nvme_commit_save_cmd_timeout = 15;
546 
547 /*
548  * tunable for the size of arbitrary vendor specific admin commands,
549  * default is 16MiB (1 << 24).
550  */
551 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;
552 
553 /*
554  * tunable for the max timeout of arbitrary vendor specific admin commands,
555  * default is 60s.
556  */
557 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;
558 
559 /*
560  * This ID space, AVL, and lock are used for keeping track of minor state across
561  * opens between different devices.
 *
 * As described under "Minor nodes" in the comment at the top of this file,
 * each cloned open gets an nvme_minor_t whose minor number is allocated from
 * the nvme_open_minors ID space and which is stored in nvme_open_minors_avl.
 * NOTE(review): nvme_open_minors_mutex is presumably what guards both of
 * those; the code taking it is not in this section -- confirm.
562  */
563 static id_space_t *nvme_open_minors;
564 static avl_tree_t nvme_open_minors_avl;
565 kmutex_t nvme_open_minors_mutex;
566 
567 /*
568  * Removal taskq used for n_dead callback processing.
569  */
570 taskq_t *nvme_dead_taskq;
571 
/*
 * Forward declarations of functions private to this file (all are static).
 * The group labels below are derived from the declared names and types only;
 * the definitions are not part of this section of the file.
 */

/* attach/detach/quiesce, FM error callback, interrupt setup and handler */
572 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
573 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
574 static int nvme_quiesce(dev_info_t *);
575 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
576 static int nvme_setup_interrupts(nvme_t *, int, int);
577 static void nvme_release_interrupts(nvme_t *);
578 static uint_t nvme_intr(caddr_t, caddr_t);
579 
/*
 * Controller shutdown/reset/init and the command life cycle (allocate,
 * submit, unqueue/retrieve, wait). See "Locking" in the comment at the top of
 * this file for the mutex rules that apply to the submit, retrieve, unqueue
 * and wait functions.
 */
580 static void nvme_shutdown(nvme_t *, boolean_t);
581 static boolean_t nvme_reset(nvme_t *, boolean_t);
582 static int nvme_init(nvme_t *);
583 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
584 static void nvme_free_cmd(nvme_cmd_t *);
585 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
586     bd_xfer_t *);
587 static void nvme_admin_cmd(nvme_cmd_t *, uint32_t);
588 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
589 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
590 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
591 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
592 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
593 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
594 static void nvme_wakeup_cmd(void *);
595 static void nvme_async_event_task(void *);
596 
/* Command status checks. */
597 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
598 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
599 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
600 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
601 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
602 static inline int nvme_check_cmd_status(nvme_cmd_t *);
603 static boolean_t nvme_check_cmd_status_ioctl(nvme_cmd_t *,
604     nvme_ioctl_common_t *);
605 
/*
 * Command abort, asynchronous events, and individual operations (format,
 * log page, identify, set features, write cache, number of queues).
 */
606 static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
607 static void nvme_async_event(nvme_t *);
608 static boolean_t nvme_format_nvm(nvme_t *, nvme_ioctl_format_t *);
609 static boolean_t nvme_get_logpage_int(nvme_t *, boolean_t, void **, size_t *,
610     uint8_t);
611 static boolean_t nvme_identify(nvme_t *, boolean_t, nvme_ioctl_identify_t *,
612     void **);
613 static boolean_t nvme_identify_int(nvme_t *, uint32_t, uint8_t, void **);
614 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
615     uint32_t *);
616 static int nvme_write_cache_set(nvme_t *, boolean_t);
617 static int nvme_set_nqueues(nvme_t *);
618 
/* DMA memory and queue pair allocation and release. */
619 static void nvme_free_dma(nvme_dma_t *);
620 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
621     nvme_dma_t **);
622 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
623     nvme_dma_t **);
624 static void nvme_free_qpair(nvme_qpair_t *);
625 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
626 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
627 
/*
 * 32-bit and 64-bit put/get accessors.
 * NOTE(review): presumably these access device registers -- confirm against
 * the definitions.
 */
628 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
629 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
630 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
631 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
632 
/* Checks of the register handle and of a DMA handle. */
633 static boolean_t nvme_check_regs_hdl(nvme_t *);
634 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
635 
/* PRP fill for a command, given a DMA handle. */
636 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);
637 
/* blkdev interface; see "Blkdev Interface" in the comment at the top. */
638 static void nvme_bd_xfer_done(void *);
639 static void nvme_bd_driveinfo(void *, bd_drive_t *);
640 static int nvme_bd_mediainfo(void *, bd_media_t *);
641 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
642 static int nvme_bd_read(void *, bd_xfer_t *);
643 static int nvme_bd_write(void *, bd_xfer_t *);
644 static int nvme_bd_sync(void *, bd_xfer_t *);
645 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
646 static int nvme_bd_free_space(void *, bd_xfer_t *);
647 
648 static int nvme_prp_dma_constructor(void *, void *, int);
649 static void nvme_prp_dma_destructor(void *, void *);
650 
651 static void nvme_prepare_devid(nvme_t *, uint32_t);
652 
653 /* DDI UFM callbacks */
654 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
655     ddi_ufm_image_t *);
656 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
657     ddi_ufm_slot_t *);
658 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
659 
660 static int nvme_open(dev_t *, int, int, cred_t *);
661 static int nvme_close(dev_t, int, int, cred_t *);
662 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
663 
664 static int nvme_init_ns(nvme_t *, uint32_t);
665 static boolean_t nvme_attach_ns(nvme_t *, nvme_ioctl_common_t *);
666 static boolean_t nvme_detach_ns(nvme_t *, nvme_ioctl_common_t *);
667 
668 static int nvme_minor_comparator(const void *, const void *);
669 
670 static ddi_ufm_ops_t nvme_ufm_ops = {
671 	NULL,
672 	nvme_ufm_fill_image,
673 	nvme_ufm_fill_slot,
674 	nvme_ufm_getcaps
675 };
676 
/*
 * Minor numbers are split amongst those used for controllers and those used
 * for device opens. The number of controller minors is limited based on
 * MAXMIN32 per the theory statement. We allocate 1 million minors for opens as
 * a total guess at a number that'll probably be enough. The starting point of
 * the open minors can be shifted to accommodate future expansion of the NVMe
 * device minors.
 *
 * A controller/namespace minor packs the driver instance above
 * NVME_MINOR_INST_SHIFT and the namespace ID in the bits below it.
 */
#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	((nsid) | ((inst) << NVME_MINOR_INST_SHIFT))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & (NVME_MINOR(1, 0) - 1))
#define	NVME_MINOR_MAX		((1 << NVME_MINOR_INST_SHIFT) - 2)
#define	NVME_IS_VENDOR_SPECIFIC_CMD(x)	((x) >= 0xC0 && (x) <= 0xFF)

#define	NVME_OPEN_NMINORS		(1024 * 1024)
#define	NVME_OPEN_MINOR_MIN		(MAXMIN32 + 1)
#define	NVME_OPEN_MINOR_MAX_EXCL	\
	(NVME_OPEN_MINOR_MIN + NVME_OPEN_NMINORS)
695 
696 static void *nvme_state;
697 static kmem_cache_t *nvme_cmd_cache;
698 
699 /*
700  * DMA attributes for queue DMA memory
701  *
702  * Queue DMA memory must be page aligned. The maximum length of a queue is
703  * 65536 entries, and an entry can be 64 bytes long.
704  */
705 static const ddi_dma_attr_t nvme_queue_dma_attr = {
706 	.dma_attr_version	= DMA_ATTR_V0,
707 	.dma_attr_addr_lo	= 0,
708 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
709 	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
710 	.dma_attr_align		= 0x1000,
711 	.dma_attr_burstsizes	= 0x7ff,
712 	.dma_attr_minxfer	= 0x1000,
713 	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
714 	.dma_attr_seg		= 0xffffffffffffffffULL,
715 	.dma_attr_sgllen	= 1,
716 	.dma_attr_granular	= 1,
717 	.dma_attr_flags		= 0,
718 };
719 
720 /*
721  * DMA attributes for transfers using Physical Region Page (PRP) entries
722  *
723  * A PRP entry describes one page of DMA memory using the page size specified
724  * in the controller configuration's memory page size register (CC.MPS). It uses
725  * a 64bit base address aligned to this page size. There is no limitation on
726  * chaining PRPs together for arbitrarily large DMA transfers. These DMA
727  * attributes will be copied into the nvme_t during nvme_attach() and the
728  * dma_attr_maxxfer will be updated.
729  */
730 static const ddi_dma_attr_t nvme_prp_dma_attr = {
731 	.dma_attr_version	= DMA_ATTR_V0,
732 	.dma_attr_addr_lo	= 0,
733 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
734 	.dma_attr_count_max	= 0xfff,
735 	.dma_attr_align		= 0x1000,
736 	.dma_attr_burstsizes	= 0x7ff,
737 	.dma_attr_minxfer	= 0x1000,
738 	.dma_attr_maxxfer	= 0x1000,
739 	.dma_attr_seg		= 0xfff,
740 	.dma_attr_sgllen	= -1,
741 	.dma_attr_granular	= 1,
742 	.dma_attr_flags		= 0,
743 };
744 
745 /*
746  * DMA attributes for transfers using scatter/gather lists
747  *
748  * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
749  * 32bit length field. SGL Segment and SGL Last Segment entries require the
750  * length to be a multiple of 16 bytes. While the SGL DMA attributes are copied
751  * into the nvme_t, they are not currently used for any I/O.
752  */
753 static const ddi_dma_attr_t nvme_sgl_dma_attr = {
754 	.dma_attr_version	= DMA_ATTR_V0,
755 	.dma_attr_addr_lo	= 0,
756 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
757 	.dma_attr_count_max	= 0xffffffffUL,
758 	.dma_attr_align		= 1,
759 	.dma_attr_burstsizes	= 0x7ff,
760 	.dma_attr_minxfer	= 0x10,
761 	.dma_attr_maxxfer	= 0xfffffffffULL,
762 	.dma_attr_seg		= 0xffffffffffffffffULL,
763 	.dma_attr_sgllen	= -1,
764 	.dma_attr_granular	= 0x10,
765 	.dma_attr_flags		= 0
766 };
767 
768 static ddi_device_acc_attr_t nvme_reg_acc_attr = {
769 	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
770 	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
771 	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
772 };
773 
774 /*
775  * ioctl validation policies. These are policies that determine which namespaces
776  * are allowed or disallowed for various operations. Note, all policy items
777  * should be explicitly listed here to help make it clear what our intent is.
778  * That is also why some of these are identical or repeated when they cover
779  * different ioctls.
780  */
781 
782 /*
783  * The controller information ioctl generally contains read-only information
784  * about the controller that is sourced from multiple different pieces of
785  * information. This does not operate on a namespace and none are accepted.
786  */
787 static const nvme_ioctl_check_t nvme_check_ctrl_info = {
788 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
789 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
790 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
791 };
792 
793 /*
794  * The kernel namespace information requires a namespace ID to be specified. It
795  * does not allow for the broadcast ID to be specified.
796  */
797 static const nvme_ioctl_check_t nvme_check_ns_info = {
798 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
799 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
800 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
801 };
802 
803 /*
804  * Identify commands are allowed to operate on a namespace minor. Unfortunately,
805  * the namespace field in identify commands is a bit weird. In particular, some
806  * commands need a valid namespace, while others are namespace listing
807  * operations, which means illegal namespaces like zero are allowed.
808  */
809 static const nvme_ioctl_check_t nvme_check_identify = {
810 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
811 	.nck_skip_ctrl = B_TRUE, .nck_ctrl_rewrite = B_FALSE,
812 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
813 };
814 
815 /*
816  * The get log page command requires the ability to specify namespaces. When
817  * targeting the controller, one must use the broadcast NSID.
818  */
819 static const nvme_ioctl_check_t nvme_check_get_logpage = {
820 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
821 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
822 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
823 };
824 
825 /*
826  * When getting a feature, we do not want rewriting behavior as most features do
827  * not require a namespace to be specified. Specific instances are checked in
828  * nvme_validate_get_feature().
829  */
830 static const nvme_ioctl_check_t nvme_check_get_feature = {
831 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
832 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
833 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
834 };
835 
836 /*
837  * Format commands must target a namespace. The broadcast namespace must be used
838  * when referring to the controller.
839  */
840 static const nvme_ioctl_check_t nvme_check_format = {
841 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
842 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
843 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_WRITE
844 };
845 
846 /*
847  * Attach and detach must always target a minor. However, the broadcast
848  * namespace is not allowed. We still perform rewriting so that way specifying
849  * the controller node with 0 will be caught.
850  */
851 static const nvme_ioctl_check_t nvme_check_attach_detach = {
852 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
853 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
854 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
855 };
856 
857 /*
858  * Firmware operations must not target a namespace and are only allowed from the
859  * controller.
860  */
861 static const nvme_ioctl_check_t nvme_check_firmware = {
862 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
863 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
864 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
865 };
866 
867 /*
868  * Passthru commands are an odd set. We only allow them from the primary
869  * controller; however, we allow a namespace to be specified in them and allow
870  * the broadcast namespace. We do not perform rewriting because we don't know
871  * what the semantics are. We explicitly exempt passthru commands from needing
872  * an exclusive lock and leave it up to them to tell us the impact of the
873  * command and semantics. As this is a privileged interface and the semantics
874  * are arbitrary, there's not much we can do without some assistance from the
875  * consumer.
876  */
877 static const nvme_ioctl_check_t nvme_check_passthru = {
878 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
879 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
880 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
881 };
882 
883 /*
884  * Lock operations are allowed to target a namespace, but must not be rewritten.
885  * There is no support for the broadcast namespace. This is the only ioctl that
886  * should skip exclusive checking as it's used to grant it.
887  */
888 static const nvme_ioctl_check_t nvme_check_locking = {
889 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
890 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
891 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_SKIP
892 };
893 
894 static struct cb_ops nvme_cb_ops = {
895 	.cb_open	= nvme_open,
896 	.cb_close	= nvme_close,
897 	.cb_strategy	= nodev,
898 	.cb_print	= nodev,
899 	.cb_dump	= nodev,
900 	.cb_read	= nodev,
901 	.cb_write	= nodev,
902 	.cb_ioctl	= nvme_ioctl,
903 	.cb_devmap	= nodev,
904 	.cb_mmap	= nodev,
905 	.cb_segmap	= nodev,
906 	.cb_chpoll	= nochpoll,
907 	.cb_prop_op	= ddi_prop_op,
908 	.cb_str		= 0,
909 	.cb_flag	= D_NEW | D_MP,
910 	.cb_rev		= CB_REV,
911 	.cb_aread	= nodev,
912 	.cb_awrite	= nodev
913 };
914 
915 static struct dev_ops nvme_dev_ops = {
916 	.devo_rev	= DEVO_REV,
917 	.devo_refcnt	= 0,
918 	.devo_getinfo	= ddi_no_info,
919 	.devo_identify	= nulldev,
920 	.devo_probe	= nulldev,
921 	.devo_attach	= nvme_attach,
922 	.devo_detach	= nvme_detach,
923 	.devo_reset	= nodev,
924 	.devo_cb_ops	= &nvme_cb_ops,
925 	.devo_bus_ops	= NULL,
926 	.devo_power	= NULL,
927 	.devo_quiesce	= nvme_quiesce,
928 };
929 
930 static struct modldrv nvme_modldrv = {
931 	.drv_modops	= &mod_driverops,
932 	.drv_linkinfo	= "NVMe driver",
933 	.drv_dev_ops	= &nvme_dev_ops
934 };
935 
936 static struct modlinkage nvme_modlinkage = {
937 	.ml_rev		= MODREV_1,
938 	.ml_linkage	= { &nvme_modldrv, NULL }
939 };
940 
941 static bd_ops_t nvme_bd_ops = {
942 	.o_version	= BD_OPS_CURRENT_VERSION,
943 	.o_drive_info	= nvme_bd_driveinfo,
944 	.o_media_info	= nvme_bd_mediainfo,
945 	.o_devid_init	= nvme_bd_devid,
946 	.o_sync_cache	= nvme_bd_sync,
947 	.o_read		= nvme_bd_read,
948 	.o_write	= nvme_bd_write,
949 	.o_free_space	= nvme_bd_free_space,
950 };
951 
952 /*
953  * This list will hold commands that have timed out and couldn't be aborted.
954  * As we don't know what the hardware may still do with the DMA memory we can't
955  * free them, so we'll keep them forever on this list where we can easily look
956  * at them with mdb.
957  */
958 static struct list nvme_lost_cmds;
959 static kmutex_t nvme_lc_mutex;
960 
961 int
962 _init(void)
963 {
964 	int error;
965 
966 	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
967 	if (error != DDI_SUCCESS)
968 		return (error);
969 
970 	if ((nvme_open_minors = id_space_create("nvme_open_minors",
971 	    NVME_OPEN_MINOR_MIN, NVME_OPEN_MINOR_MAX_EXCL)) == NULL) {
972 		ddi_soft_state_fini(&nvme_state);
973 		return (ENOMEM);
974 	}
975 
976 	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
977 	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
978 
979 	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
980 	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
981 	    offsetof(nvme_cmd_t, nc_list));
982 
983 	mutex_init(&nvme_open_minors_mutex, NULL, MUTEX_DRIVER, NULL);
984 	avl_create(&nvme_open_minors_avl, nvme_minor_comparator,
985 	    sizeof (nvme_minor_t), offsetof(nvme_minor_t, nm_avl));
986 
987 	nvme_dead_taskq = taskq_create("nvme_dead_taskq", 1, minclsyspri, 1, 1,
988 	    TASKQ_PREPOPULATE);
989 
990 	bd_mod_init(&nvme_dev_ops);
991 
992 	error = mod_install(&nvme_modlinkage);
993 	if (error != DDI_SUCCESS) {
994 		ddi_soft_state_fini(&nvme_state);
995 		id_space_destroy(nvme_open_minors);
996 		mutex_destroy(&nvme_lc_mutex);
997 		list_destroy(&nvme_lost_cmds);
998 		bd_mod_fini(&nvme_dev_ops);
999 		mutex_destroy(&nvme_open_minors_mutex);
1000 		avl_destroy(&nvme_open_minors_avl);
1001 		taskq_destroy(nvme_dead_taskq);
1002 	}
1003 
1004 	return (error);
1005 }
1006 
1007 int
1008 _fini(void)
1009 {
1010 	int error;
1011 
1012 	if (!list_is_empty(&nvme_lost_cmds))
1013 		return (DDI_FAILURE);
1014 
1015 	error = mod_remove(&nvme_modlinkage);
1016 	if (error == DDI_SUCCESS) {
1017 		ddi_soft_state_fini(&nvme_state);
1018 		id_space_destroy(nvme_open_minors);
1019 		kmem_cache_destroy(nvme_cmd_cache);
1020 		mutex_destroy(&nvme_lc_mutex);
1021 		list_destroy(&nvme_lost_cmds);
1022 		bd_mod_fini(&nvme_dev_ops);
1023 		mutex_destroy(&nvme_open_minors_mutex);
1024 		avl_destroy(&nvme_open_minors_avl);
1025 		taskq_destroy(nvme_dead_taskq);
1026 	}
1027 
1028 	return (error);
1029 }
1030 
1031 int
1032 _info(struct modinfo *modinfop)
1033 {
1034 	return (mod_info(&nvme_modlinkage, modinfop));
1035 }
1036 
1037 static inline void
1038 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
1039 {
1040 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1041 
1042 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1043 	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
1044 }
1045 
1046 static inline void
1047 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
1048 {
1049 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1050 
1051 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1052 	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
1053 }
1054 
1055 static inline uint64_t
1056 nvme_get64(nvme_t *nvme, uintptr_t reg)
1057 {
1058 	uint64_t val;
1059 
1060 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1061 
1062 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1063 	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
1064 
1065 	return (val);
1066 }
1067 
1068 static inline uint32_t
1069 nvme_get32(nvme_t *nvme, uintptr_t reg)
1070 {
1071 	uint32_t val;
1072 
1073 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1074 
1075 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1076 	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
1077 
1078 	return (val);
1079 }
1080 
1081 /*
1082  * This is a central clearing house for marking an NVMe controller dead and/or
1083  * removed. This takes care of setting the flag, taking care of outstanding
1084  * blocked locks, and sending a DDI FMA impact. This is called from a precarious
1085  * place where locking is suspect. The only guarantee we have is that the nvme_t
1086  * is valid and won't disappear until we return.
1087  *
1088  * This should only be used after attach has been called.
1089  */
1090 static void
1091 nvme_ctrl_mark_dead(nvme_t *nvme, boolean_t removed)
1092 {
1093 	boolean_t was_dead;
1094 
1095 	/*
1096 	 * See if we win the race to set things up here. If someone beat us to
1097 	 * it, we do not do anything.
1098 	 */
1099 	was_dead = atomic_cas_32((volatile uint32_t *)&nvme->n_dead, B_FALSE,
1100 	    B_TRUE);
1101 	if (was_dead) {
1102 		return;
1103 	}
1104 
1105 	/*
1106 	 * If this was removed, there is no reason to change the service impact.
1107 	 * However, then we need to change our default return code that we use
1108 	 * here to indicate that it was gone versus that it is dead.
1109 	 */
1110 	if (removed) {
1111 		nvme->n_dead_status = NVME_IOCTL_E_CTRL_GONE;
1112 	} else {
1113 		ASSERT3U(nvme->n_dead_status, ==, NVME_IOCTL_E_CTRL_DEAD);
1114 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1115 	}
1116 
1117 	taskq_dispatch_ent(nvme_dead_taskq, nvme_rwlock_ctrl_dead, nvme,
1118 	    TQ_NOSLEEP, &nvme->n_dead_tqent);
1119 }
1120 
1121 static boolean_t
1122 nvme_check_regs_hdl(nvme_t *nvme)
1123 {
1124 	ddi_fm_error_t error;
1125 
1126 	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
1127 
1128 	if (error.fme_status != DDI_FM_OK)
1129 		return (B_TRUE);
1130 
1131 	return (B_FALSE);
1132 }
1133 
1134 static boolean_t
1135 nvme_check_dma_hdl(nvme_dma_t *dma)
1136 {
1137 	ddi_fm_error_t error;
1138 
1139 	if (dma == NULL)
1140 		return (B_FALSE);
1141 
1142 	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
1143 
1144 	if (error.fme_status != DDI_FM_OK)
1145 		return (B_TRUE);
1146 
1147 	return (B_FALSE);
1148 }
1149 
1150 static void
1151 nvme_free_dma_common(nvme_dma_t *dma)
1152 {
1153 	if (dma->nd_dmah != NULL)
1154 		(void) ddi_dma_unbind_handle(dma->nd_dmah);
1155 	if (dma->nd_acch != NULL)
1156 		ddi_dma_mem_free(&dma->nd_acch);
1157 	if (dma->nd_dmah != NULL)
1158 		ddi_dma_free_handle(&dma->nd_dmah);
1159 }
1160 
1161 static void
1162 nvme_free_dma(nvme_dma_t *dma)
1163 {
1164 	nvme_free_dma_common(dma);
1165 	kmem_free(dma, sizeof (*dma));
1166 }
1167 
1168 /* ARGSUSED */
1169 static void
1170 nvme_prp_dma_destructor(void *buf, void *private)
1171 {
1172 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1173 
1174 	nvme_free_dma_common(dma);
1175 }
1176 
1177 static int
1178 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
1179     size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
1180 {
1181 	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
1182 	    &dma->nd_dmah) != DDI_SUCCESS) {
1183 		/*
1184 		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
1185 		 * the only other possible error is DDI_DMA_BADATTR which
1186 		 * indicates a driver bug which should cause a panic.
1187 		 */
1188 		dev_err(nvme->n_dip, CE_PANIC,
1189 		    "!failed to get DMA handle, check DMA attributes");
1190 		return (DDI_FAILURE);
1191 	}
1192 
1193 	/*
1194 	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
1195 	 * or the flags are conflicting, which isn't the case here.
1196 	 */
1197 	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
1198 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
1199 	    &dma->nd_len, &dma->nd_acch);
1200 
1201 	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
1202 	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
1203 	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
1204 		dev_err(nvme->n_dip, CE_WARN,
1205 		    "!failed to bind DMA memory");
1206 		atomic_inc_32(&nvme->n_dma_bind_err);
1207 		nvme_free_dma_common(dma);
1208 		return (DDI_FAILURE);
1209 	}
1210 
1211 	return (DDI_SUCCESS);
1212 }
1213 
1214 static int
1215 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
1216     ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
1217 {
1218 	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
1219 
1220 	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
1221 	    DDI_SUCCESS) {
1222 		*ret = NULL;
1223 		kmem_free(dma, sizeof (nvme_dma_t));
1224 		return (DDI_FAILURE);
1225 	}
1226 
1227 	bzero(dma->nd_memp, dma->nd_len);
1228 
1229 	*ret = dma;
1230 	return (DDI_SUCCESS);
1231 }
1232 
1233 /* ARGSUSED */
1234 static int
1235 nvme_prp_dma_constructor(void *buf, void *private, int flags)
1236 {
1237 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1238 	nvme_t *nvme = (nvme_t *)private;
1239 
1240 	dma->nd_dmah = NULL;
1241 	dma->nd_acch = NULL;
1242 
1243 	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
1244 	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
1245 		return (-1);
1246 	}
1247 
1248 	ASSERT(dma->nd_ncookie == 1);
1249 
1250 	dma->nd_cached = B_TRUE;
1251 
1252 	return (0);
1253 }
1254 
1255 static int
1256 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
1257     uint_t flags, nvme_dma_t **dma)
1258 {
1259 	uint32_t len = nentry * qe_len;
1260 	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
1261 
1262 	len = roundup(len, nvme->n_pagesize);
1263 
1264 	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
1265 	    != DDI_SUCCESS) {
1266 		dev_err(nvme->n_dip, CE_WARN,
1267 		    "!failed to get DMA memory for queue");
1268 		goto fail;
1269 	}
1270 
1271 	if ((*dma)->nd_ncookie != 1) {
1272 		dev_err(nvme->n_dip, CE_WARN,
1273 		    "!got too many cookies for queue DMA");
1274 		goto fail;
1275 	}
1276 
1277 	return (DDI_SUCCESS);
1278 
1279 fail:
1280 	if (*dma) {
1281 		nvme_free_dma(*dma);
1282 		*dma = NULL;
1283 	}
1284 
1285 	return (DDI_FAILURE);
1286 }
1287 
1288 static void
1289 nvme_free_cq(nvme_cq_t *cq)
1290 {
1291 	mutex_destroy(&cq->ncq_mutex);
1292 
1293 	if (cq->ncq_cmd_taskq != NULL)
1294 		taskq_destroy(cq->ncq_cmd_taskq);
1295 
1296 	if (cq->ncq_dma != NULL)
1297 		nvme_free_dma(cq->ncq_dma);
1298 
1299 	kmem_free(cq, sizeof (*cq));
1300 }
1301 
1302 static void
1303 nvme_free_qpair(nvme_qpair_t *qp)
1304 {
1305 	int i;
1306 
1307 	mutex_destroy(&qp->nq_mutex);
1308 	sema_destroy(&qp->nq_sema);
1309 
1310 	if (qp->nq_sqdma != NULL)
1311 		nvme_free_dma(qp->nq_sqdma);
1312 
1313 	if (qp->nq_active_cmds > 0)
1314 		for (i = 0; i != qp->nq_nentry; i++)
1315 			if (qp->nq_cmd[i] != NULL)
1316 				nvme_free_cmd(qp->nq_cmd[i]);
1317 
1318 	if (qp->nq_cmd != NULL)
1319 		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
1320 
1321 	kmem_free(qp, sizeof (nvme_qpair_t));
1322 }
1323 
1324 /*
1325  * Destroy the pre-allocated cq array, but only free individual completion
1326  * queues from the given starting index.
1327  */
1328 static void
1329 nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
1330 {
1331 	uint_t i;
1332 
1333 	for (i = start; i < nvme->n_cq_count; i++)
1334 		if (nvme->n_cq[i] != NULL)
1335 			nvme_free_cq(nvme->n_cq[i]);
1336 
1337 	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
1338 }
1339 
1340 static int
1341 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
1342     uint_t nthr)
1343 {
1344 	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
1345 	char name[64];		/* large enough for the taskq name */
1346 
1347 	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
1348 	    DDI_INTR_PRI(nvme->n_intr_pri));
1349 
1350 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
1351 	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
1352 		goto fail;
1353 
1354 	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
1355 	cq->ncq_nentry = nentry;
1356 	cq->ncq_id = idx;
1357 	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
1358 
1359 	/*
1360 	 * Each completion queue has its own command taskq.
1361 	 */
1362 	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
1363 	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
1364 
1365 	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
1366 	    TASKQ_PREPOPULATE);
1367 
1368 	if (cq->ncq_cmd_taskq == NULL) {
1369 		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
1370 		    "taskq for cq %u", idx);
1371 		goto fail;
1372 	}
1373 
1374 	*cqp = cq;
1375 	return (DDI_SUCCESS);
1376 
1377 fail:
1378 	nvme_free_cq(cq);
1379 	*cqp = NULL;
1380 
1381 	return (DDI_FAILURE);
1382 }
1383 
1384 /*
1385  * Create the n_cq array big enough to hold "ncq" completion queues.
1386  * If the array already exists it will be re-sized (but only larger).
1387  * The admin queue is included in this array, which boosts the
1388  * max number of entries to UINT16_MAX + 1.
1389  */
1390 static int
1391 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
1392 {
1393 	nvme_cq_t **cq;
1394 	uint_t i, cq_count;
1395 
1396 	ASSERT3U(ncq, >, nvme->n_cq_count);
1397 
1398 	cq = nvme->n_cq;
1399 	cq_count = nvme->n_cq_count;
1400 
1401 	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
1402 	nvme->n_cq_count = ncq;
1403 
1404 	for (i = 0; i < cq_count; i++)
1405 		nvme->n_cq[i] = cq[i];
1406 
1407 	for (; i < nvme->n_cq_count; i++)
1408 		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
1409 		    DDI_SUCCESS)
1410 			goto fail;
1411 
1412 	if (cq != NULL)
1413 		kmem_free(cq, sizeof (*cq) * cq_count);
1414 
1415 	return (DDI_SUCCESS);
1416 
1417 fail:
1418 	nvme_destroy_cq_array(nvme, cq_count);
1419 	/*
1420 	 * Restore the original array
1421 	 */
1422 	nvme->n_cq_count = cq_count;
1423 	nvme->n_cq = cq;
1424 
1425 	return (DDI_FAILURE);
1426 }
1427 
1428 static int
1429 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
1430     uint_t idx)
1431 {
1432 	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
1433 	uint_t cq_idx;
1434 
1435 	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
1436 	    DDI_INTR_PRI(nvme->n_intr_pri));
1437 
1438 	/*
1439 	 * The NVMe spec defines that a full queue has one empty (unused) slot;
1440 	 * initialize the semaphore accordingly.
1441 	 */
1442 	sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
1443 
1444 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
1445 	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
1446 		goto fail;
1447 
1448 	/*
1449 	 * idx == 0 is adminq, those above 0 are shared io completion queues.
1450 	 */
1451 	cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
1452 	qp->nq_cq = nvme->n_cq[cq_idx];
1453 	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1454 	qp->nq_nentry = nentry;
1455 
1456 	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1457 
1458 	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1459 	qp->nq_next_cmd = 0;
1460 
1461 	*nqp = qp;
1462 	return (DDI_SUCCESS);
1463 
1464 fail:
1465 	nvme_free_qpair(qp);
1466 	*nqp = NULL;
1467 
1468 	return (DDI_FAILURE);
1469 }
1470 
1471 static nvme_cmd_t *
1472 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1473 {
1474 	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1475 
1476 	if (cmd == NULL)
1477 		return (cmd);
1478 
1479 	bzero(cmd, sizeof (nvme_cmd_t));
1480 
1481 	cmd->nc_nvme = nvme;
1482 
1483 	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1484 	    DDI_INTR_PRI(nvme->n_intr_pri));
1485 	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1486 
1487 	return (cmd);
1488 }
1489 
1490 static void
1491 nvme_free_cmd(nvme_cmd_t *cmd)
1492 {
1493 	/* Don't free commands on the lost commands list. */
1494 	if (list_link_active(&cmd->nc_list))
1495 		return;
1496 
1497 	if (cmd->nc_dma) {
1498 		nvme_free_dma(cmd->nc_dma);
1499 		cmd->nc_dma = NULL;
1500 	}
1501 
1502 	if (cmd->nc_prp) {
1503 		kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
1504 		cmd->nc_prp = NULL;
1505 	}
1506 
1507 	cv_destroy(&cmd->nc_cv);
1508 	mutex_destroy(&cmd->nc_mutex);
1509 
1510 	kmem_cache_free(nvme_cmd_cache, cmd);
1511 }
1512 
1513 static void
1514 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1515 {
1516 	sema_p(&qp->nq_sema);
1517 	nvme_submit_cmd_common(qp, cmd);
1518 }
1519 
1520 static int
1521 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1522 {
1523 	if (cmd->nc_nvme->n_dead) {
1524 		return (EIO);
1525 	}
1526 
1527 	if (sema_tryp(&qp->nq_sema) == 0)
1528 		return (EAGAIN);
1529 
1530 	nvme_submit_cmd_common(qp, cmd);
1531 	return (0);
1532 }
1533 
1534 static void
1535 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1536 {
1537 	nvme_reg_sqtdbl_t tail = { 0 };
1538 
1539 	mutex_enter(&qp->nq_mutex);
1540 	cmd->nc_completed = B_FALSE;
1541 
1542 	/*
1543 	 * Now that we hold the queue pair lock, we must check whether or not
1544 	 * the controller has been listed as dead (e.g. was removed due to
1545 	 * hotplug). This is necessary as otherwise we could race with
1546 	 * nvme_remove_callback(). Because this has not been enqueued, we don't
1547 	 * call nvme_unqueue_cmd(), which is why we must manually decrement the
1548 	 * semaphore.
1549 	 */
1550 	if (cmd->nc_nvme->n_dead) {
1551 		taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
1552 		    cmd, TQ_NOSLEEP, &cmd->nc_tqent);
1553 		sema_v(&qp->nq_sema);
1554 		mutex_exit(&qp->nq_mutex);
1555 		return;
1556 	}
1557 
1558 	/*
1559 	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
1560 	 * slot. If the slot is already occupied advance to the next slot and
1561 	 * try again. This can happen for long running commands like async event
1562 	 * requests.
1563 	 */
1564 	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1565 		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1566 	qp->nq_cmd[qp->nq_next_cmd] = cmd;
1567 
1568 	qp->nq_active_cmds++;
1569 
1570 	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1571 	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1572 	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1573 	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
1574 	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1575 	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1576 
1577 	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1578 	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1579 
1580 	mutex_exit(&qp->nq_mutex);
1581 }
1582 
1583 static nvme_cmd_t *
1584 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1585 {
1586 	nvme_cmd_t *cmd;
1587 
1588 	ASSERT(mutex_owned(&qp->nq_mutex));
1589 	ASSERT3S(cid, <, qp->nq_nentry);
1590 
1591 	cmd = qp->nq_cmd[cid];
1592 	/*
1593 	 * Some controllers will erroneously add things to the completion queue
1594 	 * for which there is no matching outstanding command. If this happens,
1595 	 * it is almost certainly a controller firmware bug since nq_mutex
1596 	 * is held across command submission and ringing the queue doorbell,
1597 	 * and is also held in this function.
1598 	 *
1599 	 * If we see such an unexpected command, there is not much we can do.
1600 	 * These will be logged and counted in nvme_get_completed(), but
1601 	 * otherwise ignored.
1602 	 */
1603 	if (cmd == NULL)
1604 		return (NULL);
1605 	qp->nq_cmd[cid] = NULL;
1606 	ASSERT3U(qp->nq_active_cmds, >, 0);
1607 	qp->nq_active_cmds--;
1608 	sema_v(&qp->nq_sema);
1609 
1610 	ASSERT3P(cmd, !=, NULL);
1611 	ASSERT3P(cmd->nc_nvme, ==, nvme);
1612 	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1613 
1614 	return (cmd);
1615 }
1616 
1617 /*
1618  * Get the command tied to the next completed cqe and bump along completion
1619  * queue head counter.
1620  */
1621 static nvme_cmd_t *
1622 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
1623 {
1624 	nvme_qpair_t *qp;
1625 	nvme_cqe_t *cqe;
1626 	nvme_cmd_t *cmd;
1627 
1628 	ASSERT(mutex_owned(&cq->ncq_mutex));
1629 
1630 retry:
1631 	cqe = &cq->ncq_cq[cq->ncq_head];
1632 
1633 	/* Check phase tag of CQE. Hardware inverts it for new entries. */
1634 	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
1635 		return (NULL);
1636 
1637 	qp = nvme->n_ioq[cqe->cqe_sqid];
1638 
1639 	mutex_enter(&qp->nq_mutex);
1640 	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
1641 	mutex_exit(&qp->nq_mutex);
1642 
1643 	qp->nq_sqhead = cqe->cqe_sqhd;
1644 	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
1645 
1646 	/* Toggle phase on wrap-around. */
1647 	if (cq->ncq_head == 0)
1648 		cq->ncq_phase = cq->ncq_phase != 0 ? 0 : 1;
1649 
1650 	if (cmd == NULL) {
1651 		dev_err(nvme->n_dip, CE_WARN,
1652 		    "!received completion for unknown cid 0x%x", cqe->cqe_cid);
1653 		atomic_inc_32(&nvme->n_unknown_cid);
1654 		/*
1655 		 * We want to ignore this unexpected completion entry as it
1656 		 * is most likely a result of a bug in the controller firmware.
1657 		 * However, if we return NULL, then callers will assume there
1658 		 * are no more pending commands for this wakeup. Retry to keep
1659 		 * enumerating commands until the phase tag indicates there are
1660 		 * no more and we are really done.
1661 		 */
1662 		goto retry;
1663 	}
1664 
1665 	ASSERT3U(cmd->nc_sqid, ==, cqe->cqe_sqid);
1666 	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
1667 
1668 	return (cmd);
1669 }
1670 
1671 /*
1672  * Process all completed commands on the io completion queue.
1673  */
1674 static uint_t
1675 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
1676 {
1677 	nvme_reg_cqhdbl_t head = { 0 };
1678 	nvme_cmd_t *cmd;
1679 	uint_t completed = 0;
1680 
1681 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1682 	    DDI_SUCCESS)
1683 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1684 		    __func__);
1685 
1686 	mutex_enter(&cq->ncq_mutex);
1687 
1688 	while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1689 		taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
1690 		    TQ_NOSLEEP, &cmd->nc_tqent);
1691 
1692 		completed++;
1693 	}
1694 
1695 	if (completed > 0) {
1696 		/*
1697 		 * Update the completion queue head doorbell.
1698 		 */
1699 		head.b.cqhdbl_cqh = cq->ncq_head;
1700 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1701 	}
1702 
1703 	mutex_exit(&cq->ncq_mutex);
1704 
1705 	return (completed);
1706 }
1707 
1708 static nvme_cmd_t *
1709 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
1710 {
1711 	nvme_cq_t *cq = qp->nq_cq;
1712 	nvme_reg_cqhdbl_t head = { 0 };
1713 	nvme_cmd_t *cmd;
1714 
1715 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1716 	    DDI_SUCCESS)
1717 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1718 		    __func__);
1719 
1720 	mutex_enter(&cq->ncq_mutex);
1721 
1722 	if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1723 		head.b.cqhdbl_cqh = cq->ncq_head;
1724 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1725 	}
1726 
1727 	mutex_exit(&cq->ncq_mutex);
1728 
1729 	return (cmd);
1730 }
1731 
1732 static int
1733 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
1734 {
1735 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1736 
1737 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1738 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1739 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1740 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1741 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1742 
1743 	if (cmd->nc_xfer != NULL)
1744 		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1745 
1746 	if (cmd->nc_nvme->n_strict_version) {
1747 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1748 	}
1749 
1750 	return (EIO);
1751 }
1752 
1753 static int
1754 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
1755 {
1756 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1757 
1758 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1759 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1760 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1761 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1762 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1763 	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
1764 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1765 	}
1766 
1767 	return (EIO);
1768 }
1769 
1770 static int
1771 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
1772 {
1773 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1774 
1775 	switch (cqe->cqe_sf.sf_sc) {
1776 	case NVME_CQE_SC_INT_NVM_WRITE:
1777 		/* write fail */
1778 		/* TODO: post ereport */
1779 		if (cmd->nc_xfer != NULL)
1780 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1781 		return (EIO);
1782 
1783 	case NVME_CQE_SC_INT_NVM_READ:
1784 		/* read fail */
1785 		/* TODO: post ereport */
1786 		if (cmd->nc_xfer != NULL)
1787 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1788 		return (EIO);
1789 
1790 	default:
1791 		return (nvme_check_unknown_cmd_status(cmd));
1792 	}
1793 }
1794 
1795 static int
1796 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
1797 {
1798 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1799 
1800 	switch (cqe->cqe_sf.sf_sc) {
1801 	case NVME_CQE_SC_GEN_SUCCESS:
1802 		return (0);
1803 
1804 	/*
1805 	 * Errors indicating a bug in the driver should cause a panic.
1806 	 */
1807 	case NVME_CQE_SC_GEN_INV_OPC:
1808 		/* Invalid Command Opcode */
1809 		if (!cmd->nc_dontpanic)
1810 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1811 			    "programming error: invalid opcode in cmd %p",
1812 			    (void *)cmd);
1813 		return (EINVAL);
1814 
1815 	case NVME_CQE_SC_GEN_INV_FLD:
1816 		/* Invalid Field in Command */
1817 		if (!cmd->nc_dontpanic)
1818 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1819 			    "programming error: invalid field in cmd %p",
1820 			    (void *)cmd);
1821 		return (EIO);
1822 
1823 	case NVME_CQE_SC_GEN_ID_CNFL:
1824 		/* Command ID Conflict */
1825 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1826 		    "cmd ID conflict in cmd %p", (void *)cmd);
1827 		return (0);
1828 
1829 	case NVME_CQE_SC_GEN_INV_NS:
1830 		/* Invalid Namespace or Format */
1831 		if (!cmd->nc_dontpanic)
1832 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1833 			    "programming error: invalid NS/format in cmd %p",
1834 			    (void *)cmd);
1835 		return (EINVAL);
1836 
1837 	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1838 		/* LBA Out Of Range */
1839 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1840 		    "LBA out of range in cmd %p", (void *)cmd);
1841 		return (0);
1842 
1843 	/*
1844 	 * Non-fatal errors, handle gracefully.
1845 	 */
1846 	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1847 		/* Data Transfer Error (DMA) */
1848 		/* TODO: post ereport */
1849 		atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1850 		if (cmd->nc_xfer != NULL)
1851 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1852 		return (EIO);
1853 
1854 	case NVME_CQE_SC_GEN_INTERNAL_ERR:
1855 		/*
1856 		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
1857 		 * detailed error information is returned as async event,
1858 		 * so we pretty much ignore the error here and handle it
1859 		 * in the async event handler.
1860 		 */
1861 		atomic_inc_32(&cmd->nc_nvme->n_internal_err);
1862 		if (cmd->nc_xfer != NULL)
1863 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1864 		return (EIO);
1865 
1866 	case NVME_CQE_SC_GEN_ABORT_REQUEST:
1867 		/*
1868 		 * Command Abort Requested. This normally happens only when a
1869 		 * command times out.
1870 		 */
1871 		/* TODO: post ereport or change blkdev to handle this? */
1872 		atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
1873 		return (ECANCELED);
1874 
1875 	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
1876 		/* Command Aborted due to Power Loss Notification */
1877 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1878 		return (EIO);
1879 
1880 	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
1881 		/* Command Aborted due to SQ Deletion */
1882 		atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
1883 		return (EIO);
1884 
1885 	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
1886 		/* Capacity Exceeded */
1887 		atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
1888 		if (cmd->nc_xfer != NULL)
1889 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1890 		return (EIO);
1891 
1892 	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
1893 		/* Namespace Not Ready */
1894 		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
1895 		if (cmd->nc_xfer != NULL)
1896 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1897 		return (EIO);
1898 
1899 	case NVME_CQE_SC_GEN_NVM_FORMATTING:
1900 		/* Format in progress (1.2) */
1901 		if (!NVME_VERSION_ATLEAST(&cmd->nc_nvme->n_version, 1, 2))
1902 			return (nvme_check_unknown_cmd_status(cmd));
1903 		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_formatting);
1904 		if (cmd->nc_xfer != NULL)
1905 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1906 		return (EIO);
1907 
1908 	default:
1909 		return (nvme_check_unknown_cmd_status(cmd));
1910 	}
1911 }
1912 
1913 static int
1914 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
1915 {
1916 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1917 
1918 	switch (cqe->cqe_sf.sf_sc) {
1919 	case NVME_CQE_SC_SPC_INV_CQ:
1920 		/* Completion Queue Invalid */
1921 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
1922 		atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
1923 		return (EINVAL);
1924 
1925 	case NVME_CQE_SC_SPC_INV_QID:
1926 		/* Invalid Queue Identifier */
1927 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1928 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
1929 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
1930 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1931 		atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
1932 		return (EINVAL);
1933 
1934 	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
1935 		/* Max Queue Size Exceeded */
1936 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1937 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1938 		atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
1939 		return (EINVAL);
1940 
1941 	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
1942 		/* Abort Command Limit Exceeded */
1943 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
1944 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1945 		    "abort command limit exceeded in cmd %p", (void *)cmd);
1946 		return (0);
1947 
1948 	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
1949 		/* Async Event Request Limit Exceeded */
1950 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
1951 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1952 		    "async event request limit exceeded in cmd %p",
1953 		    (void *)cmd);
1954 		return (0);
1955 
1956 	case NVME_CQE_SC_SPC_INV_INT_VECT:
1957 		/* Invalid Interrupt Vector */
1958 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1959 		atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
1960 		return (EINVAL);
1961 
1962 	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
1963 		/* Invalid Log Page */
1964 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
1965 		atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
1966 		return (EINVAL);
1967 
1968 	case NVME_CQE_SC_SPC_INV_FORMAT:
1969 		/* Invalid Format */
1970 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
1971 		atomic_inc_32(&cmd->nc_nvme->n_inv_format);
1972 		if (cmd->nc_xfer != NULL)
1973 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1974 		return (EINVAL);
1975 
1976 	case NVME_CQE_SC_SPC_INV_Q_DEL:
1977 		/* Invalid Queue Deletion */
1978 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1979 		atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
1980 		return (EINVAL);
1981 
1982 	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
1983 		/* Conflicting Attributes */
1984 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
1985 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1986 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1987 		atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
1988 		if (cmd->nc_xfer != NULL)
1989 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1990 		return (EINVAL);
1991 
1992 	case NVME_CQE_SC_SPC_NVM_INV_PROT:
1993 		/* Invalid Protection Information */
1994 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
1995 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1996 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1997 		atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
1998 		if (cmd->nc_xfer != NULL)
1999 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2000 		return (EINVAL);
2001 
2002 	case NVME_CQE_SC_SPC_NVM_READONLY:
2003 		/* Write to Read Only Range */
2004 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2005 		atomic_inc_32(&cmd->nc_nvme->n_readonly);
2006 		if (cmd->nc_xfer != NULL)
2007 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2008 		return (EROFS);
2009 
2010 	case NVME_CQE_SC_SPC_INV_FW_SLOT:
2011 		/* Invalid Firmware Slot */
2012 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2013 		return (EINVAL);
2014 
2015 	case NVME_CQE_SC_SPC_INV_FW_IMG:
2016 		/* Invalid Firmware Image */
2017 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2018 		return (EINVAL);
2019 
2020 	case NVME_CQE_SC_SPC_FW_RESET:
2021 		/* Conventional Reset Required */
2022 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2023 		return (0);
2024 
2025 	case NVME_CQE_SC_SPC_FW_NSSR:
2026 		/* NVMe Subsystem Reset Required */
2027 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2028 		return (0);
2029 
2030 	case NVME_CQE_SC_SPC_FW_NEXT_RESET:
2031 		/* Activation Requires Reset */
2032 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2033 		return (0);
2034 
2035 	case NVME_CQE_SC_SPC_FW_MTFA:
2036 		/* Activation Requires Maximum Time Violation */
2037 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2038 		return (EAGAIN);
2039 
2040 	case NVME_CQE_SC_SPC_FW_PROHIBITED:
2041 		/* Activation Prohibited */
2042 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2043 		return (EINVAL);
2044 
2045 	case NVME_CQE_SC_SPC_FW_OVERLAP:
2046 		/* Overlapping Firmware Ranges */
2047 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD);
2048 		return (EINVAL);
2049 
2050 	default:
2051 		return (nvme_check_unknown_cmd_status(cmd));
2052 	}
2053 }
2054 
2055 static inline int
2056 nvme_check_cmd_status(nvme_cmd_t *cmd)
2057 {
2058 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2059 
2060 	/*
2061 	 * Take a shortcut if the controller is dead, or if
2062 	 * command status indicates no error.
2063 	 */
2064 	if (cmd->nc_nvme->n_dead)
2065 		return (EIO);
2066 
2067 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2068 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2069 		return (0);
2070 
2071 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
2072 		return (nvme_check_generic_cmd_status(cmd));
2073 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
2074 		return (nvme_check_specific_cmd_status(cmd));
2075 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
2076 		return (nvme_check_integrity_cmd_status(cmd));
2077 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
2078 		return (nvme_check_vendor_cmd_status(cmd));
2079 
2080 	return (nvme_check_unknown_cmd_status(cmd));
2081 }
2082 
2083 /*
2084  * Check the command status as used by an ioctl path and do not convert it to an
2085  * errno. We still allow all the command status checking to occur, but otherwise
2086  * will pass back the controller error as is.
2087  */
2088 static boolean_t
2089 nvme_check_cmd_status_ioctl(nvme_cmd_t *cmd, nvme_ioctl_common_t *ioc)
2090 {
2091 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2092 	nvme_t *nvme = cmd->nc_nvme;
2093 
2094 	if (nvme->n_dead) {
2095 		return (nvme_ioctl_error(ioc, nvme->n_dead_status, 0, 0));
2096 	}
2097 
2098 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2099 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2100 		return (B_TRUE);
2101 
2102 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) {
2103 		(void) nvme_check_generic_cmd_status(cmd);
2104 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) {
2105 		(void) nvme_check_specific_cmd_status(cmd);
2106 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) {
2107 		(void) nvme_check_integrity_cmd_status(cmd);
2108 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) {
2109 		(void) nvme_check_vendor_cmd_status(cmd);
2110 	} else {
2111 		(void) nvme_check_unknown_cmd_status(cmd);
2112 	}
2113 
2114 	return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_ERROR,
2115 	    cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_sc));
2116 }
2117 
2118 static int
2119 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
2120 {
2121 	nvme_t *nvme = abort_cmd->nc_nvme;
2122 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2123 	nvme_abort_cmd_t ac = { 0 };
2124 	int ret = 0;
2125 
2126 	sema_p(&nvme->n_abort_sema);
2127 
2128 	ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
2129 	ac.b.ac_sqid = abort_cmd->nc_sqid;
2130 
2131 	cmd->nc_sqid = 0;
2132 	cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
2133 	cmd->nc_callback = nvme_wakeup_cmd;
2134 	cmd->nc_sqe.sqe_cdw10 = ac.r;
2135 
2136 	/*
2137 	 * Send the ABORT to the hardware. The ABORT command will return _after_
2138 	 * the aborted command has completed (aborted or otherwise), but since
2139 	 * we still hold the aborted command's mutex its callback hasn't been
2140 	 * processed yet.
2141 	 */
2142 	nvme_admin_cmd(cmd, sec);
2143 	sema_v(&nvme->n_abort_sema);
2144 
2145 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2146 		dev_err(nvme->n_dip, CE_WARN,
2147 		    "!ABORT failed with sct = %x, sc = %x",
2148 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2149 		atomic_inc_32(&nvme->n_abort_failed);
2150 	} else {
2151 		dev_err(nvme->n_dip, CE_WARN,
2152 		    "!ABORT of command %d/%d %ssuccessful",
2153 		    abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
2154 		    cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
2155 		if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
2156 			atomic_inc_32(&nvme->n_cmd_aborted);
2157 	}
2158 
2159 	nvme_free_cmd(cmd);
2160 	return (ret);
2161 }
2162 
2163 /*
2164  * nvme_wait_cmd -- wait for command completion or timeout
2165  *
2166  * In case of a serious error or a timeout of the abort command the hardware
2167  * will be declared dead and FMA will be notified.
2168  */
2169 static void
2170 nvme_wait_cmd(nvme_cmd_t *cmd, uint32_t sec)
2171 {
2172 	clock_t timeout = ddi_get_lbolt() + drv_usectohz((long)sec * MICROSEC);
2173 	nvme_t *nvme = cmd->nc_nvme;
2174 	nvme_reg_csts_t csts;
2175 	nvme_qpair_t *qp;
2176 
2177 	ASSERT(mutex_owned(&cmd->nc_mutex));
2178 
2179 	while (!cmd->nc_completed) {
2180 		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
2181 			break;
2182 	}
2183 
2184 	if (cmd->nc_completed)
2185 		return;
2186 
2187 	/*
2188 	 * The command timed out.
2189 	 *
2190 	 * Check controller for fatal status, any errors associated with the
2191 	 * register or DMA handle, or for a double timeout (abort command timed
2192 	 * out). If necessary log a warning and call FMA.
2193 	 */
2194 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2195 	dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
2196 	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2197 	    cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
2198 	atomic_inc_32(&nvme->n_cmd_timeout);
2199 
2200 	if (csts.b.csts_cfs ||
2201 	    nvme_check_regs_hdl(nvme) ||
2202 	    nvme_check_dma_hdl(cmd->nc_dma) ||
2203 	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
2204 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2205 	} else if (nvme_abort_cmd(cmd, sec) == 0) {
2206 		/*
2207 		 * If the abort succeeded the command should complete
2208 		 * immediately with an appropriate status.
2209 		 */
2210 		while (!cmd->nc_completed)
2211 			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
2212 
2213 		return;
2214 	}
2215 
2216 	qp = nvme->n_ioq[cmd->nc_sqid];
2217 
2218 	mutex_enter(&qp->nq_mutex);
2219 	(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
2220 	mutex_exit(&qp->nq_mutex);
2221 
2222 	/*
2223 	 * As we don't know what the presumed dead hardware might still do with
2224 	 * the DMA memory, we'll put the command on the lost commands list if it
2225 	 * has any DMA memory.
2226 	 */
2227 	if (cmd->nc_dma != NULL) {
2228 		mutex_enter(&nvme_lc_mutex);
2229 		list_insert_head(&nvme_lost_cmds, cmd);
2230 		mutex_exit(&nvme_lc_mutex);
2231 	}
2232 }
2233 
2234 static void
2235 nvme_wakeup_cmd(void *arg)
2236 {
2237 	nvme_cmd_t *cmd = arg;
2238 
2239 	mutex_enter(&cmd->nc_mutex);
2240 	cmd->nc_completed = B_TRUE;
2241 	cv_signal(&cmd->nc_cv);
2242 	mutex_exit(&cmd->nc_mutex);
2243 }
2244 
2245 static void
2246 nvme_async_event_task(void *arg)
2247 {
2248 	nvme_cmd_t *cmd = arg;
2249 	nvme_t *nvme = cmd->nc_nvme;
2250 	nvme_error_log_entry_t *error_log = NULL;
2251 	nvme_health_log_t *health_log = NULL;
2252 	nvme_nschange_list_t *nslist = NULL;
2253 	size_t logsize = 0;
2254 	nvme_async_event_t event;
2255 
2256 	/*
2257 	 * Check for errors associated with the async request itself. The only
2258 	 * command-specific error is "async event limit exceeded", which
2259 	 * indicates a programming error in the driver and causes a panic in
2260 	 * nvme_check_cmd_status().
2261 	 *
2262 	 * Other possible errors are various scenarios where the async request
2263 	 * was aborted, or internal errors in the device. Internal errors are
2264 	 * reported to FMA, the command aborts need no special handling here.
2265 	 *
2266 	 * And finally, at least qemu nvme does not support async events,
2267 	 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
2268 	 * will avoid posting async events.
2269 	 */
2270 
2271 	if (nvme_check_cmd_status(cmd) != 0) {
2272 		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2273 		    "!async event request returned failure, sct = 0x%x, "
2274 		    "sc = 0x%x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
2275 		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
2276 		    cmd->nc_cqe.cqe_sf.sf_m);
2277 
2278 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2279 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
2280 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2281 		}
2282 
2283 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2284 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
2285 		    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
2286 			nvme->n_async_event_supported = B_FALSE;
2287 		}
2288 
2289 		nvme_free_cmd(cmd);
2290 		return;
2291 	}
2292 
2293 	event.r = cmd->nc_cqe.cqe_dw0;
2294 
2295 	/* Clear CQE and re-submit the async request. */
2296 	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
2297 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2298 	cmd = NULL;	/* cmd can no longer be used after resubmission */
2299 
2300 	switch (event.b.ae_type) {
2301 	case NVME_ASYNC_TYPE_ERROR:
2302 		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
2303 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2304 			    (void **)&error_log, &logsize,
2305 			    NVME_LOGPAGE_ERROR)) {
2306 				return;
2307 			}
2308 		} else {
2309 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2310 			    "async event reply: type=0x%x logpage=0x%x",
2311 			    event.b.ae_type, event.b.ae_logpage);
2312 			atomic_inc_32(&nvme->n_wrong_logpage);
2313 			return;
2314 		}
2315 
2316 		switch (event.b.ae_info) {
2317 		case NVME_ASYNC_ERROR_INV_SQ:
2318 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2319 			    "invalid submission queue");
2320 			return;
2321 
2322 		case NVME_ASYNC_ERROR_INV_DBL:
2323 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2324 			    "invalid doorbell write value");
2325 			return;
2326 
2327 		case NVME_ASYNC_ERROR_DIAGFAIL:
2328 			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
2329 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2330 			atomic_inc_32(&nvme->n_diagfail_event);
2331 			break;
2332 
2333 		case NVME_ASYNC_ERROR_PERSISTENT:
2334 			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
2335 			    "device error");
2336 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2337 			atomic_inc_32(&nvme->n_persistent_event);
2338 			break;
2339 
2340 		case NVME_ASYNC_ERROR_TRANSIENT:
2341 			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
2342 			    "device error");
2343 			/* TODO: send ereport */
2344 			atomic_inc_32(&nvme->n_transient_event);
2345 			break;
2346 
2347 		case NVME_ASYNC_ERROR_FW_LOAD:
2348 			dev_err(nvme->n_dip, CE_WARN,
2349 			    "!firmware image load error");
2350 			atomic_inc_32(&nvme->n_fw_load_event);
2351 			break;
2352 		}
2353 		break;
2354 
2355 	case NVME_ASYNC_TYPE_HEALTH:
2356 		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
2357 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2358 			    (void **)&health_log, &logsize,
2359 			    NVME_LOGPAGE_HEALTH)) {
2360 				return;
2361 			}
2362 		} else {
2363 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2364 			    "type=0x%x logpage=0x%x", event.b.ae_type,
2365 			    event.b.ae_logpage);
2366 			atomic_inc_32(&nvme->n_wrong_logpage);
2367 			return;
2368 		}
2369 
2370 		switch (event.b.ae_info) {
2371 		case NVME_ASYNC_HEALTH_RELIABILITY:
2372 			dev_err(nvme->n_dip, CE_WARN,
2373 			    "!device reliability compromised");
2374 			/* TODO: send ereport */
2375 			atomic_inc_32(&nvme->n_reliability_event);
2376 			break;
2377 
2378 		case NVME_ASYNC_HEALTH_TEMPERATURE:
2379 			dev_err(nvme->n_dip, CE_WARN,
2380 			    "!temperature above threshold");
2381 			/* TODO: send ereport */
2382 			atomic_inc_32(&nvme->n_temperature_event);
2383 			break;
2384 
2385 		case NVME_ASYNC_HEALTH_SPARE:
2386 			dev_err(nvme->n_dip, CE_WARN,
2387 			    "!spare space below threshold");
2388 			/* TODO: send ereport */
2389 			atomic_inc_32(&nvme->n_spare_event);
2390 			break;
2391 		}
2392 		break;
2393 
2394 	case NVME_ASYNC_TYPE_NOTICE:
2395 		switch (event.b.ae_info) {
2396 		case NVME_ASYNC_NOTICE_NS_CHANGE:
2397 			if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) {
2398 				dev_err(nvme->n_dip, CE_WARN,
2399 				    "!wrong logpage in async event reply: "
2400 				    "type=0x%x logpage=0x%x",
2401 				    event.b.ae_type, event.b.ae_logpage);
2402 				atomic_inc_32(&nvme->n_wrong_logpage);
2403 				break;
2404 			}
2405 
2406 			dev_err(nvme->n_dip, CE_NOTE,
2407 			    "namespace attribute change event, "
2408 			    "logpage = 0x%x", event.b.ae_logpage);
2409 			atomic_inc_32(&nvme->n_notice_event);
2410 
2411 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2412 			    (void **)&nslist, &logsize,
2413 			    NVME_LOGPAGE_NSCHANGE)) {
2414 				break;
2415 			}
2416 
2417 			if (nslist->nscl_ns[0] == UINT32_MAX) {
2418 				dev_err(nvme->n_dip, CE_CONT,
2419 				    "more than %u namespaces have changed.\n",
2420 				    NVME_NSCHANGE_LIST_SIZE);
2421 				break;
2422 			}
2423 
2424 			mutex_enter(&nvme->n_mgmt_mutex);
2425 			for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) {
2426 				uint32_t nsid = nslist->nscl_ns[i];
2427 
2428 				if (nsid == 0)	/* end of list */
2429 					break;
2430 
2431 				dev_err(nvme->n_dip, CE_NOTE,
2432 				    "!namespace nvme%d/%u has changed.",
2433 				    ddi_get_instance(nvme->n_dip), nsid);
2434 
2435 
2436 				if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
2437 					continue;
2438 
2439 				bd_state_change(nvme_nsid2ns(nvme,
2440 				    nsid)->ns_bd_hdl);
2441 			}
2442 			mutex_exit(&nvme->n_mgmt_mutex);
2443 
2444 			break;
2445 
2446 		case NVME_ASYNC_NOTICE_FW_ACTIVATE:
2447 			dev_err(nvme->n_dip, CE_NOTE,
2448 			    "firmware activation starting, "
2449 			    "logpage = 0x%x", event.b.ae_logpage);
2450 			atomic_inc_32(&nvme->n_notice_event);
2451 			break;
2452 
2453 		case NVME_ASYNC_NOTICE_TELEMETRY:
2454 			dev_err(nvme->n_dip, CE_NOTE,
2455 			    "telemetry log changed, "
2456 			    "logpage = 0x%x", event.b.ae_logpage);
2457 			atomic_inc_32(&nvme->n_notice_event);
2458 			break;
2459 
2460 		case NVME_ASYNC_NOTICE_NS_ASYMM:
2461 			dev_err(nvme->n_dip, CE_NOTE,
2462 			    "asymmetric namespace access change, "
2463 			    "logpage = 0x%x", event.b.ae_logpage);
2464 			atomic_inc_32(&nvme->n_notice_event);
2465 			break;
2466 
2467 		case NVME_ASYNC_NOTICE_LATENCYLOG:
2468 			dev_err(nvme->n_dip, CE_NOTE,
2469 			    "predictable latency event aggregate log change, "
2470 			    "logpage = 0x%x", event.b.ae_logpage);
2471 			atomic_inc_32(&nvme->n_notice_event);
2472 			break;
2473 
2474 		case NVME_ASYNC_NOTICE_LBASTATUS:
2475 			dev_err(nvme->n_dip, CE_NOTE,
2476 			    "LBA status information alert, "
2477 			    "logpage = 0x%x", event.b.ae_logpage);
2478 			atomic_inc_32(&nvme->n_notice_event);
2479 			break;
2480 
2481 		case NVME_ASYNC_NOTICE_ENDURANCELOG:
2482 			dev_err(nvme->n_dip, CE_NOTE,
2483 			    "endurance group event aggregate log page change, "
2484 			    "logpage = 0x%x", event.b.ae_logpage);
2485 			atomic_inc_32(&nvme->n_notice_event);
2486 			break;
2487 
2488 		default:
2489 			dev_err(nvme->n_dip, CE_WARN,
2490 			    "!unknown notice async event received, "
2491 			    "info = 0x%x, logpage = 0x%x", event.b.ae_info,
2492 			    event.b.ae_logpage);
2493 			atomic_inc_32(&nvme->n_unknown_event);
2494 			break;
2495 		}
2496 		break;
2497 
2498 	case NVME_ASYNC_TYPE_VENDOR:
2499 		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
2500 		    "received, info = 0x%x, logpage = 0x%x", event.b.ae_info,
2501 		    event.b.ae_logpage);
2502 		atomic_inc_32(&nvme->n_vendor_event);
2503 		break;
2504 
2505 	default:
2506 		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
2507 		    "type = 0x%x, info = 0x%x, logpage = 0x%x", event.b.ae_type,
2508 		    event.b.ae_info, event.b.ae_logpage);
2509 		atomic_inc_32(&nvme->n_unknown_event);
2510 		break;
2511 	}
2512 
2513 	if (error_log != NULL)
2514 		kmem_free(error_log, logsize);
2515 
2516 	if (health_log != NULL)
2517 		kmem_free(health_log, logsize);
2518 
2519 	if (nslist != NULL)
2520 		kmem_free(nslist, logsize);
2521 }
2522 
2523 static void
2524 nvme_admin_cmd(nvme_cmd_t *cmd, uint32_t sec)
2525 {
2526 	mutex_enter(&cmd->nc_mutex);
2527 	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
2528 	nvme_wait_cmd(cmd, sec);
2529 	mutex_exit(&cmd->nc_mutex);
2530 }
2531 
2532 static void
2533 nvme_async_event(nvme_t *nvme)
2534 {
2535 	nvme_cmd_t *cmd;
2536 
2537 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2538 	cmd->nc_sqid = 0;
2539 	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
2540 	cmd->nc_callback = nvme_async_event_task;
2541 	cmd->nc_dontpanic = B_TRUE;
2542 
2543 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2544 }
2545 
2546 /*
2547  * There are commands such as format or vendor unique commands that are going to
2548  * manipulate the data in a namespace or destroy them, we make sure that none of
2549  * the ones that will be impacted are actually attached.
2550  */
2551 static boolean_t
2552 nvme_no_blkdev_attached(nvme_t *nvme, uint32_t nsid)
2553 {
2554 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
2555 	ASSERT3U(nsid, !=, 0);
2556 
2557 	if (nsid != NVME_NSID_BCAST) {
2558 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
2559 		return (!ns->ns_attached);
2560 	}
2561 
2562 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
2563 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
2564 
2565 		if (ns->ns_attached) {
2566 			return (B_FALSE);
2567 		}
2568 	}
2569 
2570 	return (B_TRUE);
2571 }
2572 
2573 static boolean_t
2574 nvme_format_nvm(nvme_t *nvme, nvme_ioctl_format_t *ioc)
2575 {
2576 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2577 	nvme_format_nvm_t format_nvm = { 0 };
2578 	boolean_t ret;
2579 
2580 	format_nvm.b.fm_lbaf = bitx32(ioc->nif_lbaf, 3, 0);
2581 	format_nvm.b.fm_ses = bitx32(ioc->nif_ses, 2, 0);
2582 
2583 	cmd->nc_sqid = 0;
2584 	cmd->nc_callback = nvme_wakeup_cmd;
2585 	cmd->nc_sqe.sqe_nsid = ioc->nif_common.nioc_nsid;
2586 	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
2587 	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
2588 
2589 	/*
2590 	 * We don't want to panic on any format commands. There are two reasons
2591 	 * for this:
2592 	 *
2593 	 * 1) All format commands are initiated by users. We don't want to panic
2594 	 * on user commands.
2595 	 *
2596 	 * 2) Several devices like the Samsung SM951 don't allow formatting of
2597 	 * all namespaces in one command and we'd prefer to handle that
2598 	 * gracefully.
2599 	 */
2600 	cmd->nc_dontpanic = B_TRUE;
2601 
2602 	nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
2603 
2604 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nif_common) != 0) {
2605 		dev_err(nvme->n_dip, CE_WARN,
2606 		    "!FORMAT failed with sct = %x, sc = %x",
2607 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2608 		ret = B_FALSE;
2609 		goto fail;
2610 	}
2611 
2612 	ret = B_TRUE;
2613 fail:
2614 	nvme_free_cmd(cmd);
2615 	return (ret);
2616 }
2617 
2618 /*
2619  * Retrieve a specific log page. The contents of the log page request should
2620  * have already been validated by the system.
2621  */
2622 static boolean_t
2623 nvme_get_logpage(nvme_t *nvme, boolean_t user, nvme_ioctl_get_logpage_t *log,
2624     void **buf)
2625 {
2626 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2627 	nvme_getlogpage_dw10_t dw10;
2628 	uint32_t offlo, offhi;
2629 	nvme_getlogpage_dw11_t dw11;
2630 	nvme_getlogpage_dw14_t dw14;
2631 	uint32_t ndw;
2632 	boolean_t ret = B_FALSE;
2633 
2634 	bzero(&dw10, sizeof (dw10));
2635 	bzero(&dw11, sizeof (dw11));
2636 	bzero(&dw14, sizeof (dw14));
2637 
2638 	cmd->nc_sqid = 0;
2639 	cmd->nc_callback = nvme_wakeup_cmd;
2640 	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
2641 	cmd->nc_sqe.sqe_nsid = log->nigl_common.nioc_nsid;
2642 
2643 	if (user)
2644 		cmd->nc_dontpanic = B_TRUE;
2645 
2646 	/*
2647 	 * The size field is the number of double words, but is a zeros based
2648 	 * value. We need to store our actual value minus one.
2649 	 */
2650 	ndw = (uint32_t)(log->nigl_len / 4);
2651 	ASSERT3U(ndw, >, 0);
2652 	ndw--;
2653 
2654 	dw10.b.lp_lid = bitx32(log->nigl_lid, 7, 0);
2655 	dw10.b.lp_lsp = bitx32(log->nigl_lsp, 6, 0);
2656 	dw10.b.lp_rae = bitx32(log->nigl_lsp, 0, 0);
2657 	dw10.b.lp_lnumdl = bitx32(ndw, 15, 0);
2658 
2659 	dw11.b.lp_numdu = bitx32(ndw, 31, 16);
2660 	dw11.b.lp_lsi = bitx32(log->nigl_lsi, 15, 0);
2661 
2662 	offlo = bitx64(log->nigl_offset, 31, 0);
2663 	offhi = bitx64(log->nigl_offset, 63, 32);
2664 
2665 	dw14.b.lp_csi = bitx32(log->nigl_csi, 7, 0);
2666 
2667 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2668 	cmd->nc_sqe.sqe_cdw11 = dw11.r;
2669 	cmd->nc_sqe.sqe_cdw12 = offlo;
2670 	cmd->nc_sqe.sqe_cdw13 = offhi;
2671 	cmd->nc_sqe.sqe_cdw14 = dw14.r;
2672 
2673 	if (nvme_zalloc_dma(nvme, log->nigl_len, DDI_DMA_READ,
2674 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2675 		dev_err(nvme->n_dip, CE_WARN,
2676 		    "!nvme_zalloc_dma failed for GET LOG PAGE");
2677 		ret = nvme_ioctl_error(&log->nigl_common,
2678 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2679 		goto fail;
2680 	}
2681 
2682 	if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
2683 		ret = nvme_ioctl_error(&log->nigl_common,
2684 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2685 		goto fail;
2686 	}
2687 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2688 
2689 	if (!nvme_check_cmd_status_ioctl(cmd, &log->nigl_common)) {
2690 		if (!user) {
2691 			dev_err(nvme->n_dip, CE_WARN,
2692 			    "!GET LOG PAGE failed with sct = %x, sc = %x",
2693 			    cmd->nc_cqe.cqe_sf.sf_sct,
2694 			    cmd->nc_cqe.cqe_sf.sf_sc);
2695 		}
2696 		ret = B_FALSE;
2697 		goto fail;
2698 	}
2699 
2700 	*buf = kmem_alloc(log->nigl_len, KM_SLEEP);
2701 	bcopy(cmd->nc_dma->nd_memp, *buf, log->nigl_len);
2702 
2703 	ret = B_TRUE;
2704 fail:
2705 	nvme_free_cmd(cmd);
2706 
2707 	return (ret);
2708 }
2709 
2710 /*
2711  * This is an internal wrapper for when the kernel wants to get a log page.
2712  * Currently this assumes that the only thing that is required is the log page
2713  * ID. If more information is required, we'll be better served to just use the
2714  * general ioctl interface.
2715  */
2716 static boolean_t
2717 nvme_get_logpage_int(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
2718     uint8_t lid)
2719 {
2720 	const nvme_log_page_info_t *info = NULL;
2721 	nvme_ioctl_get_logpage_t log;
2722 	nvme_valid_ctrl_data_t data;
2723 	boolean_t bret;
2724 	bool var;
2725 
2726 	for (size_t i = 0; i < nvme_std_log_npages; i++) {
2727 		if (nvme_std_log_pages[i].nlpi_lid == lid &&
2728 		    nvme_std_log_pages[i].nlpi_csi == NVME_CSI_NVM) {
2729 			info = &nvme_std_log_pages[i];
2730 			break;
2731 		}
2732 	}
2733 
2734 	if (info == NULL) {
2735 		return (B_FALSE);
2736 	}
2737 
2738 	data.vcd_vers = &nvme->n_version;
2739 	data.vcd_id = nvme->n_idctl;
2740 	bzero(&log, sizeof (log));
2741 	log.nigl_common.nioc_nsid = NVME_NSID_BCAST;
2742 	log.nigl_csi = info->nlpi_csi;
2743 	log.nigl_lid = info->nlpi_lid;
2744 	log.nigl_len = nvme_log_page_info_size(info, &data, &var);
2745 
2746 	/*
2747 	 * We only support getting standard fixed-length log pages through the
2748 	 * kernel interface at this time. If a log page either has an unknown
2749 	 * size or has a variable length, then we cannot get it.
2750 	 */
2751 	if (log.nigl_len == 0 || var) {
2752 		return (B_FALSE);
2753 	}
2754 
2755 	bret = nvme_get_logpage(nvme, user, &log, buf);
2756 	if (!bret) {
2757 		return (B_FALSE);
2758 	}
2759 
2760 	*bufsize = log.nigl_len;
2761 	return (B_TRUE);
2762 }
2763 
2764 static boolean_t
2765 nvme_identify(nvme_t *nvme, boolean_t user, nvme_ioctl_identify_t *ioc,
2766     void **buf)
2767 {
2768 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2769 	boolean_t ret = B_FALSE;
2770 	nvme_identify_dw10_t dw10;
2771 
2772 	ASSERT3P(buf, !=, NULL);
2773 
2774 	bzero(&dw10, sizeof (dw10));
2775 
2776 	cmd->nc_sqid = 0;
2777 	cmd->nc_callback = nvme_wakeup_cmd;
2778 	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
2779 	cmd->nc_sqe.sqe_nsid = ioc->nid_common.nioc_nsid;
2780 
2781 	dw10.b.id_cns = bitx32(ioc->nid_cns, 7, 0);
2782 	dw10.b.id_cntid = bitx32(ioc->nid_ctrlid, 15, 0);
2783 
2784 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2785 
2786 	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
2787 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2788 		dev_err(nvme->n_dip, CE_WARN,
2789 		    "!nvme_zalloc_dma failed for IDENTIFY");
2790 		ret = nvme_ioctl_error(&ioc->nid_common,
2791 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2792 		goto fail;
2793 	}
2794 
2795 	if (cmd->nc_dma->nd_ncookie > 2) {
2796 		dev_err(nvme->n_dip, CE_WARN,
2797 		    "!too many DMA cookies for IDENTIFY");
2798 		atomic_inc_32(&nvme->n_too_many_cookies);
2799 		ret = nvme_ioctl_error(&ioc->nid_common,
2800 		    NVME_IOCTL_E_BAD_PRP, 0, 0);
2801 		goto fail;
2802 	}
2803 
2804 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2805 	if (cmd->nc_dma->nd_ncookie > 1) {
2806 		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2807 		    &cmd->nc_dma->nd_cookie);
2808 		cmd->nc_sqe.sqe_dptr.d_prp[1] =
2809 		    cmd->nc_dma->nd_cookie.dmac_laddress;
2810 	}
2811 
2812 	if (user)
2813 		cmd->nc_dontpanic = B_TRUE;
2814 
2815 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2816 
2817 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nid_common)) {
2818 		dev_err(nvme->n_dip, CE_WARN,
2819 		    "!IDENTIFY failed with sct = %x, sc = %x",
2820 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2821 		ret = B_FALSE;
2822 		goto fail;
2823 	}
2824 
2825 	*buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
2826 	bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
2827 	ret = B_TRUE;
2828 
2829 fail:
2830 	nvme_free_cmd(cmd);
2831 
2832 	return (ret);
2833 }
2834 
2835 static boolean_t
2836 nvme_identify_int(nvme_t *nvme, uint32_t nsid, uint8_t cns, void **buf)
2837 {
2838 	nvme_ioctl_identify_t id;
2839 
2840 	bzero(&id, sizeof (nvme_ioctl_identify_t));
2841 	id.nid_common.nioc_nsid = nsid;
2842 	id.nid_cns = cns;
2843 
2844 	return (nvme_identify(nvme, B_FALSE, &id, buf));
2845 }
2846 
2847 static int
2848 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2849     uint32_t val, uint32_t *res)
2850 {
2851 	_NOTE(ARGUNUSED(nsid));
2852 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2853 	int ret = EINVAL;
2854 
2855 	ASSERT(res != NULL);
2856 
2857 	cmd->nc_sqid = 0;
2858 	cmd->nc_callback = nvme_wakeup_cmd;
2859 	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
2860 	cmd->nc_sqe.sqe_cdw10 = feature;
2861 	cmd->nc_sqe.sqe_cdw11 = val;
2862 
2863 	if (user)
2864 		cmd->nc_dontpanic = B_TRUE;
2865 
2866 	switch (feature) {
2867 	case NVME_FEAT_WRITE_CACHE:
2868 		if (!nvme->n_write_cache_present)
2869 			goto fail;
2870 		break;
2871 
2872 	case NVME_FEAT_NQUEUES:
2873 		break;
2874 
2875 	default:
2876 		goto fail;
2877 	}
2878 
2879 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2880 
2881 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2882 		dev_err(nvme->n_dip, CE_WARN,
2883 		    "!SET FEATURES %d failed with sct = %x, sc = %x",
2884 		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
2885 		    cmd->nc_cqe.cqe_sf.sf_sc);
2886 		goto fail;
2887 	}
2888 
2889 	*res = cmd->nc_cqe.cqe_dw0;
2890 
2891 fail:
2892 	nvme_free_cmd(cmd);
2893 	return (ret);
2894 }
2895 
2896 static int
2897 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2898 {
2899 	nvme_write_cache_t nwc = { 0 };
2900 
2901 	if (enable)
2902 		nwc.b.wc_wce = 1;
2903 
2904 	/*
2905 	 * We've seen some cases where this fails due to us being told we've
2906 	 * specified an invalid namespace when operating against the Xen xcp-ng
2907 	 * qemu NVMe virtual device. As such, we generally ensure that trying to
2908 	 * enable this doesn't lead us to panic. It's not completely clear why
2909 	 * specifying namespace zero here fails, but not when we're setting the
2910 	 * number of queues below.
2911 	 */
2912 	return (nvme_set_features(nvme, B_TRUE, 0, NVME_FEAT_WRITE_CACHE,
2913 	    nwc.r, &nwc.r));
2914 }
2915 
2916 static int
2917 nvme_set_nqueues(nvme_t *nvme)
2918 {
2919 	nvme_nqueues_t nq = { 0 };
2920 	int ret;
2921 
2922 	/*
2923 	 * The default is to allocate one completion queue per vector.
2924 	 */
2925 	if (nvme->n_completion_queues == -1)
2926 		nvme->n_completion_queues = nvme->n_intr_cnt;
2927 
2928 	/*
2929 	 * There is no point in having more completion queues than
2930 	 * interrupt vectors.
2931 	 */
2932 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2933 	    nvme->n_intr_cnt);
2934 
2935 	/*
2936 	 * The default is to use one submission queue per completion queue.
2937 	 */
2938 	if (nvme->n_submission_queues == -1)
2939 		nvme->n_submission_queues = nvme->n_completion_queues;
2940 
2941 	/*
2942 	 * There is no point in having more completion queues than
2943 	 * submission queues.
2944 	 */
2945 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2946 	    nvme->n_submission_queues);
2947 
2948 	ASSERT(nvme->n_submission_queues > 0);
2949 	ASSERT(nvme->n_completion_queues > 0);
2950 
2951 	nq.b.nq_nsq = nvme->n_submission_queues - 1;
2952 	nq.b.nq_ncq = nvme->n_completion_queues - 1;
2953 
2954 	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2955 	    &nq.r);
2956 
2957 	if (ret == 0) {
2958 		/*
2959 		 * Never use more than the requested number of queues.
2960 		 */
2961 		nvme->n_submission_queues = MIN(nvme->n_submission_queues,
2962 		    nq.b.nq_nsq + 1);
2963 		nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2964 		    nq.b.nq_ncq + 1);
2965 	}
2966 
2967 	return (ret);
2968 }
2969 
2970 static int
2971 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
2972 {
2973 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2974 	nvme_create_queue_dw10_t dw10 = { 0 };
2975 	nvme_create_cq_dw11_t c_dw11 = { 0 };
2976 	int ret;
2977 
2978 	dw10.b.q_qid = cq->ncq_id;
2979 	dw10.b.q_qsize = cq->ncq_nentry - 1;
2980 
2981 	c_dw11.b.cq_pc = 1;
2982 	c_dw11.b.cq_ien = 1;
2983 	c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
2984 
2985 	cmd->nc_sqid = 0;
2986 	cmd->nc_callback = nvme_wakeup_cmd;
2987 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
2988 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2989 	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2990 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
2991 
2992 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2993 
2994 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2995 		dev_err(nvme->n_dip, CE_WARN,
2996 		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
2997 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2998 	}
2999 
3000 	nvme_free_cmd(cmd);
3001 
3002 	return (ret);
3003 }
3004 
3005 static int
3006 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
3007 {
3008 	nvme_cq_t *cq = qp->nq_cq;
3009 	nvme_cmd_t *cmd;
3010 	nvme_create_queue_dw10_t dw10 = { 0 };
3011 	nvme_create_sq_dw11_t s_dw11 = { 0 };
3012 	int ret;
3013 
3014 	/*
3015 	 * It is possible to have more qpairs than completion queues,
3016 	 * and when the idx > ncq_id, that completion queue is shared
3017 	 * and has already been created.
3018 	 */
3019 	if (idx <= cq->ncq_id &&
3020 	    nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
3021 		return (DDI_FAILURE);
3022 
3023 	dw10.b.q_qid = idx;
3024 	dw10.b.q_qsize = qp->nq_nentry - 1;
3025 
3026 	s_dw11.b.sq_pc = 1;
3027 	s_dw11.b.sq_cqid = cq->ncq_id;
3028 
3029 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
3030 	cmd->nc_sqid = 0;
3031 	cmd->nc_callback = nvme_wakeup_cmd;
3032 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
3033 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3034 	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
3035 	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
3036 
3037 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3038 
3039 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3040 		dev_err(nvme->n_dip, CE_WARN,
3041 		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
3042 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3043 	}
3044 
3045 	nvme_free_cmd(cmd);
3046 
3047 	return (ret);
3048 }
3049 
3050 static boolean_t
3051 nvme_reset(nvme_t *nvme, boolean_t quiesce)
3052 {
3053 	nvme_reg_csts_t csts;
3054 	int i;
3055 
3056 	nvme_put32(nvme, NVME_REG_CC, 0);
3057 
3058 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3059 	if (csts.b.csts_rdy == 1) {
3060 		nvme_put32(nvme, NVME_REG_CC, 0);
3061 
3062 		/*
3063 		 * The timeout value is from the Controller Capabilities
3064 		 * register (CAP.TO, section 3.1.1). This is the worst case
3065 		 * time to wait for CSTS.RDY to transition from 1 to 0 after
3066 		 * CC.EN transitions from 1 to 0.
3067 		 *
3068 		 * The timeout units are in 500 ms units, and we are delaying
3069 		 * in 50ms chunks, hence counting to n_timeout * 10.
3070 		 */
3071 		for (i = 0; i < nvme->n_timeout * 10; i++) {
3072 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3073 			if (csts.b.csts_rdy == 0)
3074 				break;
3075 
3076 			/*
3077 			 * Quiescing drivers should not use locks or timeouts,
3078 			 * so if this is the quiesce path, use a quiesce-safe
3079 			 * delay.
3080 			 */
3081 			if (quiesce) {
3082 				drv_usecwait(50000);
3083 			} else {
3084 				delay(drv_usectohz(50000));
3085 			}
3086 		}
3087 	}
3088 
3089 	nvme_put32(nvme, NVME_REG_AQA, 0);
3090 	nvme_put32(nvme, NVME_REG_ASQ, 0);
3091 	nvme_put32(nvme, NVME_REG_ACQ, 0);
3092 
3093 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3094 	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
3095 }
3096 
3097 static void
3098 nvme_shutdown(nvme_t *nvme, boolean_t quiesce)
3099 {
3100 	nvme_reg_cc_t cc;
3101 	nvme_reg_csts_t csts;
3102 	int i;
3103 
3104 	cc.r = nvme_get32(nvme, NVME_REG_CC);
3105 	cc.b.cc_shn = NVME_CC_SHN_NORMAL;
3106 	nvme_put32(nvme, NVME_REG_CC, cc.r);
3107 
3108 	for (i = 0; i < 10; i++) {
3109 		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3110 		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
3111 			break;
3112 
3113 		if (quiesce) {
3114 			drv_usecwait(100000);
3115 		} else {
3116 			delay(drv_usectohz(100000));
3117 		}
3118 	}
3119 }
3120 
/*
 * Return the length of a space-padded, fixed-size string field with the
 * trailing spaces removed. 'str' does not need to be NUL-terminated; 'len' is
 * the size of the field.
 *
 * A non-positive 'len' or a field that consists only of spaces yields 0. The
 * scan never looks at bytes in front of 'str'.
 */
static int
nvme_strlen(const char *str, int len)
{
	if (len <= 0)
		return (0);

	while (len > 0 && str[len - 1] == ' ')
		len--;

	return (len);
}
3135 
3136 static void
3137 nvme_config_min_block_size(nvme_t *nvme, char *model, char *val)
3138 {
3139 	ulong_t bsize = 0;
3140 	char *msg = "";
3141 
3142 	if (ddi_strtoul(val, NULL, 0, &bsize) != 0)
3143 		goto err;
3144 
3145 	if (!ISP2(bsize)) {
3146 		msg = ": not a power of 2";
3147 		goto err;
3148 	}
3149 
3150 	if (bsize < NVME_DEFAULT_MIN_BLOCK_SIZE) {
3151 		msg = ": too low";
3152 		goto err;
3153 	}
3154 
3155 	nvme->n_min_block_size = bsize;
3156 	return;
3157 
3158 err:
3159 	dev_err(nvme->n_dip, CE_WARN,
3160 	    "!nvme-config-list: ignoring invalid min-phys-block-size '%s' "
3161 	    "for model '%s'%s", val, model, msg);
3162 
3163 	nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3164 }
3165 
3166 static void
3167 nvme_config_boolean(nvme_t *nvme, char *model, char *name, char *val,
3168     boolean_t *b)
3169 {
3170 	if (strcmp(val, "on") == 0 ||
3171 	    strcmp(val, "true") == 0)
3172 		*b = B_TRUE;
3173 	else if (strcmp(val, "off") == 0 ||
3174 	    strcmp(val, "false") == 0)
3175 		*b = B_FALSE;
3176 	else
3177 		dev_err(nvme->n_dip, CE_WARN,
3178 		    "!nvme-config-list: invalid value for %s '%s'"
3179 		    " for model '%s', ignoring", name, val, model);
3180 }
3181 
3182 static void
3183 nvme_config_list(nvme_t *nvme)
3184 {
3185 	char	**config_list;
3186 	uint_t	nelem;
3187 	int	rv, i;
3188 
3189 	/*
3190 	 * We're following the pattern of 'sd-config-list' here, but extend it.
3191 	 * Instead of two we have three separate strings for "model", "fwrev",
3192 	 * and "name-value-list".
3193 	 */
3194 	rv = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, nvme->n_dip,
3195 	    DDI_PROP_DONTPASS, "nvme-config-list", &config_list, &nelem);
3196 
3197 	if (rv != DDI_PROP_SUCCESS) {
3198 		if (rv == DDI_PROP_CANNOT_DECODE) {
3199 			dev_err(nvme->n_dip, CE_WARN,
3200 			    "!nvme-config-list: cannot be decoded");
3201 		}
3202 
3203 		return;
3204 	}
3205 
3206 	if ((nelem % 3) != 0) {
3207 		dev_err(nvme->n_dip, CE_WARN, "!nvme-config-list: must be "
3208 		    "triplets of <model>/<fwrev>/<name-value-list> strings ");
3209 		goto out;
3210 	}
3211 
3212 	for (i = 0; i < nelem; i += 3) {
3213 		char	*model = config_list[i];
3214 		char	*fwrev = config_list[i + 1];
3215 		char	*nvp, *save_nv;
3216 		int	id_model_len, id_fwrev_len;
3217 
3218 		id_model_len = nvme_strlen(nvme->n_idctl->id_model,
3219 		    sizeof (nvme->n_idctl->id_model));
3220 
3221 		if (strlen(model) != id_model_len)
3222 			continue;
3223 
3224 		if (strncmp(model, nvme->n_idctl->id_model, id_model_len) != 0)
3225 			continue;
3226 
3227 		id_fwrev_len = nvme_strlen(nvme->n_idctl->id_fwrev,
3228 		    sizeof (nvme->n_idctl->id_fwrev));
3229 
3230 		if (strlen(fwrev) != 0) {
3231 			boolean_t match = B_FALSE;
3232 			char *fwr, *last_fw;
3233 
3234 			for (fwr = strtok_r(fwrev, ",", &last_fw);
3235 			    fwr != NULL;
3236 			    fwr = strtok_r(NULL, ",", &last_fw)) {
3237 				if (strlen(fwr) != id_fwrev_len)
3238 					continue;
3239 
3240 				if (strncmp(fwr, nvme->n_idctl->id_fwrev,
3241 				    id_fwrev_len) == 0)
3242 					match = B_TRUE;
3243 			}
3244 
3245 			if (!match)
3246 				continue;
3247 		}
3248 
3249 		/*
3250 		 * We should now have a comma-separated list of name:value
3251 		 * pairs.
3252 		 */
3253 		for (nvp = strtok_r(config_list[i + 2], ",", &save_nv);
3254 		    nvp != NULL; nvp = strtok_r(NULL, ",", &save_nv)) {
3255 			char	*name = nvp;
3256 			char	*val = strchr(nvp, ':');
3257 
3258 			if (val == NULL || name == val) {
3259 				dev_err(nvme->n_dip, CE_WARN,
3260 				    "!nvme-config-list: <name-value-list> "
3261 				    "for model '%s' is malformed", model);
3262 				goto out;
3263 			}
3264 
3265 			/*
3266 			 * Null-terminate 'name', move 'val' past ':' sep.
3267 			 */
3268 			*val++ = '\0';
3269 
3270 			/*
3271 			 * Process the name:val pairs that we know about.
3272 			 */
3273 			if (strcmp(name, "ignore-unknown-vendor-status") == 0) {
3274 				nvme_config_boolean(nvme, model, name, val,
3275 				    &nvme->n_ignore_unknown_vendor_status);
3276 			} else if (strcmp(name, "min-phys-block-size") == 0) {
3277 				nvme_config_min_block_size(nvme, model, val);
3278 			} else if (strcmp(name, "volatile-write-cache") == 0) {
3279 				nvme_config_boolean(nvme, model, name, val,
3280 				    &nvme->n_write_cache_enabled);
3281 			} else {
3282 				/*
3283 				 * Unknown 'name'.
3284 				 */
3285 				dev_err(nvme->n_dip, CE_WARN,
3286 				    "!nvme-config-list: unknown config '%s' "
3287 				    "for model '%s', ignoring", name, model);
3288 			}
3289 		}
3290 	}
3291 
3292 out:
3293 	ddi_prop_free(config_list);
3294 }
3295 
3296 static void
3297 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
3298 {
3299 	/*
3300 	 * Section 7.7 of the spec describes how to get a unique ID for
3301 	 * the controller: the vendor ID, the model name and the serial
3302 	 * number shall be unique when combined.
3303 	 *
3304 	 * If a namespace has no EUI64 we use the above and add the hex
3305 	 * namespace ID to get a unique ID for the namespace.
3306 	 */
3307 	char model[sizeof (nvme->n_idctl->id_model) + 1];
3308 	char serial[sizeof (nvme->n_idctl->id_serial) + 1];
3309 
3310 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
3311 	bcopy(nvme->n_idctl->id_serial, serial,
3312 	    sizeof (nvme->n_idctl->id_serial));
3313 
3314 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
3315 	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
3316 
3317 	nvme_nsid2ns(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X",
3318 	    nvme->n_idctl->id_vid, model, serial, nsid);
3319 }
3320 
3321 static nvme_identify_nsid_list_t *
3322 nvme_update_nsid_list(nvme_t *nvme, int cns)
3323 {
3324 	nvme_identify_nsid_list_t *nslist;
3325 
3326 	/*
3327 	 * We currently don't handle cases where there are more than
3328 	 * 1024 active namespaces, requiring several IDENTIFY commands.
3329 	 */
3330 	if (nvme_identify_int(nvme, 0, cns, (void **)&nslist))
3331 		return (nslist);
3332 
3333 	return (NULL);
3334 }
3335 
3336 nvme_namespace_t *
3337 nvme_nsid2ns(nvme_t *nvme, uint32_t nsid)
3338 {
3339 	ASSERT3U(nsid, !=, 0);
3340 	ASSERT3U(nsid, <=, nvme->n_namespace_count);
3341 	return (&nvme->n_ns[nsid - 1]);
3342 }
3343 
3344 static boolean_t
3345 nvme_allocated_ns(nvme_namespace_t *ns)
3346 {
3347 	nvme_t *nvme = ns->ns_nvme;
3348 	uint32_t i;
3349 
3350 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3351 
3352 	/*
3353 	 * If supported, update the list of allocated namespace IDs.
3354 	 */
3355 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2) &&
3356 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
3357 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
3358 		    NVME_IDENTIFY_NSID_ALLOC_LIST);
3359 		boolean_t found = B_FALSE;
3360 
3361 		/*
3362 		 * When namespace management is supported, this really shouldn't
3363 		 * be NULL. Treat all namespaces as allocated if it is.
3364 		 */
3365 		if (nslist == NULL)
3366 			return (B_TRUE);
3367 
3368 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
3369 			if (ns->ns_id == 0)
3370 				break;
3371 
3372 			if (ns->ns_id == nslist->nl_nsid[i])
3373 				found = B_TRUE;
3374 		}
3375 
3376 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
3377 		return (found);
3378 	} else {
3379 		/*
3380 		 * If namespace management isn't supported, report all
3381 		 * namespaces as allocated.
3382 		 */
3383 		return (B_TRUE);
3384 	}
3385 }
3386 
3387 static boolean_t
3388 nvme_active_ns(nvme_namespace_t *ns)
3389 {
3390 	nvme_t *nvme = ns->ns_nvme;
3391 	uint64_t *ptr;
3392 	uint32_t i;
3393 
3394 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3395 
3396 	/*
3397 	 * If supported, update the list of active namespace IDs.
3398 	 */
3399 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) {
3400 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
3401 		    NVME_IDENTIFY_NSID_LIST);
3402 		boolean_t found = B_FALSE;
3403 
3404 		/*
3405 		 * When namespace management is supported, this really shouldn't
3406 		 * be NULL. Treat all namespaces as allocated if it is.
3407 		 */
3408 		if (nslist == NULL)
3409 			return (B_TRUE);
3410 
3411 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
3412 			if (ns->ns_id == 0)
3413 				break;
3414 
3415 			if (ns->ns_id == nslist->nl_nsid[i])
3416 				found = B_TRUE;
3417 		}
3418 
3419 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
3420 		return (found);
3421 	}
3422 
3423 	/*
3424 	 * Workaround for revision 1.0:
3425 	 * Check whether the IDENTIFY NAMESPACE data is zero-filled.
3426 	 */
3427 	for (ptr = (uint64_t *)ns->ns_idns;
3428 	    ptr != (uint64_t *)(ns->ns_idns + 1);
3429 	    ptr++) {
3430 		if (*ptr != 0) {
3431 			return (B_TRUE);
3432 		}
3433 	}
3434 
3435 	return (B_FALSE);
3436 }
3437 
3438 static int
3439 nvme_init_ns(nvme_t *nvme, uint32_t nsid)
3440 {
3441 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
3442 	nvme_identify_nsid_t *idns;
3443 	boolean_t was_ignored;
3444 	int last_rp;
3445 
3446 	ns->ns_nvme = nvme;
3447 
3448 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3449 
3450 	/*
3451 	 * Because we might rescan a namespace and this will fail after boot
3452 	 * that'd leave us in a bad spot. We need to do something about this
3453 	 * longer term, but it's not clear how exactly we would recover right
3454 	 * now.
3455 	 */
3456 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
3457 	    (void **)&idns)) {
3458 		dev_err(nvme->n_dip, CE_WARN,
3459 		    "!failed to identify namespace %d", nsid);
3460 		return (DDI_FAILURE);
3461 	}
3462 
3463 	if (ns->ns_idns != NULL)
3464 		kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t));
3465 
3466 	ns->ns_idns = idns;
3467 	ns->ns_id = nsid;
3468 
3469 	was_ignored = ns->ns_ignore;
3470 
3471 	ns->ns_allocated = nvme_allocated_ns(ns);
3472 	ns->ns_active = nvme_active_ns(ns);
3473 
3474 	ns->ns_block_count = idns->id_nsize;
3475 	ns->ns_block_size =
3476 	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
3477 	ns->ns_best_block_size = ns->ns_block_size;
3478 
3479 	/*
3480 	 * Get the EUI64 if present.
3481 	 */
3482 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
3483 		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
3484 
3485 	/*
3486 	 * Get the NGUID if present.
3487 	 */
3488 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2))
3489 		bcopy(idns->id_nguid, ns->ns_nguid, sizeof (ns->ns_nguid));
3490 
3491 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
3492 	if (*(uint64_t *)ns->ns_eui64 == 0)
3493 		nvme_prepare_devid(nvme, ns->ns_id);
3494 
3495 	(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%u", ns->ns_id);
3496 
3497 	/*
3498 	 * Find the LBA format with no metadata and the best relative
3499 	 * performance. A value of 3 means "degraded", 0 is best.
3500 	 */
3501 	last_rp = 3;
3502 	for (int j = 0; j <= idns->id_nlbaf; j++) {
3503 		if (idns->id_lbaf[j].lbaf_lbads == 0)
3504 			break;
3505 		if (idns->id_lbaf[j].lbaf_ms != 0)
3506 			continue;
3507 		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
3508 			continue;
3509 		last_rp = idns->id_lbaf[j].lbaf_rp;
3510 		ns->ns_best_block_size =
3511 		    1 << idns->id_lbaf[j].lbaf_lbads;
3512 	}
3513 
3514 	if (ns->ns_best_block_size < nvme->n_min_block_size)
3515 		ns->ns_best_block_size = nvme->n_min_block_size;
3516 
3517 	was_ignored = ns->ns_ignore;
3518 
3519 	/*
3520 	 * We currently don't support namespaces that are inactive, or use
3521 	 * either:
3522 	 * - protection information
3523 	 * - illegal block size (< 512)
3524 	 */
3525 	if (!ns->ns_active) {
3526 		ns->ns_ignore = B_TRUE;
3527 	} else if (idns->id_dps.dp_pinfo) {
3528 		dev_err(nvme->n_dip, CE_WARN,
3529 		    "!ignoring namespace %d, unsupported feature: "
3530 		    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
3531 		ns->ns_ignore = B_TRUE;
3532 	} else if (ns->ns_block_size < 512) {
3533 		dev_err(nvme->n_dip, CE_WARN,
3534 		    "!ignoring namespace %d, unsupported block size %"PRIu64,
3535 		    nsid, (uint64_t)ns->ns_block_size);
3536 		ns->ns_ignore = B_TRUE;
3537 	} else {
3538 		ns->ns_ignore = B_FALSE;
3539 	}
3540 
3541 	/*
3542 	 * Keep a count of namespaces which are attachable.
3543 	 * See comments in nvme_bd_driveinfo() to understand its effect.
3544 	 */
3545 	if (was_ignored) {
3546 		/*
3547 		 * Previously ignored, but now not. Count it.
3548 		 */
3549 		if (!ns->ns_ignore)
3550 			nvme->n_namespaces_attachable++;
3551 	} else {
3552 		/*
3553 		 * Wasn't ignored previously, but now needs to be.
3554 		 * Discount it.
3555 		 */
3556 		if (ns->ns_ignore)
3557 			nvme->n_namespaces_attachable--;
3558 	}
3559 
3560 	return (DDI_SUCCESS);
3561 }
3562 
3563 static boolean_t
3564 nvme_attach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
3565 {
3566 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
3567 
3568 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3569 
3570 	if (ns->ns_ignore) {
3571 		return (nvme_ioctl_error(com, NVME_IOCTL_E_UNSUP_ATTACH_NS,
3572 		    0, 0));
3573 	}
3574 
3575 	if (ns->ns_bd_hdl == NULL) {
3576 		bd_ops_t ops = nvme_bd_ops;
3577 
3578 		if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
3579 			ops.o_free_space = NULL;
3580 
3581 		ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr,
3582 		    KM_SLEEP);
3583 
3584 		if (ns->ns_bd_hdl == NULL) {
3585 			dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev "
3586 			    "handle for namespace id %u", com->nioc_nsid);
3587 			return (nvme_ioctl_error(com,
3588 			    NVME_IOCTL_E_BLKDEV_ATTACH, 0, 0));
3589 		}
3590 	}
3591 
3592 	if (bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl) != DDI_SUCCESS) {
3593 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_ATTACH,
3594 		    0, 0));
3595 	}
3596 
3597 	ns->ns_attached = B_TRUE;
3598 
3599 	return (B_TRUE);
3600 }
3601 
3602 static boolean_t
3603 nvme_detach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
3604 {
3605 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
3606 
3607 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3608 
3609 	if (ns->ns_ignore || !ns->ns_attached)
3610 		return (B_TRUE);
3611 
3612 	ASSERT3P(ns->ns_bd_hdl, !=, NULL);
3613 	if (bd_detach_handle(ns->ns_bd_hdl) != DDI_SUCCESS) {
3614 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_DETACH, 0,
3615 		    0));
3616 	}
3617 
3618 	ns->ns_attached = B_FALSE;
3619 	return (B_TRUE);
3620 
3621 }
3622 
3623 /*
3624  * Rescan the namespace information associated with the namespaces indicated by
3625  * ioc. They should not be attached to blkdev right now.
3626  */
3627 static void
3628 nvme_rescan_ns(nvme_t *nvme, uint32_t nsid)
3629 {
3630 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3631 	ASSERT3U(nsid, !=, 0);
3632 
3633 	if (nsid != NVME_NSID_BCAST) {
3634 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
3635 
3636 		ASSERT3U(ns->ns_attached, ==, B_FALSE);
3637 		(void) nvme_init_ns(nvme, nsid);
3638 		return;
3639 	}
3640 
3641 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
3642 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
3643 
3644 		ASSERT3U(ns->ns_attached, ==, B_FALSE);
3645 		(void) nvme_init_ns(nvme, i);
3646 	}
3647 }
3648 
3649 typedef struct nvme_quirk_table {
3650 	uint16_t nq_vendor_id;
3651 	uint16_t nq_device_id;
3652 	nvme_quirk_t nq_quirks;
3653 } nvme_quirk_table_t;
3654 
3655 static const nvme_quirk_table_t nvme_quirks[] = {
3656 	{ 0x1987, 0x5018, NVME_QUIRK_START_CID },	/* Phison E18 */
3657 };
3658 
3659 static void
3660 nvme_detect_quirks(nvme_t *nvme)
3661 {
3662 	for (uint_t i = 0; i < ARRAY_SIZE(nvme_quirks); i++) {
3663 		const nvme_quirk_table_t *nqt = &nvme_quirks[i];
3664 
3665 		if (nqt->nq_vendor_id == nvme->n_vendor_id &&
3666 		    nqt->nq_device_id == nvme->n_device_id) {
3667 			nvme->n_quirks = nqt->nq_quirks;
3668 			return;
3669 		}
3670 	}
3671 }
3672 
3673 static int
3674 nvme_init(nvme_t *nvme)
3675 {
3676 	nvme_reg_cc_t cc = { 0 };
3677 	nvme_reg_aqa_t aqa = { 0 };
3678 	nvme_reg_asq_t asq = { 0 };
3679 	nvme_reg_acq_t acq = { 0 };
3680 	nvme_reg_cap_t cap;
3681 	nvme_reg_vs_t vs;
3682 	nvme_reg_csts_t csts;
3683 	int i = 0;
3684 	uint16_t nqueues;
3685 	uint_t tq_threads;
3686 	char model[sizeof (nvme->n_idctl->id_model) + 1];
3687 	char *vendor, *product;
3688 	uint32_t nsid;
3689 
3690 	/* Check controller version */
3691 	vs.r = nvme_get32(nvme, NVME_REG_VS);
3692 	nvme->n_version.v_major = vs.b.vs_mjr;
3693 	nvme->n_version.v_minor = vs.b.vs_mnr;
3694 	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
3695 	    nvme->n_version.v_major, nvme->n_version.v_minor);
3696 
3697 	if (nvme->n_version.v_major > nvme_version_major) {
3698 		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
3699 		    nvme_version_major);
3700 		if (nvme->n_strict_version)
3701 			goto fail;
3702 	}
3703 
3704 	/* retrieve controller configuration */
3705 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
3706 
3707 	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
3708 		dev_err(nvme->n_dip, CE_WARN,
3709 		    "!NVM command set not supported by hardware");
3710 		goto fail;
3711 	}
3712 
3713 	nvme->n_nssr_supported = cap.b.cap_nssrs;
3714 	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
3715 	nvme->n_timeout = cap.b.cap_to;
3716 	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
3717 	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
3718 	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
3719 
3720 	/*
3721 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
3722 	 * the base page size of 4k (1<<12), so add 12 here to get the real
3723 	 * page size value.
3724 	 */
3725 	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
3726 	    cap.b.cap_mpsmax + 12);
3727 	nvme->n_pagesize = 1UL << (nvme->n_pageshift);
3728 
3729 	/*
3730 	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
3731 	 */
3732 	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
3733 	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
3734 
3735 	/*
3736 	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
3737 	 * Maxxfer may be increased after we identified the controller limits.
3738 	 */
3739 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
3740 	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
3741 	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
3742 	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;
3743 
3744 	/*
3745 	 * Reset controller if it's still in ready state.
3746 	 */
3747 	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
3748 		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
3749 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
3750 		nvme->n_dead = B_TRUE;
3751 		goto fail;
3752 	}
3753 
3754 	/*
3755 	 * Create the cq array with one completion queue to be assigned
3756 	 * to the admin queue pair and a limited number of taskqs (4).
3757 	 */
3758 	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
3759 	    DDI_SUCCESS) {
3760 		dev_err(nvme->n_dip, CE_WARN,
3761 		    "!failed to pre-allocate admin completion queue");
3762 		goto fail;
3763 	}
3764 	/*
3765 	 * Create the admin queue pair.
3766 	 */
3767 	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
3768 	    != DDI_SUCCESS) {
3769 		dev_err(nvme->n_dip, CE_WARN,
3770 		    "!unable to allocate admin qpair");
3771 		goto fail;
3772 	}
3773 	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
3774 	nvme->n_ioq[0] = nvme->n_adminq;
3775 
3776 	if (nvme->n_quirks & NVME_QUIRK_START_CID)
3777 		nvme->n_adminq->nq_next_cmd++;
3778 
3779 	nvme->n_progress |= NVME_ADMIN_QUEUE;
3780 
3781 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3782 	    "admin-queue-len", nvme->n_admin_queue_len);
3783 
3784 	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
3785 	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
3786 	acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
3787 
3788 	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
3789 	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
3790 
3791 	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
3792 	nvme_put64(nvme, NVME_REG_ASQ, asq);
3793 	nvme_put64(nvme, NVME_REG_ACQ, acq);
3794 
3795 	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
3796 	cc.b.cc_css = 0;	/* use NVM command set */
3797 	cc.b.cc_mps = nvme->n_pageshift - 12;
3798 	cc.b.cc_shn = 0;	/* no shutdown in progress */
3799 	cc.b.cc_en = 1;		/* enable controller */
3800 	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
3801 	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */
3802 
3803 	nvme_put32(nvme, NVME_REG_CC, cc.r);
3804 
3805 	/*
3806 	 * Wait for the controller to become ready.
3807 	 */
3808 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3809 	if (csts.b.csts_rdy == 0) {
3810 		for (i = 0; i != nvme->n_timeout * 10; i++) {
3811 			delay(drv_usectohz(50000));
3812 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3813 
3814 			if (csts.b.csts_cfs == 1) {
3815 				dev_err(nvme->n_dip, CE_WARN,
3816 				    "!controller fatal status at init");
3817 				ddi_fm_service_impact(nvme->n_dip,
3818 				    DDI_SERVICE_LOST);
3819 				nvme->n_dead = B_TRUE;
3820 				goto fail;
3821 			}
3822 
3823 			if (csts.b.csts_rdy == 1)
3824 				break;
3825 		}
3826 	}
3827 
3828 	if (csts.b.csts_rdy == 0) {
3829 		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
3830 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
3831 		nvme->n_dead = B_TRUE;
3832 		goto fail;
3833 	}
3834 
3835 	/*
3836 	 * Assume an abort command limit of 1. We'll destroy and re-init
3837 	 * that later when we know the true abort command limit.
3838 	 */
3839 	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
3840 
3841 	/*
3842 	 * Set up initial interrupt for admin queue.
3843 	 */
3844 	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
3845 	    != DDI_SUCCESS) &&
3846 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
3847 	    != DDI_SUCCESS) &&
3848 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
3849 	    != DDI_SUCCESS)) {
3850 		dev_err(nvme->n_dip, CE_WARN,
3851 		    "!failed to set up initial interrupt");
3852 		goto fail;
3853 	}
3854 
3855 	/*
3856 	 * Post an asynchronous event command to catch errors.
3857 	 * We assume the asynchronous events are supported as required by
3858 	 * specification (Figure 40 in section 5 of NVMe 1.2).
3859 	 * However, since at least qemu does not follow the specification,
3860 	 * we need a mechanism to protect ourselves.
3861 	 */
3862 	nvme->n_async_event_supported = B_TRUE;
3863 	nvme_async_event(nvme);
3864 
3865 	/*
3866 	 * Identify Controller
3867 	 */
3868 	if (!nvme_identify_int(nvme, 0, NVME_IDENTIFY_CTRL,
3869 	    (void **)&nvme->n_idctl)) {
3870 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify controller");
3871 		goto fail;
3872 	}
3873 
3874 	/*
3875 	 * Get the common namespace information if available. If not, we use the
3876 	 * information for nsid 1.
3877 	 */
3878 	if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2) &&
3879 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
3880 		nsid = NVME_NSID_BCAST;
3881 	} else {
3882 		nsid = 1;
3883 	}
3884 
3885 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
3886 	    (void **)&nvme->n_idcomns)) {
3887 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify common "
3888 		    "namespace information");
3889 		goto fail;
3890 	}
3891 	/*
3892 	 * Process nvme-config-list (if present) in nvme.conf.
3893 	 */
3894 	nvme_config_list(nvme);
3895 
3896 	/*
3897 	 * Get Vendor & Product ID
3898 	 */
3899 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
3900 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
3901 	sata_split_model(model, &vendor, &product);
3902 
3903 	if (vendor == NULL)
3904 		nvme->n_vendor = strdup("NVMe");
3905 	else
3906 		nvme->n_vendor = strdup(vendor);
3907 
3908 	nvme->n_product = strdup(product);
3909 
3910 	/*
3911 	 * Get controller limits.
3912 	 */
3913 	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
3914 	    MIN(nvme->n_admin_queue_len / 10,
3915 	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
3916 
3917 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3918 	    "async-event-limit", nvme->n_async_event_limit);
3919 
3920 	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
3921 
3922 	/*
3923 	 * Reinitialize the semaphore with the true abort command limit
3924 	 * supported by the hardware. It's not necessary to disable interrupts
3925 	 * as only command aborts use the semaphore, and no commands are
3926 	 * executed or aborted while we're here.
3927 	 */
3928 	sema_destroy(&nvme->n_abort_sema);
3929 	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
3930 	    SEMA_DRIVER, NULL);
3931 
3932 	nvme->n_progress |= NVME_CTRL_LIMITS;
3933 
3934 	if (nvme->n_idctl->id_mdts == 0)
3935 		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
3936 	else
3937 		nvme->n_max_data_transfer_size =
3938 		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
3939 
3940 	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
3941 
3942 	/*
3943 	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
3944 	 * Chained PRPs are currently unsupported.
3945 	 *
3946 	 * This is a no-op on hardware which doesn't support a transfer size
3947 	 * big enough to require chained PRPs.
3948 	 */
3949 	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
3950 	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
3951 
3952 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
3953 
3954 	/*
3955 	 * Make sure the minimum/maximum queue entry sizes are not
3956 	 * larger/smaller than the default.
3957 	 */
3958 
3959 	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
3960 	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
3961 	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
3962 	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
3963 		goto fail;
3964 
3965 	/*
3966 	 * Check for the presence of a Volatile Write Cache. If present,
3967 	 * enable or disable based on the value of the property
3968 	 * volatile-write-cache-enable (default is enabled).
3969 	 */
3970 	nvme->n_write_cache_present =
3971 	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
3972 
3973 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3974 	    "volatile-write-cache-present",
3975 	    nvme->n_write_cache_present ? 1 : 0);
3976 
3977 	if (!nvme->n_write_cache_present) {
3978 		nvme->n_write_cache_enabled = B_FALSE;
3979 	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
3980 	    != 0) {
3981 		dev_err(nvme->n_dip, CE_WARN,
3982 		    "!failed to %sable volatile write cache",
3983 		    nvme->n_write_cache_enabled ? "en" : "dis");
3984 		/*
3985 		 * Assume the cache is (still) enabled.
3986 		 */
3987 		nvme->n_write_cache_enabled = B_TRUE;
3988 	}
3989 
3990 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3991 	    "volatile-write-cache-enable",
3992 	    nvme->n_write_cache_enabled ? 1 : 0);
3993 
3994 	/*
3995 	 * Get number of supported namespaces and allocate namespace array.
3996 	 */
3997 	nvme->n_namespace_count = nvme->n_idctl->id_nn;
3998 
3999 	if (nvme->n_namespace_count == 0) {
4000 		dev_err(nvme->n_dip, CE_WARN,
4001 		    "!controllers without namespaces are not supported");
4002 		goto fail;
4003 	}
4004 
4005 	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
4006 		dev_err(nvme->n_dip, CE_WARN,
4007 		    "!too many namespaces: %d, limiting to %d\n",
4008 		    nvme->n_namespace_count, NVME_MINOR_MAX);
4009 		nvme->n_namespace_count = NVME_MINOR_MAX;
4010 	}
4011 
4012 	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
4013 	    nvme->n_namespace_count, KM_SLEEP);
4014 
4015 	/*
4016 	 * Try to set up MSI/MSI-X interrupts.
4017 	 */
4018 	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
4019 	    != 0) {
4020 		nvme_release_interrupts(nvme);
4021 
4022 		nqueues = MIN(UINT16_MAX, ncpus);
4023 
4024 		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
4025 		    nqueues) != DDI_SUCCESS) &&
4026 		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
4027 		    nqueues) != DDI_SUCCESS)) {
4028 			dev_err(nvme->n_dip, CE_WARN,
4029 			    "!failed to set up MSI/MSI-X interrupts");
4030 			goto fail;
4031 		}
4032 	}
4033 
4034 	/*
4035 	 * Create I/O queue pairs.
4036 	 */
4037 
4038 	if (nvme_set_nqueues(nvme) != 0) {
4039 		dev_err(nvme->n_dip, CE_WARN,
4040 		    "!failed to set number of I/O queues to %d",
4041 		    nvme->n_intr_cnt);
4042 		goto fail;
4043 	}
4044 
4045 	/*
4046 	 * Reallocate I/O queue array
4047 	 */
4048 	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
4049 	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
4050 	    (nvme->n_submission_queues + 1), KM_SLEEP);
4051 	nvme->n_ioq[0] = nvme->n_adminq;
4052 
4053 	/*
4054 	 * There should always be at least as many submission queues
4055 	 * as completion queues.
4056 	 */
4057 	ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
4058 
4059 	nvme->n_ioq_count = nvme->n_submission_queues;
4060 
4061 	nvme->n_io_squeue_len =
4062 	    MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
4063 
4064 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
4065 	    nvme->n_io_squeue_len);
4066 
4067 	/*
4068 	 * Pre-allocate completion queues.
4069 	 * When there are the same number of submission and completion
4070 	 * queues there is no value in having a larger completion
4071 	 * queue length.
4072 	 */
4073 	if (nvme->n_submission_queues == nvme->n_completion_queues)
4074 		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4075 		    nvme->n_io_squeue_len);
4076 
4077 	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4078 	    nvme->n_max_queue_entries);
4079 
4080 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
4081 	    nvme->n_io_cqueue_len);
4082 
4083 	/*
4084 	 * Assign the equal quantity of taskq threads to each completion
4085 	 * queue, capping the total number of threads to the number
4086 	 * of CPUs.
4087 	 */
4088 	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
4089 
4090 	/*
4091 	 * In case the calculation above is zero, we need at least one
4092 	 * thread per completion queue.
4093 	 */
4094 	tq_threads = MAX(1, tq_threads);
4095 
4096 	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
4097 	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
4098 		dev_err(nvme->n_dip, CE_WARN,
4099 		    "!failed to pre-allocate completion queues");
4100 		goto fail;
4101 	}
4102 
4103 	/*
4104 	 * If we use less completion queues than interrupt vectors return
4105 	 * some of the interrupt vectors back to the system.
4106 	 */
4107 	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
4108 		nvme_release_interrupts(nvme);
4109 
4110 		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
4111 		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
4112 			dev_err(nvme->n_dip, CE_WARN,
4113 			    "!failed to reduce number of interrupts");
4114 			goto fail;
4115 		}
4116 	}
4117 
4118 	/*
4119 	 * Alloc & register I/O queue pairs
4120 	 */
4121 
4122 	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
4123 		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
4124 		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
4125 			dev_err(nvme->n_dip, CE_WARN,
4126 			    "!unable to allocate I/O qpair %d", i);
4127 			goto fail;
4128 		}
4129 
4130 		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
4131 			dev_err(nvme->n_dip, CE_WARN,
4132 			    "!unable to create I/O qpair %d", i);
4133 			goto fail;
4134 		}
4135 	}
4136 
4137 	/*
4138 	 * Post more asynchronous events commands to reduce event reporting
4139 	 * latency as suggested by the spec.
4140 	 */
4141 	if (nvme->n_async_event_supported) {
4142 		for (i = 1; i != nvme->n_async_event_limit; i++)
4143 			nvme_async_event(nvme);
4144 	}
4145 
4146 	return (DDI_SUCCESS);
4147 
4148 fail:
4149 	(void) nvme_reset(nvme, B_FALSE);
4150 	return (DDI_FAILURE);
4151 }
4152 
4153 static uint_t
4154 nvme_intr(caddr_t arg1, caddr_t arg2)
4155 {
4156 	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
4157 	nvme_t *nvme = (nvme_t *)arg1;
4158 	int inum = (int)(uintptr_t)arg2;
4159 	int ccnt = 0;
4160 	int qnum;
4161 
4162 	if (inum >= nvme->n_intr_cnt)
4163 		return (DDI_INTR_UNCLAIMED);
4164 
4165 	if (nvme->n_dead)
4166 		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
4167 		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
4168 
4169 	/*
4170 	 * The interrupt vector a queue uses is calculated as queue_idx %
4171 	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
4172 	 * in steps of n_intr_cnt to process all queues using this vector.
4173 	 */
4174 	for (qnum = inum;
4175 	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
4176 	    qnum += nvme->n_intr_cnt) {
4177 		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
4178 	}
4179 
4180 	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
4181 }
4182 
4183 static void
4184 nvme_release_interrupts(nvme_t *nvme)
4185 {
4186 	int i;
4187 
4188 	for (i = 0; i < nvme->n_intr_cnt; i++) {
4189 		if (nvme->n_inth[i] == NULL)
4190 			break;
4191 
4192 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4193 			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
4194 		else
4195 			(void) ddi_intr_disable(nvme->n_inth[i]);
4196 
4197 		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
4198 		(void) ddi_intr_free(nvme->n_inth[i]);
4199 	}
4200 
4201 	kmem_free(nvme->n_inth, nvme->n_inth_sz);
4202 	nvme->n_inth = NULL;
4203 	nvme->n_inth_sz = 0;
4204 
4205 	nvme->n_progress &= ~NVME_INTERRUPTS;
4206 }
4207 
4208 static int
4209 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
4210 {
4211 	int nintrs, navail, count;
4212 	int ret;
4213 	int i;
4214 
4215 	if (nvme->n_intr_types == 0) {
4216 		ret = ddi_intr_get_supported_types(nvme->n_dip,
4217 		    &nvme->n_intr_types);
4218 		if (ret != DDI_SUCCESS) {
4219 			dev_err(nvme->n_dip, CE_WARN,
4220 			    "!%s: ddi_intr_get_supported types failed",
4221 			    __func__);
4222 			return (ret);
4223 		}
4224 #ifdef __x86
4225 		if (get_hwenv() == HW_VMWARE)
4226 			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
4227 #endif
4228 	}
4229 
4230 	if ((nvme->n_intr_types & intr_type) == 0)
4231 		return (DDI_FAILURE);
4232 
4233 	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
4234 	if (ret != DDI_SUCCESS) {
4235 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
4236 		    __func__);
4237 		return (ret);
4238 	}
4239 
4240 	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
4241 	if (ret != DDI_SUCCESS) {
4242 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
4243 		    __func__);
4244 		return (ret);
4245 	}
4246 
4247 	/* We want at most one interrupt per queue pair. */
4248 	if (navail > nqpairs)
4249 		navail = nqpairs;
4250 
4251 	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
4252 	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
4253 
4254 	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
4255 	    &count, 0);
4256 	if (ret != DDI_SUCCESS) {
4257 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
4258 		    __func__);
4259 		goto fail;
4260 	}
4261 
4262 	nvme->n_intr_cnt = count;
4263 
4264 	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
4265 	if (ret != DDI_SUCCESS) {
4266 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
4267 		    __func__);
4268 		goto fail;
4269 	}
4270 
4271 	for (i = 0; i < count; i++) {
4272 		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
4273 		    (void *)nvme, (void *)(uintptr_t)i);
4274 		if (ret != DDI_SUCCESS) {
4275 			dev_err(nvme->n_dip, CE_WARN,
4276 			    "!%s: ddi_intr_add_handler failed", __func__);
4277 			goto fail;
4278 		}
4279 	}
4280 
4281 	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
4282 
4283 	for (i = 0; i < count; i++) {
4284 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4285 			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
4286 		else
4287 			ret = ddi_intr_enable(nvme->n_inth[i]);
4288 
4289 		if (ret != DDI_SUCCESS) {
4290 			dev_err(nvme->n_dip, CE_WARN,
4291 			    "!%s: enabling interrupt %d failed", __func__, i);
4292 			goto fail;
4293 		}
4294 	}
4295 
4296 	nvme->n_intr_type = intr_type;
4297 
4298 	nvme->n_progress |= NVME_INTERRUPTS;
4299 
4300 	return (DDI_SUCCESS);
4301 
4302 fail:
4303 	nvme_release_interrupts(nvme);
4304 
4305 	return (ret);
4306 }
4307 
4308 static int
4309 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
4310 {
4311 	_NOTE(ARGUNUSED(arg));
4312 
4313 	pci_ereport_post(dip, fm_error, NULL);
4314 	return (fm_error->fme_status);
4315 }
4316 
4317 static void
4318 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
4319     void *b)
4320 {
4321 	nvme_t *nvme = a;
4322 
4323 	nvme_ctrl_mark_dead(nvme, B_TRUE);
4324 
4325 	/*
4326 	 * Fail all outstanding commands, including those in the admin queue
4327 	 * (queue 0).
4328 	 */
4329 	for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
4330 		nvme_qpair_t *qp = nvme->n_ioq[i];
4331 
4332 		mutex_enter(&qp->nq_mutex);
4333 		for (size_t j = 0; j < qp->nq_nentry; j++) {
4334 			nvme_cmd_t *cmd = qp->nq_cmd[j];
4335 			nvme_cmd_t *u_cmd;
4336 
4337 			if (cmd == NULL) {
4338 				continue;
4339 			}
4340 
4341 			/*
4342 			 * Since we have the queue lock held the entire time we
4343 			 * iterate over it, it's not possible for the queue to
4344 			 * change underneath us. Thus, we don't need to check
4345 			 * that the return value of nvme_unqueue_cmd matches the
4346 			 * requested cmd to unqueue.
4347 			 */
4348 			u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
4349 			taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
4350 			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
4351 
4352 			ASSERT3P(u_cmd, ==, cmd);
4353 		}
4354 		mutex_exit(&qp->nq_mutex);
4355 	}
4356 }
4357 
4358 /*
4359  * Open minor management
4360  */
4361 static int
4362 nvme_minor_comparator(const void *l, const void *r)
4363 {
4364 	const nvme_minor_t *lm = l;
4365 	const nvme_minor_t *rm = r;
4366 
4367 	if (lm->nm_minor > rm->nm_minor) {
4368 		return (1);
4369 	} else if (lm->nm_minor < rm->nm_minor) {
4370 		return (-1);
4371 	} else {
4372 		return (0);
4373 	}
4374 }
4375 
4376 static void
4377 nvme_minor_free(nvme_minor_t *minor)
4378 {
4379 	if (minor->nm_minor > 0) {
4380 		ASSERT3S(minor->nm_minor, >=, NVME_OPEN_MINOR_MIN);
4381 		id_free(nvme_open_minors, minor->nm_minor);
4382 		minor->nm_minor = 0;
4383 	}
4384 	VERIFY0(list_link_active(&minor->nm_ctrl_lock.nli_node));
4385 	VERIFY0(list_link_active(&minor->nm_ns_lock.nli_node));
4386 	cv_destroy(&minor->nm_cv);
4387 	kmem_free(minor, sizeof (nvme_minor_t));
4388 }
4389 
4390 static nvme_minor_t *
4391 nvme_minor_find_by_dev(dev_t dev)
4392 {
4393 	id_t id = (id_t)getminor(dev);
4394 	nvme_minor_t search = { .nm_minor = id };
4395 	nvme_minor_t *ret;
4396 
4397 	mutex_enter(&nvme_open_minors_mutex);
4398 	ret = avl_find(&nvme_open_minors_avl, &search, NULL);
4399 	mutex_exit(&nvme_open_minors_mutex);
4400 
4401 	return (ret);
4402 }
4403 
4404 static int
4405 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4406 {
4407 	nvme_t *nvme;
4408 	int instance;
4409 	int nregs;
4410 	off_t regsize;
4411 	char name[32];
4412 	boolean_t attached_ns;
4413 
4414 	if (cmd != DDI_ATTACH)
4415 		return (DDI_FAILURE);
4416 
4417 	instance = ddi_get_instance(dip);
4418 
4419 	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
4420 		return (DDI_FAILURE);
4421 
4422 	nvme = ddi_get_soft_state(nvme_state, instance);
4423 	ddi_set_driver_private(dip, nvme);
4424 	nvme->n_dip = dip;
4425 
4426 	/*
4427 	 * Map PCI config space
4428 	 */
4429 	if (pci_config_setup(dip, &nvme->n_pcicfg_handle) != DDI_SUCCESS) {
4430 		dev_err(dip, CE_WARN, "!failed to map PCI config space");
4431 		goto fail;
4432 	}
4433 	nvme->n_progress |= NVME_PCI_CONFIG;
4434 
4435 	/*
4436 	 * Get the various PCI IDs from config space
4437 	 */
4438 	nvme->n_vendor_id =
4439 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_VENID);
4440 	nvme->n_device_id =
4441 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_DEVID);
4442 	nvme->n_revision_id =
4443 	    pci_config_get8(nvme->n_pcicfg_handle, PCI_CONF_REVID);
4444 	nvme->n_subsystem_device_id =
4445 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBSYSID);
4446 	nvme->n_subsystem_vendor_id =
4447 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBVENID);
4448 
4449 	nvme_detect_quirks(nvme);
4450 
4451 	/*
4452 	 * Set up event handlers for hot removal. While npe(4D) supports the hot
4453 	 * removal event being injected for devices, the same is not true of all
4454 	 * of our possible parents (i.e. pci(4D) as of this writing). The most
4455 	 * common case this shows up is in some virtualization environments. We
4456 	 * should treat this as non-fatal so that way devices work but leave
4457 	 * this set up in such a way that if a nexus does grow support for this
4458 	 * we're good to go.
4459 	 */
4460 	if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
4461 	    &nvme->n_rm_cookie) == DDI_SUCCESS) {
4462 		if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
4463 		    nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
4464 		    DDI_SUCCESS) {
4465 			goto fail;
4466 		}
4467 	} else {
4468 		nvme->n_ev_rm_cb_id = NULL;
4469 	}
4470 
4471 	mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL);
4472 	nvme->n_progress |= NVME_MUTEX_INIT;
4473 
4474 	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4475 	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
4476 	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
4477 	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
4478 	    B_TRUE : B_FALSE;
4479 	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4480 	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
4481 	nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4482 	    DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
4483 	/*
4484 	 * Double up the default for completion queues in case of
4485 	 * queue sharing.
4486 	 */
4487 	nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4488 	    DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
4489 	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4490 	    DDI_PROP_DONTPASS, "async-event-limit",
4491 	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
4492 	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4493 	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
4494 	    B_TRUE : B_FALSE;
4495 	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4496 	    DDI_PROP_DONTPASS, "min-phys-block-size",
4497 	    NVME_DEFAULT_MIN_BLOCK_SIZE);
4498 	nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4499 	    DDI_PROP_DONTPASS, "max-submission-queues", -1);
4500 	nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4501 	    DDI_PROP_DONTPASS, "max-completion-queues", -1);
4502 
4503 	if (!ISP2(nvme->n_min_block_size) ||
4504 	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
4505 		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
4506 		    "using default %d", ISP2(nvme->n_min_block_size) ?
4507 		    "too low" : "not a power of 2",
4508 		    NVME_DEFAULT_MIN_BLOCK_SIZE);
4509 		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
4510 	}
4511 
4512 	if (nvme->n_submission_queues != -1 &&
4513 	    (nvme->n_submission_queues < 1 ||
4514 	    nvme->n_submission_queues > UINT16_MAX)) {
4515 		dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
4516 		    "valid. Must be [1..%d]", nvme->n_submission_queues,
4517 		    UINT16_MAX);
4518 		nvme->n_submission_queues = -1;
4519 	}
4520 
4521 	if (nvme->n_completion_queues != -1 &&
4522 	    (nvme->n_completion_queues < 1 ||
4523 	    nvme->n_completion_queues > UINT16_MAX)) {
4524 		dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
4525 		    "valid. Must be [1..%d]", nvme->n_completion_queues,
4526 		    UINT16_MAX);
4527 		nvme->n_completion_queues = -1;
4528 	}
4529 
4530 	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
4531 		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
4532 	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
4533 		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
4534 
4535 	if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
4536 		nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
4537 	if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
4538 		nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
4539 
4540 	if (nvme->n_async_event_limit < 1)
4541 		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
4542 
4543 	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
4544 	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
4545 	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
4546 	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
4547 
4548 	/*
4549 	 * Set up FMA support.
4550 	 */
4551 	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
4552 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
4553 	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
4554 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
4555 
4556 	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
4557 
4558 	if (nvme->n_fm_cap) {
4559 		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
4560 			nvme->n_reg_acc_attr.devacc_attr_access =
4561 			    DDI_FLAGERR_ACC;
4562 
4563 		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
4564 			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
4565 			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
4566 		}
4567 
4568 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
4569 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4570 			pci_ereport_setup(dip);
4571 
4572 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4573 			ddi_fm_handler_register(dip, nvme_fm_errcb,
4574 			    (void *)nvme);
4575 	}
4576 
4577 	nvme->n_progress |= NVME_FMA_INIT;
4578 
4579 	/*
4580 	 * The spec defines several register sets. Only the controller
4581 	 * registers (set 1) are currently used.
4582 	 */
4583 	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
4584 	    nregs < 2 ||
4585 	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
4586 		goto fail;
4587 
4588 	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
4589 	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
4590 		dev_err(dip, CE_WARN, "!failed to map regset 1");
4591 		goto fail;
4592 	}
4593 
4594 	nvme->n_progress |= NVME_REGS_MAPPED;
4595 
4596 	/*
4597 	 * Create PRP DMA cache
4598 	 */
4599 	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
4600 	    ddi_driver_name(dip), ddi_get_instance(dip));
4601 	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
4602 	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
4603 	    NULL, (void *)nvme, NULL, 0);
4604 
4605 	if (nvme_init(nvme) != DDI_SUCCESS)
4606 		goto fail;
4607 
4608 	/*
4609 	 * Initialize the driver with the UFM subsystem
4610 	 */
4611 	if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
4612 	    &nvme->n_ufmh, nvme) != 0) {
4613 		dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
4614 		goto fail;
4615 	}
4616 	mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
4617 	ddi_ufm_update(nvme->n_ufmh);
4618 	nvme->n_progress |= NVME_UFM_INIT;
4619 
4620 	mutex_init(&nvme->n_mgmt_mutex, NULL, MUTEX_DRIVER, NULL);
4621 	nvme_lock_init(&nvme->n_lock);
4622 	nvme->n_progress |= NVME_MGMT_INIT;
4623 	nvme->n_dead_status = NVME_IOCTL_E_CTRL_DEAD;
4624 
4625 
4626 	/*
4627 	 * Identify namespaces.
4628 	 */
4629 	mutex_enter(&nvme->n_mgmt_mutex);
4630 
4631 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4632 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
4633 
4634 		nvme_lock_init(&ns->ns_lock);
4635 		ns->ns_progress |= NVME_NS_LOCK;
4636 
4637 		/*
4638 		 * Namespaces start out ignored. When nvme_init_ns() checks
4639 		 * their properties and finds they can be used, it will set
4640 		 * ns_ignore to B_FALSE. It will also use this state change
4641 		 * to keep an accurate count of attachable namespaces.
4642 		 */
4643 		ns->ns_ignore = B_TRUE;
4644 		if (nvme_init_ns(nvme, i) != 0) {
4645 			mutex_exit(&nvme->n_mgmt_mutex);
4646 			goto fail;
4647 		}
4648 
4649 		if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR,
4650 		    NVME_MINOR(ddi_get_instance(nvme->n_dip), i),
4651 		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
4652 			mutex_exit(&nvme->n_mgmt_mutex);
4653 			dev_err(dip, CE_WARN,
4654 			    "!failed to create minor node for namespace %d", i);
4655 			goto fail;
4656 		}
4657 	}
4658 
4659 	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
4660 	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
4661 	    != DDI_SUCCESS) {
4662 		mutex_exit(&nvme->n_mgmt_mutex);
4663 		dev_err(dip, CE_WARN, "nvme_attach: "
4664 		    "cannot create devctl minor node");
4665 		goto fail;
4666 	}
4667 
4668 	attached_ns = B_FALSE;
4669 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4670 		nvme_ioctl_common_t com = { .nioc_nsid = i };
4671 
4672 		if (nvme_attach_ns(nvme, &com)) {
4673 			attached_ns = B_TRUE;
4674 		} else if (com.nioc_drv_err != NVME_IOCTL_E_UNSUP_ATTACH_NS) {
4675 			dev_err(nvme->n_dip, CE_WARN, "!failed to attach "
4676 			    "namespace %d due to blkdev error", i);
4677 			/*
4678 			 * Once we have successfully attached a namespace we
4679 			 * can no longer fail the driver attach as there is now
4680 			 * a blkdev child node linked to this device, and
4681 			 * our node is not yet in the attached state.
4682 			 */
4683 			if (!attached_ns) {
4684 				mutex_exit(&nvme->n_mgmt_mutex);
4685 				goto fail;
4686 			}
4687 		}
4688 	}
4689 
4690 	mutex_exit(&nvme->n_mgmt_mutex);
4691 
4692 	return (DDI_SUCCESS);
4693 
4694 fail:
4695 	/* attach successful anyway so that FMA can retire the device */
4696 	if (nvme->n_dead)
4697 		return (DDI_SUCCESS);
4698 
4699 	(void) nvme_detach(dip, DDI_DETACH);
4700 
4701 	return (DDI_FAILURE);
4702 }
4703 
4704 static int
4705 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4706 {
4707 	int instance;
4708 	nvme_t *nvme;
4709 
4710 	if (cmd != DDI_DETACH)
4711 		return (DDI_FAILURE);
4712 
4713 	instance = ddi_get_instance(dip);
4714 
4715 	nvme = ddi_get_soft_state(nvme_state, instance);
4716 
4717 	if (nvme == NULL)
4718 		return (DDI_FAILURE);
4719 
4720 	/*
4721 	 * Remove all minor nodes from the device regardless of the source in
4722 	 * one swoop.
4723 	 */
4724 	ddi_remove_minor_node(dip, NULL);
4725 
4726 	/*
4727 	 * We need to remove the event handler as one of the first things that
4728 	 * we do. If we proceed with other teardown without removing the event
4729 	 * handler, we could end up in a very unfortunate race with ourselves.
4730 	 * The DDI does not serialize these with detach (just like timeout(9F)
4731 	 * and others).
4732 	 */
4733 	if (nvme->n_ev_rm_cb_id != NULL) {
4734 		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
4735 	}
4736 	nvme->n_ev_rm_cb_id = NULL;
4737 
4738 	/*
4739 	 * If the controller was marked dead, there is a slight chance that we
4740 	 * are asynchronusly processing the removal taskq. Because we have
4741 	 * removed the callback handler above and all minor nodes and commands
4742 	 * are closed, there is no other way to get in here. As such, we wait on
4743 	 * the nvme_dead_taskq to complete so we can avoid tracking if it's
4744 	 * running or not.
4745 	 */
4746 	taskq_wait(nvme_dead_taskq);
4747 
4748 	if (nvme->n_ns) {
4749 		for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4750 			nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
4751 
4752 			if (ns->ns_bd_hdl) {
4753 				(void) bd_detach_handle(ns->ns_bd_hdl);
4754 				bd_free_handle(ns->ns_bd_hdl);
4755 			}
4756 
4757 			if (ns->ns_idns)
4758 				kmem_free(ns->ns_idns,
4759 				    sizeof (nvme_identify_nsid_t));
4760 			if (ns->ns_devid)
4761 				strfree(ns->ns_devid);
4762 
4763 			if ((ns->ns_progress & NVME_NS_LOCK) != 0)
4764 				nvme_lock_fini(&ns->ns_lock);
4765 		}
4766 
4767 		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
4768 		    nvme->n_namespace_count);
4769 	}
4770 
4771 	if (nvme->n_progress & NVME_MGMT_INIT) {
4772 		nvme_lock_fini(&nvme->n_lock);
4773 		mutex_destroy(&nvme->n_mgmt_mutex);
4774 	}
4775 
4776 	if (nvme->n_progress & NVME_UFM_INIT) {
4777 		ddi_ufm_fini(nvme->n_ufmh);
4778 		mutex_destroy(&nvme->n_fwslot_mutex);
4779 	}
4780 
4781 	if (nvme->n_progress & NVME_INTERRUPTS)
4782 		nvme_release_interrupts(nvme);
4783 
4784 	for (uint_t i = 0; i < nvme->n_cq_count; i++) {
4785 		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
4786 			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
4787 	}
4788 
4789 	if (nvme->n_progress & NVME_MUTEX_INIT) {
4790 		mutex_destroy(&nvme->n_minor_mutex);
4791 	}
4792 
4793 	if (nvme->n_ioq_count > 0) {
4794 		for (uint_t i = 1; i != nvme->n_ioq_count + 1; i++) {
4795 			if (nvme->n_ioq[i] != NULL) {
4796 				/* TODO: send destroy queue commands */
4797 				nvme_free_qpair(nvme->n_ioq[i]);
4798 			}
4799 		}
4800 
4801 		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
4802 		    (nvme->n_ioq_count + 1));
4803 	}
4804 
4805 	if (nvme->n_prp_cache != NULL) {
4806 		kmem_cache_destroy(nvme->n_prp_cache);
4807 	}
4808 
4809 	if (nvme->n_progress & NVME_REGS_MAPPED) {
4810 		nvme_shutdown(nvme, B_FALSE);
4811 		(void) nvme_reset(nvme, B_FALSE);
4812 	}
4813 
4814 	if (nvme->n_progress & NVME_CTRL_LIMITS)
4815 		sema_destroy(&nvme->n_abort_sema);
4816 
4817 	if (nvme->n_progress & NVME_ADMIN_QUEUE)
4818 		nvme_free_qpair(nvme->n_adminq);
4819 
4820 	if (nvme->n_cq_count > 0) {
4821 		nvme_destroy_cq_array(nvme, 0);
4822 		nvme->n_cq = NULL;
4823 		nvme->n_cq_count = 0;
4824 	}
4825 
4826 	if (nvme->n_idcomns)
4827 		kmem_free(nvme->n_idcomns, NVME_IDENTIFY_BUFSIZE);
4828 
4829 	if (nvme->n_idctl)
4830 		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
4831 
4832 	if (nvme->n_progress & NVME_REGS_MAPPED)
4833 		ddi_regs_map_free(&nvme->n_regh);
4834 
4835 	if (nvme->n_progress & NVME_FMA_INIT) {
4836 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4837 			ddi_fm_handler_unregister(nvme->n_dip);
4838 
4839 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
4840 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4841 			pci_ereport_teardown(nvme->n_dip);
4842 
4843 		ddi_fm_fini(nvme->n_dip);
4844 	}
4845 
4846 	if (nvme->n_progress & NVME_PCI_CONFIG)
4847 		pci_config_teardown(&nvme->n_pcicfg_handle);
4848 
4849 	if (nvme->n_vendor != NULL)
4850 		strfree(nvme->n_vendor);
4851 
4852 	if (nvme->n_product != NULL)
4853 		strfree(nvme->n_product);
4854 
4855 	ddi_soft_state_free(nvme_state, instance);
4856 
4857 	return (DDI_SUCCESS);
4858 }
4859 
4860 static int
4861 nvme_quiesce(dev_info_t *dip)
4862 {
4863 	int instance;
4864 	nvme_t *nvme;
4865 
4866 	instance = ddi_get_instance(dip);
4867 
4868 	nvme = ddi_get_soft_state(nvme_state, instance);
4869 
4870 	if (nvme == NULL)
4871 		return (DDI_FAILURE);
4872 
4873 	nvme_shutdown(nvme, B_TRUE);
4874 
4875 	(void) nvme_reset(nvme, B_TRUE);
4876 
4877 	return (DDI_SUCCESS);
4878 }
4879 
4880 static int
4881 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
4882 {
4883 	nvme_t *nvme = cmd->nc_nvme;
4884 	uint_t nprp_per_page, nprp;
4885 	uint64_t *prp;
4886 	const ddi_dma_cookie_t *cookie;
4887 	uint_t idx;
4888 	uint_t ncookies = ddi_dma_ncookies(dma);
4889 
4890 	if (ncookies == 0)
4891 		return (DDI_FAILURE);
4892 
4893 	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
4894 		return (DDI_FAILURE);
4895 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;
4896 
4897 	if (ncookies == 1) {
4898 		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
4899 		return (DDI_SUCCESS);
4900 	} else if (ncookies == 2) {
4901 		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
4902 			return (DDI_FAILURE);
4903 		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
4904 		return (DDI_SUCCESS);
4905 	}
4906 
4907 	/*
4908 	 * At this point, we're always operating on cookies at
4909 	 * index >= 1 and writing the addresses of those cookies
4910 	 * into a new page. The address of that page is stored
4911 	 * as the second PRP entry.
4912 	 */
4913 	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
4914 	ASSERT(nprp_per_page > 0);
4915 
4916 	/*
4917 	 * We currently don't support chained PRPs and set up our DMA
4918 	 * attributes to reflect that. If we still get an I/O request
4919 	 * that needs a chained PRP something is very wrong. Account
4920 	 * for the first cookie here, which we've placed in d_prp[0].
4921 	 */
4922 	nprp = howmany(ncookies - 1, nprp_per_page);
4923 	VERIFY(nprp == 1);
4924 
4925 	/*
4926 	 * Allocate a page of pointers, in which we'll write the
4927 	 * addresses of cookies 1 to `ncookies`.
4928 	 */
4929 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
4930 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
4931 	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;
4932 
4933 	prp = (uint64_t *)cmd->nc_prp->nd_memp;
4934 	for (idx = 1; idx < ncookies; idx++) {
4935 		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
4936 			return (DDI_FAILURE);
4937 		*prp++ = cookie->dmac_laddress;
4938 	}
4939 
4940 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
4941 	    DDI_DMA_SYNC_FORDEV);
4942 	return (DDI_SUCCESS);
4943 }
4944 
4945 /*
 * The maximum number of ranges supported for a deallocate request is
4947  * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
4948  * unchanged through at least 1.4a). The definition of nvme_range_t is also
4949  * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
4950  * a deallocate request will fit into the smallest supported namespace page
4951  * (4k).
4952  */
4953 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);
4954 
4955 static int
4956 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
4957     int allocflag)
4958 {
4959 	const dkioc_free_list_t *dfl = xfer->x_dfl;
4960 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
4961 	nvme_t *nvme = cmd->nc_nvme;
4962 	nvme_range_t *ranges = NULL;
4963 	uint_t i;
4964 
4965 	/*
4966 	 * The number of ranges in the request is 0s based (that is
4967 	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
4968 	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
4969 	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
4970 	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
4971 	 * in blkdev.
4972 	 */
4973 	VERIFY3U(dfl->dfl_num_exts, >, 0);
4974 	VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
4975 	cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
4976 
4977 	cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
4978 
4979 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
4980 	if (cmd->nc_prp == NULL)
4981 		return (DDI_FAILURE);
4982 
4983 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
4984 	ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;
4985 
4986 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
4987 	cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
4988 
4989 	for (i = 0; i < dfl->dfl_num_exts; i++) {
4990 		uint64_t lba, len;
4991 
4992 		lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
4993 		len = exts[i].dfle_length / blocksize;
4994 
4995 		VERIFY3U(len, <=, UINT32_MAX);
4996 
4997 		/* No context attributes for a deallocate request */
4998 		ranges[i].nr_ctxattr = 0;
4999 		ranges[i].nr_len = len;
5000 		ranges[i].nr_lba = lba;
5001 	}
5002 
5003 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
5004 	    DDI_DMA_SYNC_FORDEV);
5005 
5006 	return (DDI_SUCCESS);
5007 }
5008 
5009 static nvme_cmd_t *
5010 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
5011 {
5012 	nvme_t *nvme = ns->ns_nvme;
5013 	nvme_cmd_t *cmd;
5014 	int allocflag;
5015 
5016 	/*
5017 	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
5018 	 */
5019 	allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
5020 	cmd = nvme_alloc_cmd(nvme, allocflag);
5021 
5022 	if (cmd == NULL)
5023 		return (NULL);
5024 
5025 	cmd->nc_sqe.sqe_opc = opc;
5026 	cmd->nc_callback = nvme_bd_xfer_done;
5027 	cmd->nc_xfer = xfer;
5028 
5029 	switch (opc) {
5030 	case NVME_OPC_NVM_WRITE:
5031 	case NVME_OPC_NVM_READ:
5032 		VERIFY(xfer->x_nblks <= 0x10000);
5033 
5034 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5035 
5036 		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
5037 		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
5038 		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
5039 
5040 		if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
5041 			goto fail;
5042 		break;
5043 
5044 	case NVME_OPC_NVM_FLUSH:
5045 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5046 		break;
5047 
5048 	case NVME_OPC_NVM_DSET_MGMT:
5049 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5050 
5051 		if (nvme_fill_ranges(cmd, xfer,
5052 		    (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
5053 			goto fail;
5054 		break;
5055 
5056 	default:
5057 		goto fail;
5058 	}
5059 
5060 	return (cmd);
5061 
5062 fail:
5063 	nvme_free_cmd(cmd);
5064 	return (NULL);
5065 }
5066 
5067 static void
5068 nvme_bd_xfer_done(void *arg)
5069 {
5070 	nvme_cmd_t *cmd = arg;
5071 	bd_xfer_t *xfer = cmd->nc_xfer;
5072 	int error = 0;
5073 
5074 	error = nvme_check_cmd_status(cmd);
5075 	nvme_free_cmd(cmd);
5076 
5077 	bd_xfer_done(xfer, error);
5078 }
5079 
5080 static void
5081 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
5082 {
5083 	nvme_namespace_t *ns = arg;
5084 	nvme_t *nvme = ns->ns_nvme;
5085 	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
5086 	boolean_t mutex_exit_needed = B_TRUE;
5087 
5088 	/*
5089 	 * nvme_bd_driveinfo is called by blkdev in two situations:
5090 	 * - during bd_attach_handle(), which we call with the mutex held
5091 	 * - during bd_attach(), which may be called with or without the
5092 	 *   mutex held
5093 	 */
5094 	if (mutex_owned(&nvme->n_mgmt_mutex))
5095 		mutex_exit_needed = B_FALSE;
5096 	else
5097 		mutex_enter(&nvme->n_mgmt_mutex);
5098 
5099 	/*
5100 	 * Set the blkdev qcount to the number of submission queues.
5101 	 * It will then create one waitq/runq pair for each submission
5102 	 * queue and spread I/O requests across the queues.
5103 	 */
5104 	drive->d_qcount = nvme->n_ioq_count;
5105 
5106 	/*
5107 	 * I/O activity to individual namespaces is distributed across
5108 	 * each of the d_qcount blkdev queues (which has been set to
5109 	 * the number of nvme submission queues). d_qsize is the number
5110 	 * of submitted and not completed I/Os within each queue that blkdev
5111 	 * will allow before it starts holding them in the waitq.
5112 	 *
5113 	 * Each namespace will create a child blkdev instance, for each one
5114 	 * we try and set the d_qsize so that each namespace gets an
5115 	 * equal portion of the submission queue.
5116 	 *
5117 	 * If post instantiation of the nvme drive, n_namespaces_attachable
5118 	 * changes and a namespace is attached it could calculate a
5119 	 * different d_qsize. It may even be that the sum of the d_qsizes is
5120 	 * now beyond the submission queue size. Should that be the case
5121 	 * and the I/O rate is such that blkdev attempts to submit more
5122 	 * I/Os than the size of the submission queue, the excess I/Os
5123 	 * will be held behind the semaphore nq_sema.
5124 	 */
5125 	drive->d_qsize = nvme->n_io_squeue_len / ns_count;
5126 
5127 	/*
5128 	 * Don't let the queue size drop below the minimum, though.
5129 	 */
5130 	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
5131 
5132 	/*
5133 	 * d_maxxfer is not set, which means the value is taken from the DMA
5134 	 * attributes specified to bd_alloc_handle.
5135 	 */
5136 
5137 	drive->d_removable = B_FALSE;
5138 	drive->d_hotpluggable = B_FALSE;
5139 
5140 	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
5141 	drive->d_target = ns->ns_id;
5142 	drive->d_lun = 0;
5143 
5144 	drive->d_model = nvme->n_idctl->id_model;
5145 	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
5146 	drive->d_vendor = nvme->n_vendor;
5147 	drive->d_vendor_len = strlen(nvme->n_vendor);
5148 	drive->d_product = nvme->n_product;
5149 	drive->d_product_len = strlen(nvme->n_product);
5150 	drive->d_serial = nvme->n_idctl->id_serial;
5151 	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
5152 	drive->d_revision = nvme->n_idctl->id_fwrev;
5153 	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
5154 
5155 	/*
5156 	 * If we support the dataset management command, the only restrictions
5157 	 * on a discard request are the maximum number of ranges (segments)
5158 	 * per single request.
5159 	 */
5160 	if (nvme->n_idctl->id_oncs.on_dset_mgmt)
5161 		drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
5162 
5163 	if (mutex_exit_needed)
5164 		mutex_exit(&nvme->n_mgmt_mutex);
5165 }
5166 
5167 static int
5168 nvme_bd_mediainfo(void *arg, bd_media_t *media)
5169 {
5170 	nvme_namespace_t *ns = arg;
5171 	nvme_t *nvme = ns->ns_nvme;
5172 	boolean_t mutex_exit_needed = B_TRUE;
5173 
5174 	if (nvme->n_dead) {
5175 		return (EIO);
5176 	}
5177 
5178 	/*
5179 	 * nvme_bd_mediainfo is called by blkdev in various situations,
5180 	 * most of them out of our control. There's one exception though:
5181 	 * When we call bd_state_change() in response to "namespace change"
5182 	 * notification, where the mutex is already being held by us.
5183 	 */
5184 	if (mutex_owned(&nvme->n_mgmt_mutex))
5185 		mutex_exit_needed = B_FALSE;
5186 	else
5187 		mutex_enter(&nvme->n_mgmt_mutex);
5188 
5189 	media->m_nblks = ns->ns_block_count;
5190 	media->m_blksize = ns->ns_block_size;
5191 	media->m_readonly = B_FALSE;
5192 	media->m_solidstate = B_TRUE;
5193 
5194 	media->m_pblksize = ns->ns_best_block_size;
5195 
5196 	if (mutex_exit_needed)
5197 		mutex_exit(&nvme->n_mgmt_mutex);
5198 
5199 	return (0);
5200 }
5201 
5202 static int
5203 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
5204 {
5205 	nvme_t *nvme = ns->ns_nvme;
5206 	nvme_cmd_t *cmd;
5207 	nvme_qpair_t *ioq;
5208 	boolean_t poll;
5209 	int ret;
5210 
5211 	if (nvme->n_dead) {
5212 		return (EIO);
5213 	}
5214 
5215 	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
5216 	if (cmd == NULL)
5217 		return (ENOMEM);
5218 
5219 	cmd->nc_sqid = xfer->x_qnum + 1;
5220 	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
5221 	ioq = nvme->n_ioq[cmd->nc_sqid];
5222 
5223 	/*
5224 	 * Get the polling flag before submitting the command. The command may
5225 	 * complete immediately after it was submitted, which means we must
5226 	 * treat both cmd and xfer as if they have been freed already.
5227 	 */
5228 	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
5229 
5230 	ret = nvme_submit_io_cmd(ioq, cmd);
5231 
5232 	if (ret != 0)
5233 		return (ret);
5234 
5235 	if (!poll)
5236 		return (0);
5237 
5238 	do {
5239 		cmd = nvme_retrieve_cmd(nvme, ioq);
5240 		if (cmd != NULL)
5241 			cmd->nc_callback(cmd);
5242 		else
5243 			drv_usecwait(10);
5244 	} while (ioq->nq_active_cmds != 0);
5245 
5246 	return (0);
5247 }
5248 
5249 static int
5250 nvme_bd_read(void *arg, bd_xfer_t *xfer)
5251 {
5252 	nvme_namespace_t *ns = arg;
5253 
5254 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
5255 }
5256 
5257 static int
5258 nvme_bd_write(void *arg, bd_xfer_t *xfer)
5259 {
5260 	nvme_namespace_t *ns = arg;
5261 
5262 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
5263 }
5264 
5265 static int
5266 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
5267 {
5268 	nvme_namespace_t *ns = arg;
5269 
5270 	if (ns->ns_nvme->n_dead)
5271 		return (EIO);
5272 
5273 	/*
5274 	 * If the volatile write cache is not present or not enabled the FLUSH
5275 	 * command is a no-op, so we can take a shortcut here.
5276 	 */
5277 	if (!ns->ns_nvme->n_write_cache_present) {
5278 		bd_xfer_done(xfer, ENOTSUP);
5279 		return (0);
5280 	}
5281 
5282 	if (!ns->ns_nvme->n_write_cache_enabled) {
5283 		bd_xfer_done(xfer, 0);
5284 		return (0);
5285 	}
5286 
5287 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
5288 }
5289 
5290 static int
5291 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
5292 {
5293 	nvme_namespace_t *ns = arg;
5294 	nvme_t *nvme = ns->ns_nvme;
5295 
5296 	if (nvme->n_dead) {
5297 		return (EIO);
5298 	}
5299 
5300 	if (*(uint64_t *)ns->ns_nguid != 0 ||
5301 	    *(uint64_t *)(ns->ns_nguid + 8) != 0) {
5302 		return (ddi_devid_init(devinfo, DEVID_NVME_NGUID,
5303 		    sizeof (ns->ns_nguid), ns->ns_nguid, devid));
5304 	} else if (*(uint64_t *)ns->ns_eui64 != 0) {
5305 		return (ddi_devid_init(devinfo, DEVID_NVME_EUI64,
5306 		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
5307 	} else {
5308 		return (ddi_devid_init(devinfo, DEVID_NVME_NSID,
5309 		    strlen(ns->ns_devid), ns->ns_devid, devid));
5310 	}
5311 }
5312 
5313 static int
5314 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
5315 {
5316 	nvme_namespace_t *ns = arg;
5317 
5318 	if (xfer->x_dfl == NULL)
5319 		return (EINVAL);
5320 
5321 	if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
5322 		return (ENOTSUP);
5323 
5324 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
5325 }
5326 
5327 static int
5328 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
5329 {
5330 #ifndef __lock_lint
5331 	_NOTE(ARGUNUSED(cred_p));
5332 #endif
5333 	nvme_t *nvme;
5334 	nvme_minor_t *minor = NULL;
5335 	uint32_t nsid;
5336 	minor_t m = getminor(*devp);
5337 	int rv = 0;
5338 
5339 	if (otyp != OTYP_CHR)
5340 		return (EINVAL);
5341 
5342 	if (m >= NVME_OPEN_MINOR_MIN)
5343 		return (ENXIO);
5344 
5345 	nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(m));
5346 	nsid = NVME_MINOR_NSID(m);
5347 
5348 	if (nvme == NULL)
5349 		return (ENXIO);
5350 
5351 	if (nsid > nvme->n_namespace_count)
5352 		return (ENXIO);
5353 
5354 	if (nvme->n_dead)
5355 		return (EIO);
5356 
5357 	/*
5358 	 * At this point, we're going to allow an open to proceed on this
5359 	 * device. We need to allocate a new instance for this (presuming one is
5360 	 * available).
5361 	 */
5362 	minor = kmem_zalloc(sizeof (nvme_minor_t), KM_NOSLEEP_LAZY);
5363 	if (minor == NULL) {
5364 		return (ENOMEM);
5365 	}
5366 
5367 	cv_init(&minor->nm_cv, NULL, CV_DRIVER, NULL);
5368 	list_link_init(&minor->nm_ctrl_lock.nli_node);
5369 	minor->nm_ctrl_lock.nli_nvme = nvme;
5370 	minor->nm_ctrl_lock.nli_minor = minor;
5371 	list_link_init(&minor->nm_ns_lock.nli_node);
5372 	minor->nm_ns_lock.nli_nvme = nvme;
5373 	minor->nm_ns_lock.nli_minor = minor;
5374 	minor->nm_minor = id_alloc_nosleep(nvme_open_minors);
5375 	if (minor->nm_minor == -1) {
5376 		nvme_minor_free(minor);
5377 		return (ENOSPC);
5378 	}
5379 
5380 	minor->nm_ctrl = nvme;
5381 	if (nsid != 0) {
5382 		minor->nm_ns = nvme_nsid2ns(nvme, nsid);
5383 	}
5384 
5385 	/*
5386 	 * Before we check for exclusive access and attempt a lock if requested,
5387 	 * ensure that this minor is persisted.
5388 	 */
5389 	mutex_enter(&nvme_open_minors_mutex);
5390 	avl_add(&nvme_open_minors_avl, minor);
5391 	mutex_exit(&nvme_open_minors_mutex);
5392 
5393 	/*
5394 	 * A request for opening this FEXCL, is translated into a non-blocking
5395 	 * write lock of the appropriate entity. This honors the original
5396 	 * semantics here. In the future, we should see if we can remove this
5397 	 * and turn a request for FEXCL at open into ENOTSUP.
5398 	 */
5399 	mutex_enter(&nvme->n_minor_mutex);
5400 	if ((flag & FEXCL) != 0) {
5401 		nvme_ioctl_lock_t lock = {
5402 			.nil_level = NVME_LOCK_L_WRITE,
5403 			.nil_flags = NVME_LOCK_F_DONT_BLOCK
5404 		};
5405 
5406 		if (minor->nm_ns != NULL) {
5407 			lock.nil_ent = NVME_LOCK_E_NS;
5408 			lock.nil_common.nioc_nsid = nsid;
5409 		} else {
5410 			lock.nil_ent = NVME_LOCK_E_CTRL;
5411 		}
5412 		nvme_rwlock(minor, &lock);
5413 		if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
5414 			mutex_exit(&nvme->n_minor_mutex);
5415 
5416 			mutex_enter(&nvme_open_minors_mutex);
5417 			avl_remove(&nvme_open_minors_avl, minor);
5418 			mutex_exit(&nvme_open_minors_mutex);
5419 
5420 			nvme_minor_free(minor);
5421 			return (EBUSY);
5422 		}
5423 	}
5424 	mutex_exit(&nvme->n_minor_mutex);
5425 
5426 	*devp = makedevice(getmajor(*devp), (minor_t)minor->nm_minor);
5427 	return (rv);
5428 
5429 }
5430 
5431 static int
5432 nvme_close(dev_t dev, int flag __unused, int otyp, cred_t *cred_p __unused)
5433 {
5434 	nvme_minor_t *minor;
5435 	nvme_t *nvme;
5436 
5437 	if (otyp != OTYP_CHR) {
5438 		return (ENXIO);
5439 	}
5440 
5441 	minor = nvme_minor_find_by_dev(dev);
5442 	if (minor == NULL) {
5443 		return (ENXIO);
5444 	}
5445 
5446 	mutex_enter(&nvme_open_minors_mutex);
5447 	avl_remove(&nvme_open_minors_avl, minor);
5448 	mutex_exit(&nvme_open_minors_mutex);
5449 
5450 	/*
5451 	 * When this device is being closed, we must ensure that any locks held
5452 	 * by this are dealt with.
5453 	 */
5454 	nvme = minor->nm_ctrl;
5455 	mutex_enter(&nvme->n_minor_mutex);
5456 	ASSERT3U(minor->nm_ctrl_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5457 	ASSERT3U(minor->nm_ns_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5458 
5459 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5460 		VERIFY3P(minor->nm_ctrl_lock.nli_lock, !=, NULL);
5461 		nvme_rwunlock(&minor->nm_ctrl_lock,
5462 		    minor->nm_ctrl_lock.nli_lock);
5463 	}
5464 
5465 	if (minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5466 		VERIFY3P(minor->nm_ns_lock.nli_lock, !=, NULL);
5467 		nvme_rwunlock(&minor->nm_ns_lock, minor->nm_ns_lock.nli_lock);
5468 	}
5469 	mutex_exit(&nvme->n_minor_mutex);
5470 
5471 	nvme_minor_free(minor);
5472 
5473 	return (0);
5474 }
5475 
5476 void
5477 nvme_ioctl_success(nvme_ioctl_common_t *ioc)
5478 {
5479 	ioc->nioc_drv_err = NVME_IOCTL_E_OK;
5480 	ioc->nioc_ctrl_sc = NVME_CQE_SC_GEN_SUCCESS;
5481 	ioc->nioc_ctrl_sct = NVME_CQE_SCT_GENERIC;
5482 }
5483 
5484 boolean_t
5485 nvme_ioctl_error(nvme_ioctl_common_t *ioc, nvme_ioctl_errno_t err, uint32_t sct,
5486     uint32_t sc)
5487 {
5488 	ioc->nioc_drv_err = err;
5489 	ioc->nioc_ctrl_sct = sct;
5490 	ioc->nioc_ctrl_sc = sc;
5491 
5492 	return (B_FALSE);
5493 }
5494 
5495 static int
5496 nvme_ioctl_copyout_error(nvme_ioctl_errno_t err, intptr_t uaddr, int mode)
5497 {
5498 	nvme_ioctl_common_t ioc;
5499 
5500 	ASSERT3U(err, !=, NVME_IOCTL_E_CTRL_ERROR);
5501 	bzero(&ioc, sizeof (ioc));
5502 	if (ddi_copyout(&ioc, (void *)uaddr, sizeof (nvme_ioctl_common_t),
5503 	    mode & FKIOCTL) != 0) {
5504 		return (EFAULT);
5505 	}
5506 	return (0);
5507 }
5508 
5509 
5510 /*
5511  * The companion to the namespace checking. This occurs after any rewriting
5512  * occurs. This is the primary point that we attempt to enforce any operation's
5513  * exclusivity. Note, it is theoretically possible for an operation to be
5514  * ongoing and to have someone with an exclusive lock ask to unlock it for some
5515  * reason. This does not maintain the number of such events that are going on.
5516  * While perhaps this is leaving too much up to the user, by the same token we
5517  * don't try to stop them from issuing two different format NVM commands
5518  * targeting the whole device at the same time either, even though the
5519  * controller would really rather that didn't happen.
5520  */
5521 static boolean_t
5522 nvme_ioctl_excl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5523     const nvme_ioctl_check_t *check)
5524 {
5525 	nvme_t *const nvme = minor->nm_ctrl;
5526 	nvme_namespace_t *ns;
5527 	boolean_t have_ctrl, have_ns, ctrl_is_excl, ns_is_excl;
5528 
5529 	/*
5530 	 * If the command doesn't require anything, then we're done.
5531 	 */
5532 	if (check->nck_excl == NVME_IOCTL_EXCL_SKIP) {
5533 		return (B_TRUE);
5534 	}
5535 
5536 	if (ioc->nioc_nsid == 0 || ioc->nioc_nsid == NVME_NSID_BCAST) {
5537 		ns = NULL;
5538 	} else {
5539 		ns = nvme_nsid2ns(nvme, ioc->nioc_nsid);
5540 	}
5541 
5542 	mutex_enter(&nvme->n_minor_mutex);
5543 	ctrl_is_excl = nvme->n_lock.nl_writer != NULL;
5544 	have_ctrl = nvme->n_lock.nl_writer == &minor->nm_ctrl_lock;
5545 	if (ns != NULL) {
5546 		/*
5547 		 * We explicitly test the namespace lock's writer versus asking
5548 		 * the minor because the minor's namespace lock may apply to a
5549 		 * different namespace.
5550 		 */
5551 		ns_is_excl = ns->ns_lock.nl_writer != NULL;
5552 		have_ns = ns->ns_lock.nl_writer == &minor->nm_ns_lock;
5553 		ASSERT0(have_ctrl && have_ns);
5554 #ifdef	DEBUG
5555 		if (have_ns) {
5556 			ASSERT3P(minor->nm_ns_lock.nli_ns, ==, ns);
5557 		}
5558 #endif
5559 	} else {
5560 		ns_is_excl = B_FALSE;
5561 		have_ns = B_FALSE;
5562 	}
5563 	ASSERT0(ctrl_is_excl && ns_is_excl);
5564 	mutex_exit(&nvme->n_minor_mutex);
5565 
5566 	if (check->nck_excl == NVME_IOCTL_EXCL_WRITE) {
5567 		if (ns == NULL) {
5568 			if (have_ctrl) {
5569 				return (B_TRUE);
5570 			}
5571 			return (nvme_ioctl_error(ioc,
5572 			    NVME_IOCTL_E_NEED_CTRL_WRLOCK, 0, 0));
5573 		} else {
5574 			if (have_ctrl || have_ns) {
5575 				return (B_TRUE);
5576 			}
5577 			return (nvme_ioctl_error(ioc,
5578 			    NVME_IOCTL_E_NEED_NS_WRLOCK, 0, 0));
5579 		}
5580 	}
5581 
5582 	/*
5583 	 * Now we have an operation that does not require exclusive access. We
5584 	 * can proceed as long as no one else has it or if someone does it is
5585 	 * us. Regardless of what we target, a controller lock will stop us.
5586 	 */
5587 	if (ctrl_is_excl && !have_ctrl) {
5588 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_LOCKED, 0, 0));
5589 	}
5590 
5591 	/*
5592 	 * Only check namespace exclusivity if we are targeting one.
5593 	 */
5594 	if (ns != NULL && ns_is_excl && !have_ns) {
5595 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_LOCKED, 0, 0));
5596 	}
5597 
5598 	return (B_TRUE);
5599 }
5600 
5601 /*
5602  * Perform common checking as to whether or not an ioctl operation may proceed.
5603  * We check in this function various aspects of the namespace attributes that
5604  * it's calling on. Once the namespace attributes and any possible rewriting
5605  * have been performed, then we proceed to check whether or not the requisite
5606  * exclusive access is present in nvme_ioctl_excl_check().
5607  */
5608 static boolean_t
5609 nvme_ioctl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5610     const nvme_ioctl_check_t *check)
5611 {
5612 	/*
5613 	 * If the minor has a namespace pointer, then it is constrained to that
5614 	 * namespace. If a namespace is allowed, then there are only two valid
5615 	 * values that we can find. The first is matching the minor. The second
5616 	 * is our value zero, which will be transformed to the current
5617 	 * namespace.
5618 	 */
5619 	if (minor->nm_ns != NULL) {
5620 		if (!check->nck_ns_ok || !check->nck_ns_minor_ok) {
5621 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NOT_CTRL, 0,
5622 			    0));
5623 		}
5624 
5625 		if (ioc->nioc_nsid == 0) {
5626 			ioc->nioc_nsid = minor->nm_ns->ns_id;
5627 		} else if (ioc->nioc_nsid != minor->nm_ns->ns_id) {
5628 			return (nvme_ioctl_error(ioc,
5629 			    NVME_IOCTL_E_MINOR_WRONG_NS, 0, 0));
5630 		}
5631 
5632 		return (nvme_ioctl_excl_check(minor, ioc, check));
5633 	}
5634 
5635 	/*
5636 	 * If we've been told to skip checking the controller, here's where we
5637 	 * do that. This should really only be for commands which use the
5638 	 * namespace ID for listing purposes and therefore can have
5639 	 * traditionally illegal values here.
5640 	 */
5641 	if (check->nck_skip_ctrl) {
5642 		return (nvme_ioctl_excl_check(minor, ioc, check));
5643 	}
5644 
5645 	/*
5646 	 * At this point, we know that we're on the controller's node. We first
5647 	 * deal with the simple case, is a namespace allowed at all or not. If
5648 	 * it is not allowed, then the only acceptable value is zero.
5649 	 */
5650 	if (!check->nck_ns_ok) {
5651 		if (ioc->nioc_nsid != 0) {
5652 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_UNUSE, 0,
5653 			    0));
5654 		}
5655 
5656 		return (nvme_ioctl_excl_check(minor, ioc, check));
5657 	}
5658 
5659 	/*
5660 	 * At this point, we know that a controller is allowed to use a
5661 	 * namespace. If we haven't been given zero or the broadcast namespace,
5662 	 * check to see if it's actually a valid namespace ID. If is outside of
5663 	 * range, then it is an error. Next, if we have been requested to
5664 	 * rewrite 0 (the this controller indicator) as the broadcast namespace,
5665 	 * do so.
5666 	 *
5667 	 * While we validate that this namespace is within the valid range, we
5668 	 * do not check if it is active or inactive. That is left to our callers
5669 	 * to determine.
5670 	 */
5671 	if (ioc->nioc_nsid > minor->nm_ctrl->n_namespace_count &&
5672 	    ioc->nioc_nsid != NVME_NSID_BCAST) {
5673 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_RANGE, 0, 0));
5674 	}
5675 
5676 	if (ioc->nioc_nsid == 0 && check->nck_ctrl_rewrite) {
5677 		ioc->nioc_nsid = NVME_NSID_BCAST;
5678 	}
5679 
5680 	/*
5681 	 * Finally, see if we have ended up with a broadcast namespace ID
5682 	 * whether through specification or rewriting. If that is not allowed,
5683 	 * then that is an error.
5684 	 */
5685 	if (!check->nck_bcast_ok && ioc->nioc_nsid == NVME_NSID_BCAST) {
5686 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_BCAST_NS, 0, 0));
5687 	}
5688 
5689 	return (nvme_ioctl_excl_check(minor, ioc, check));
5690 }
5691 
5692 static int
5693 nvme_ioctl_ctrl_info(nvme_minor_t *minor, intptr_t arg, int mode,
5694     cred_t *cred_p)
5695 {
5696 	nvme_t *const nvme = minor->nm_ctrl;
5697 	nvme_ioctl_ctrl_info_t *info;
5698 	nvme_reg_cap_t cap = { 0 };
5699 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_CTRL };
5700 	void *idbuf;
5701 
5702 	if ((mode & FREAD) == 0)
5703 		return (EBADF);
5704 
5705 	info = kmem_alloc(sizeof (nvme_ioctl_ctrl_info_t), KM_NOSLEEP_LAZY);
5706 	if (info == NULL) {
5707 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5708 		    mode));
5709 	}
5710 
5711 	if (ddi_copyin((void *)arg, info, sizeof (nvme_ioctl_ctrl_info_t),
5712 	    mode & FKIOCTL) != 0) {
5713 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5714 		return (EFAULT);
5715 	}
5716 
5717 	if (!nvme_ioctl_check(minor, &info->nci_common,
5718 	    &nvme_check_ctrl_info)) {
5719 		goto copyout;
5720 	}
5721 
5722 	/*
5723 	 * We explicitly do not use the identify controller copy in the kernel
5724 	 * right now so that way we can get a snapshot of the controller's
5725 	 * current capacity and values. While it's tempting to try to use this
5726 	 * to refresh the kernel's version we don't just to simplify the rest of
5727 	 * the driver right now.
5728 	 */
5729 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5730 		info->nci_common = id.nid_common;
5731 		goto copyout;
5732 	}
5733 	bcopy(idbuf, &info->nci_ctrl_id, sizeof (nvme_identify_ctrl_t));
5734 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5735 
5736 	/*
5737 	 * Use the kernel's cached common namespace information for this.
5738 	 */
5739 	bcopy(nvme->n_idcomns, &info->nci_common_ns,
5740 	    sizeof (nvme_identify_nsid_t));
5741 
5742 	info->nci_vers = nvme->n_version;
5743 
5744 	/*
5745 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
5746 	 * specify the base page size of 4k (1<<12), so add 12 here to
5747 	 * get the real page size value.
5748 	 */
5749 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
5750 	info->nci_caps.cap_mpsmax = 1 << (12 + cap.b.cap_mpsmax);
5751 	info->nci_caps.cap_mpsmin = 1 << (12 + cap.b.cap_mpsmin);
5752 
5753 	info->nci_nintrs = (uint32_t)nvme->n_intr_cnt;
5754 
5755 copyout:
5756 	if (ddi_copyout(info, (void *)arg, sizeof (nvme_ioctl_ctrl_info_t),
5757 	    mode & FKIOCTL) != 0) {
5758 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5759 		return (EFAULT);
5760 	}
5761 
5762 	kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5763 	return (0);
5764 }
5765 
5766 static int
5767 nvme_ioctl_ns_info(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5768 {
5769 	nvme_t *const nvme = minor->nm_ctrl;
5770 	nvme_ioctl_ns_info_t *ns_info;
5771 	nvme_namespace_t *ns;
5772 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_NSID };
5773 	void *idbuf;
5774 
5775 	if ((mode & FREAD) == 0)
5776 		return (EBADF);
5777 
5778 	ns_info = kmem_zalloc(sizeof (nvme_ioctl_ns_info_t), KM_NOSLEEP_LAZY);
5779 	if (ns_info == NULL) {
5780 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5781 		    mode));
5782 	}
5783 
5784 	if (ddi_copyin((void *)arg, ns_info, sizeof (nvme_ioctl_ns_info_t),
5785 	    mode & FKIOCTL) != 0) {
5786 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5787 		return (EFAULT);
5788 	}
5789 
5790 	if (!nvme_ioctl_check(minor, &ns_info->nni_common,
5791 	    &nvme_check_ns_info)) {
5792 		goto copyout;
5793 	}
5794 
5795 	ASSERT3U(ns_info->nni_common.nioc_nsid, >, 0);
5796 	ns = nvme_nsid2ns(nvme, ns_info->nni_common.nioc_nsid);
5797 
5798 	/*
5799 	 * First fetch a fresh copy of the namespace information. Most callers
5800 	 * are using this because they will want a mostly accurate snapshot of
5801 	 * capacity and utilization.
5802 	 */
5803 	id.nid_common.nioc_nsid = ns_info->nni_common.nioc_nsid;
5804 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5805 		ns_info->nni_common = id.nid_common;
5806 		goto copyout;
5807 	}
5808 	bcopy(idbuf, &ns_info->nni_id, sizeof (nvme_identify_nsid_t));
5809 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5810 
5811 	mutex_enter(&nvme->n_mgmt_mutex);
5812 	if (ns->ns_allocated)
5813 		ns_info->nni_state |= NVME_NS_STATE_ALLOCATED;
5814 
5815 	if (ns->ns_active)
5816 		ns_info->nni_state |= NVME_NS_STATE_ACTIVE;
5817 
5818 	if (ns->ns_ignore)
5819 		ns_info->nni_state |= NVME_NS_STATE_IGNORED;
5820 
5821 	if (ns->ns_attached) {
5822 		const char *addr;
5823 
5824 		ns_info->nni_state |= NVME_NS_STATE_ATTACHED;
5825 		addr = bd_address(ns->ns_bd_hdl);
5826 		if (strlcpy(ns_info->nni_addr, addr,
5827 		    sizeof (ns_info->nni_addr)) >= sizeof (ns_info->nni_addr)) {
5828 			mutex_exit(&nvme->n_mgmt_mutex);
5829 			(void) nvme_ioctl_error(&ns_info->nni_common,
5830 			    NVME_IOCTL_E_BD_ADDR_OVER, 0, 0);
5831 			goto copyout;
5832 		}
5833 	}
5834 	mutex_exit(&nvme->n_mgmt_mutex);
5835 
5836 copyout:
5837 	if (ddi_copyout(ns_info, (void *)arg, sizeof (nvme_ioctl_ns_info_t),
5838 	    mode & FKIOCTL) != 0) {
5839 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5840 		return (EFAULT);
5841 	}
5842 
5843 	kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5844 	return (0);
5845 }
5846 
5847 static int
5848 nvme_ioctl_identify(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5849 {
5850 	_NOTE(ARGUNUSED(cred_p));
5851 	nvme_t *const nvme = minor->nm_ctrl;
5852 	void *idctl;
5853 	uint_t model;
5854 	nvme_ioctl_identify_t id;
5855 #ifdef	_MULTI_DATAMODEL
5856 	nvme_ioctl_identify32_t id32;
5857 #endif
5858 	boolean_t ns_minor;
5859 
5860 	if ((mode & FREAD) == 0)
5861 		return (EBADF);
5862 
5863 	model = ddi_model_convert_from(mode);
5864 	switch (model) {
5865 #ifdef	_MULTI_DATAMODEL
5866 	case DDI_MODEL_ILP32:
5867 		bzero(&id, sizeof (id));
5868 		if (ddi_copyin((void *)arg, &id32, sizeof (id32),
5869 		    mode & FKIOCTL) != 0) {
5870 			return (EFAULT);
5871 		}
5872 		id.nid_common.nioc_nsid = id32.nid_common.nioc_nsid;
5873 		id.nid_cns = id32.nid_cns;
5874 		id.nid_ctrlid = id32.nid_ctrlid;
5875 		id.nid_data = id32.nid_data;
5876 		break;
5877 #endif	/* _MULTI_DATAMODEL */
5878 	case DDI_MODEL_NONE:
5879 		if (ddi_copyin((void *)arg, &id, sizeof (id),
5880 		    mode & FKIOCTL) != 0) {
5881 			return (EFAULT);
5882 		}
5883 		break;
5884 	default:
5885 		return (ENOTSUP);
5886 	}
5887 
5888 	if (!nvme_ioctl_check(minor, &id.nid_common, &nvme_check_identify)) {
5889 		goto copyout;
5890 	}
5891 
5892 	ns_minor = minor->nm_ns != NULL;
5893 	if (!nvme_validate_identify(nvme, &id, ns_minor)) {
5894 		goto copyout;
5895 	}
5896 
5897 	if (nvme_identify(nvme, B_TRUE, &id, &idctl)) {
5898 		int ret = ddi_copyout(idctl, (void *)id.nid_data,
5899 		    NVME_IDENTIFY_BUFSIZE, mode & FKIOCTL);
5900 		kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
5901 		if (ret != 0) {
5902 			(void) nvme_ioctl_error(&id.nid_common,
5903 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
5904 			goto copyout;
5905 		}
5906 
5907 		nvme_ioctl_success(&id.nid_common);
5908 	}
5909 
5910 copyout:
5911 	switch (model) {
5912 #ifdef	_MULTI_DATAMODEL
5913 	case DDI_MODEL_ILP32:
5914 		id32.nid_common = id.nid_common;
5915 
5916 		if (ddi_copyout(&id32, (void *)arg, sizeof (id32),
5917 		    mode & FKIOCTL) != 0) {
5918 			return (EFAULT);
5919 		}
5920 		break;
5921 #endif	/* _MULTI_DATAMODEL */
5922 	case DDI_MODEL_NONE:
5923 		if (ddi_copyout(&id, (void *)arg, sizeof (id),
5924 		    mode & FKIOCTL) != 0) {
5925 			return (EFAULT);
5926 		}
5927 		break;
5928 	default:
5929 		return (ENOTSUP);
5930 	}
5931 
5932 	return (0);
5933 }
5934 
5935 /*
5936  * Execute commands on behalf of the various ioctls.
5937  *
5938  * If this returns true then the command completed successfully. Otherwise error
5939  * information is returned in the nvme_ioctl_common_t arguments.
5940  */
5941 typedef struct {
5942 	nvme_sqe_t *ica_sqe;
5943 	void *ica_data;
5944 	uint32_t ica_data_len;
5945 	uint_t ica_dma_flags;
5946 	int ica_copy_flags;
5947 	uint32_t ica_timeout;
5948 	uint32_t ica_cdw0;
5949 } nvme_ioc_cmd_args_t;
5950 
5951 static boolean_t
5952 nvme_ioc_cmd(nvme_t *nvme, nvme_ioctl_common_t *ioc, nvme_ioc_cmd_args_t *args)
5953 {
5954 	nvme_cmd_t *cmd;
5955 	boolean_t ret = B_FALSE;
5956 
5957 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
5958 	cmd->nc_sqid = 0;
5959 
5960 	/*
5961 	 * This function is used to facilitate requests from
5962 	 * userspace, so don't panic if the command fails. This
5963 	 * is especially true for admin passthru commands, where
5964 	 * the actual command data structure is entirely defined
5965 	 * by userspace.
5966 	 */
5967 	cmd->nc_dontpanic = B_TRUE;
5968 
5969 	cmd->nc_callback = nvme_wakeup_cmd;
5970 	cmd->nc_sqe = *args->ica_sqe;
5971 
5972 	if ((args->ica_dma_flags & DDI_DMA_RDWR) != 0) {
5973 		if (args->ica_data == NULL) {
5974 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_DMA_MEM,
5975 			    0, 0);
5976 			goto free_cmd;
5977 		}
5978 
5979 		if (nvme_zalloc_dma(nvme, args->ica_data_len,
5980 		    args->ica_dma_flags, &nvme->n_prp_dma_attr, &cmd->nc_dma) !=
5981 		    DDI_SUCCESS) {
5982 			dev_err(nvme->n_dip, CE_WARN,
5983 			    "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
5984 			ret = nvme_ioctl_error(ioc,
5985 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5986 			goto free_cmd;
5987 		}
5988 
5989 		if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
5990 			ret = nvme_ioctl_error(ioc,
5991 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5992 			goto free_cmd;
5993 		}
5994 
5995 		if ((args->ica_dma_flags & DDI_DMA_WRITE) != 0 &&
5996 		    ddi_copyin(args->ica_data, cmd->nc_dma->nd_memp,
5997 		    args->ica_data_len, args->ica_copy_flags) != 0) {
5998 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA,
5999 			    0, 0);
6000 			goto free_cmd;
6001 		}
6002 	}
6003 
6004 	nvme_admin_cmd(cmd, args->ica_timeout);
6005 
6006 	if (!nvme_check_cmd_status_ioctl(cmd, ioc)) {
6007 		ret = B_FALSE;
6008 		goto free_cmd;
6009 	}
6010 
6011 	args->ica_cdw0 = cmd->nc_cqe.cqe_dw0;
6012 
6013 	if ((args->ica_dma_flags & DDI_DMA_READ) != 0 &&
6014 	    ddi_copyout(cmd->nc_dma->nd_memp, args->ica_data,
6015 	    args->ica_data_len, args->ica_copy_flags) != 0) {
6016 		ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6017 		goto free_cmd;
6018 	}
6019 
6020 	ret = B_TRUE;
6021 	nvme_ioctl_success(ioc);
6022 
6023 free_cmd:
6024 	nvme_free_cmd(cmd);
6025 
6026 	return (ret);
6027 }
6028 
6029 static int
6030 nvme_ioctl_get_logpage(nvme_minor_t *minor, intptr_t arg, int mode,
6031     cred_t *cred_p)
6032 {
6033 	nvme_t *const nvme = minor->nm_ctrl;
6034 	void *buf;
6035 	nvme_ioctl_get_logpage_t log;
6036 	uint_t model;
6037 #ifdef	_MULTI_DATAMODEL
6038 	nvme_ioctl_get_logpage32_t log32;
6039 #endif
6040 
6041 	if ((mode & FREAD) == 0) {
6042 		return (EBADF);
6043 	}
6044 
6045 	model = ddi_model_convert_from(mode);
6046 	switch (model) {
6047 #ifdef	_MULTI_DATAMODEL
6048 	case DDI_MODEL_ILP32:
6049 		bzero(&log, sizeof (log));
6050 		if (ddi_copyin((void *)arg, &log32, sizeof (log32),
6051 		    mode & FKIOCTL) != 0) {
6052 			return (EFAULT);
6053 		}
6054 
6055 		log.nigl_common.nioc_nsid = log32.nigl_common.nioc_nsid;
6056 		log.nigl_csi = log32.nigl_csi;
6057 		log.nigl_lid = log32.nigl_lid;
6058 		log.nigl_lsp = log32.nigl_lsp;
6059 		log.nigl_len = log32.nigl_len;
6060 		log.nigl_offset = log32.nigl_offset;
6061 		log.nigl_data = log32.nigl_data;
6062 		break;
6063 #endif	/* _MULTI_DATAMODEL */
6064 	case DDI_MODEL_NONE:
6065 		if (ddi_copyin((void *)arg, &log, sizeof (log),
6066 		    mode & FKIOCTL) != 0) {
6067 			return (EFAULT);
6068 		}
6069 		break;
6070 	default:
6071 		return (ENOTSUP);
6072 	}
6073 
6074 	/*
6075 	 * Eventually we'd like to do a soft lock on the namespaces from
6076 	 * changing out from us during this operation in the future. But we
6077 	 * haven't implemented that yet.
6078 	 */
6079 	if (!nvme_ioctl_check(minor, &log.nigl_common,
6080 	    &nvme_check_get_logpage)) {
6081 		goto copyout;
6082 	}
6083 
6084 	if (!nvme_validate_logpage(nvme, &log)) {
6085 		goto copyout;
6086 	}
6087 
6088 	if (nvme_get_logpage(nvme, B_TRUE, &log, &buf)) {
6089 		int copy;
6090 
6091 		copy = ddi_copyout(buf, (void *)log.nigl_data, log.nigl_len,
6092 		    mode & FKIOCTL);
6093 		kmem_free(buf, log.nigl_len);
6094 		if (copy != 0) {
6095 			(void) nvme_ioctl_error(&log.nigl_common,
6096 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6097 			goto copyout;
6098 		}
6099 
6100 		nvme_ioctl_success(&log.nigl_common);
6101 	}
6102 
6103 copyout:
6104 	switch (model) {
6105 #ifdef	_MULTI_DATAMODEL
6106 	case DDI_MODEL_ILP32:
6107 		bzero(&log32, sizeof (log32));
6108 
6109 		log32.nigl_common = log.nigl_common;
6110 		log32.nigl_csi = log.nigl_csi;
6111 		log32.nigl_lid = log.nigl_lid;
6112 		log32.nigl_lsp = log.nigl_lsp;
6113 		log32.nigl_len = log.nigl_len;
6114 		log32.nigl_offset = log.nigl_offset;
6115 		log32.nigl_data = log.nigl_data;
6116 		if (ddi_copyout(&log32, (void *)arg, sizeof (log32),
6117 		    mode & FKIOCTL) != 0) {
6118 			return (EFAULT);
6119 		}
6120 		break;
6121 #endif	/* _MULTI_DATAMODEL */
6122 	case DDI_MODEL_NONE:
6123 		if (ddi_copyout(&log, (void *)arg, sizeof (log),
6124 		    mode & FKIOCTL) != 0) {
6125 			return (EFAULT);
6126 		}
6127 		break;
6128 	default:
6129 		return (ENOTSUP);
6130 	}
6131 
6132 	return (0);
6133 }
6134 
6135 static int
6136 nvme_ioctl_get_feature(nvme_minor_t *minor, intptr_t arg, int mode,
6137     cred_t *cred_p)
6138 {
6139 	nvme_t *const nvme = minor->nm_ctrl;
6140 	nvme_ioctl_get_feature_t feat;
6141 	uint_t model;
6142 #ifdef	_MULTI_DATAMODEL
6143 	nvme_ioctl_get_feature32_t feat32;
6144 #endif
6145 	nvme_get_features_dw10_t gf_dw10 = { 0 };
6146 	nvme_ioc_cmd_args_t args = { NULL };
6147 	nvme_sqe_t sqe = {
6148 	    .sqe_opc	= NVME_OPC_GET_FEATURES
6149 	};
6150 
6151 	if ((mode & FREAD) == 0) {
6152 		return (EBADF);
6153 	}
6154 
6155 	model = ddi_model_convert_from(mode);
6156 	switch (model) {
6157 #ifdef	_MULTI_DATAMODEL
6158 	case DDI_MODEL_ILP32:
6159 		bzero(&feat, sizeof (feat));
6160 		if (ddi_copyin((void *)arg, &feat32, sizeof (feat32),
6161 		    mode & FKIOCTL) != 0) {
6162 			return (EFAULT);
6163 		}
6164 
6165 		feat.nigf_common.nioc_nsid = feat32.nigf_common.nioc_nsid;
6166 		feat.nigf_fid = feat32.nigf_fid;
6167 		feat.nigf_sel = feat32.nigf_sel;
6168 		feat.nigf_cdw11 = feat32.nigf_cdw11;
6169 		feat.nigf_data = feat32.nigf_data;
6170 		feat.nigf_len = feat32.nigf_len;
6171 		break;
6172 #endif	/* _MULTI_DATAMODEL */
6173 	case DDI_MODEL_NONE:
6174 		if (ddi_copyin((void *)arg, &feat, sizeof (feat),
6175 		    mode & FKIOCTL) != 0) {
6176 			return (EFAULT);
6177 		}
6178 		break;
6179 	default:
6180 		return (ENOTSUP);
6181 	}
6182 
6183 	if (!nvme_ioctl_check(minor, &feat.nigf_common,
6184 	    &nvme_check_get_feature)) {
6185 		goto copyout;
6186 	}
6187 
6188 	if (!nvme_validate_get_feature(nvme, &feat)) {
6189 		goto copyout;
6190 	}
6191 
6192 	gf_dw10.b.gt_fid = bitx32(feat.nigf_fid, 7, 0);
6193 	gf_dw10.b.gt_sel = bitx32(feat.nigf_sel, 2, 0);
6194 	sqe.sqe_cdw10 = gf_dw10.r;
6195 	sqe.sqe_cdw11 = feat.nigf_cdw11;
6196 	sqe.sqe_nsid = feat.nigf_common.nioc_nsid;
6197 
6198 	args.ica_sqe = &sqe;
6199 	if (feat.nigf_len != 0) {
6200 		args.ica_data = (void *)feat.nigf_data;
6201 		args.ica_data_len = feat.nigf_len;
6202 		args.ica_dma_flags = DDI_DMA_READ;
6203 	}
6204 	args.ica_copy_flags = mode;
6205 	args.ica_timeout = nvme_admin_cmd_timeout;
6206 
6207 	if (!nvme_ioc_cmd(nvme, &feat.nigf_common, &args)) {
6208 		goto copyout;
6209 	}
6210 
6211 	feat.nigf_cdw0 = args.ica_cdw0;
6212 
6213 copyout:
6214 	switch (model) {
6215 #ifdef	_MULTI_DATAMODEL
6216 	case DDI_MODEL_ILP32:
6217 		bzero(&feat32, sizeof (feat32));
6218 
6219 		feat32.nigf_common = feat.nigf_common;
6220 		feat32.nigf_fid = feat.nigf_fid;
6221 		feat32.nigf_sel = feat.nigf_sel;
6222 		feat32.nigf_cdw11 = feat.nigf_cdw11;
6223 		feat32.nigf_data = feat.nigf_data;
6224 		feat32.nigf_len = feat.nigf_len;
6225 		feat32.nigf_cdw0 = feat.nigf_cdw0;
6226 		if (ddi_copyout(&feat32, (void *)arg, sizeof (feat32),
6227 		    mode & FKIOCTL) != 0) {
6228 			return (EFAULT);
6229 		}
6230 		break;
6231 #endif	/* _MULTI_DATAMODEL */
6232 	case DDI_MODEL_NONE:
6233 		if (ddi_copyout(&feat, (void *)arg, sizeof (feat),
6234 		    mode & FKIOCTL) != 0) {
6235 			return (EFAULT);
6236 		}
6237 		break;
6238 	default:
6239 		return (ENOTSUP);
6240 	}
6241 
6242 	return (0);
6243 }
6244 
6245 static int
6246 nvme_ioctl_format(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6247 {
6248 	nvme_t *const nvme = minor->nm_ctrl;
6249 	nvme_ioctl_format_t ioc;
6250 
6251 	if ((mode & FWRITE) == 0)
6252 		return (EBADF);
6253 
6254 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6255 		return (EPERM);
6256 
6257 	if (ddi_copyin((void *)(uintptr_t)arg, &ioc,
6258 	    sizeof (nvme_ioctl_format_t), mode & FKIOCTL) != 0)
6259 		return (EFAULT);
6260 
6261 	if (!nvme_ioctl_check(minor, &ioc.nif_common, &nvme_check_format)) {
6262 		goto copyout;
6263 	}
6264 
6265 	if (!nvme_validate_format(nvme, &ioc)) {
6266 		goto copyout;
6267 	}
6268 
6269 	mutex_enter(&nvme->n_mgmt_mutex);
6270 	if (!nvme_no_blkdev_attached(nvme, ioc.nif_common.nioc_nsid)) {
6271 		mutex_exit(&nvme->n_mgmt_mutex);
6272 		(void) nvme_ioctl_error(&ioc.nif_common,
6273 		    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6274 		goto copyout;
6275 	}
6276 
6277 	if (nvme_format_nvm(nvme, &ioc)) {
6278 		nvme_ioctl_success(&ioc.nif_common);
6279 		nvme_rescan_ns(nvme, ioc.nif_common.nioc_nsid);
6280 	}
6281 	mutex_exit(&nvme->n_mgmt_mutex);
6282 
6283 copyout:
6284 	if (ddi_copyout(&ioc, (void *)(uintptr_t)arg, sizeof (ioc),
6285 	    mode & FKIOCTL) != 0) {
6286 		return (EFAULT);
6287 	}
6288 
6289 	return (0);
6290 }
6291 
6292 static int
6293 nvme_ioctl_detach(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6294 {
6295 	nvme_t *const nvme = minor->nm_ctrl;
6296 	nvme_ioctl_common_t com;
6297 
6298 	if ((mode & FWRITE) == 0)
6299 		return (EBADF);
6300 
6301 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6302 		return (EPERM);
6303 
6304 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6305 	    mode & FKIOCTL) != 0) {
6306 		return (EFAULT);
6307 	}
6308 
6309 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6310 		goto copyout;
6311 	}
6312 
6313 	mutex_enter(&nvme->n_mgmt_mutex);
6314 	if (nvme_detach_ns(nvme, &com)) {
6315 		nvme_ioctl_success(&com);
6316 	}
6317 	mutex_exit(&nvme->n_mgmt_mutex);
6318 
6319 copyout:
6320 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6321 	    mode & FKIOCTL) != 0) {
6322 		return (EFAULT);
6323 	}
6324 
6325 	return (0);
6326 }
6327 
6328 static int
6329 nvme_ioctl_attach(nvme_minor_t *minor, intptr_t arg, int mode,
6330     cred_t *cred_p)
6331 {
6332 	nvme_t *const nvme = minor->nm_ctrl;
6333 	nvme_ioctl_common_t com;
6334 	nvme_namespace_t *ns;
6335 
6336 	if ((mode & FWRITE) == 0)
6337 		return (EBADF);
6338 
6339 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6340 		return (EPERM);
6341 
6342 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6343 	    mode & FKIOCTL) != 0) {
6344 		return (EFAULT);
6345 	}
6346 
6347 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6348 		goto copyout;
6349 	}
6350 
6351 	mutex_enter(&nvme->n_mgmt_mutex);
6352 	ns = nvme_nsid2ns(nvme, com.nioc_nsid);
6353 
6354 	/*
6355 	 * Strictly speaking we shouldn't need to call nvme_init_ns() here as
6356 	 * we should be properly refreshing the internal state when we are
6357 	 * issuing commands that change things. However, we opt to still do so
6358 	 * as a bit of a safety check lest we give the kernel something bad or a
6359 	 * vendor unique command somehow did something behind our backs.
6360 	 */
6361 	if (!ns->ns_attached) {
6362 		(void) nvme_rescan_ns(nvme, com.nioc_nsid);
6363 		if (nvme_attach_ns(nvme, &com)) {
6364 			nvme_ioctl_success(&com);
6365 		}
6366 	} else {
6367 		nvme_ioctl_success(&com);
6368 	}
6369 	mutex_exit(&nvme->n_mgmt_mutex);
6370 
6371 copyout:
6372 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6373 	    mode & FKIOCTL) != 0) {
6374 		return (EFAULT);
6375 	}
6376 
6377 	return (0);
6378 }
6379 
6380 static void
6381 nvme_ufm_update(nvme_t *nvme)
6382 {
6383 	mutex_enter(&nvme->n_fwslot_mutex);
6384 	ddi_ufm_update(nvme->n_ufmh);
6385 	if (nvme->n_fwslot != NULL) {
6386 		kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
6387 		nvme->n_fwslot = NULL;
6388 	}
6389 	mutex_exit(&nvme->n_fwslot_mutex);
6390 }
6391 
6392 /*
6393  * Download new firmware to the device's internal staging area. We do not call
6394  * nvme_ufm_update() here because after a firmware download, there has been no
6395  * change to any of the actual persistent firmware data. That requires a
6396  * subsequent ioctl (NVME_IOC_FIRMWARE_COMMIT) to commit the firmware to a slot
6397  * or to activate a slot.
6398  */
6399 static int
6400 nvme_ioctl_firmware_download(nvme_minor_t *minor, intptr_t arg, int mode,
6401     cred_t *cred_p)
6402 {
6403 	nvme_t *const nvme = minor->nm_ctrl;
6404 	nvme_ioctl_fw_load_t fw;
6405 	uint64_t len, maxcopy;
6406 	offset_t offset;
6407 	uint32_t gran;
6408 	nvme_valid_ctrl_data_t data;
6409 	uintptr_t buf;
6410 	nvme_sqe_t sqe = {
6411 	    .sqe_opc	= NVME_OPC_FW_IMAGE_LOAD
6412 	};
6413 
6414 	if ((mode & FWRITE) == 0)
6415 		return (EBADF);
6416 
6417 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6418 		return (EPERM);
6419 
6420 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6421 	    mode & FKIOCTL) != 0) {
6422 		return (EFAULT);
6423 	}
6424 
6425 	if (!nvme_ioctl_check(minor, &fw.fwl_common, &nvme_check_firmware)) {
6426 		goto copyout;
6427 	}
6428 
6429 	if (!nvme_validate_fw_load(nvme, &fw)) {
6430 		goto copyout;
6431 	}
6432 
6433 	len = fw.fwl_len;
6434 	offset = fw.fwl_off;
6435 	buf = fw.fwl_buf;
6436 
6437 	/*
6438 	 * We need to determine the minimum and maximum amount of data that we
6439 	 * will send to the device in a given go. Starting in NMVe 1.3 this must
6440 	 * be a multiple of the firmware update granularity (FWUG), but must not
6441 	 * exceed the maximum data transfer that we've set. Many devices don't
6442 	 * report something here, which means we'll end up getting our default
6443 	 * value. Our policy is a little simple, but it's basically if the
6444 	 * maximum data transfer is evenly divided by the granularity, then use
6445 	 * it. Otherwise we use the granularity itself. The granularity is
6446 	 * always in page sized units, so trying to find another optimum point
6447 	 * isn't worth it. If we encounter a contradiction, then we will have to
6448 	 * error out.
6449 	 */
6450 	data.vcd_vers = &nvme->n_version;
6451 	data.vcd_id = nvme->n_idctl;
6452 	gran = nvme_fw_load_granularity(&data);
6453 
6454 	if ((nvme->n_max_data_transfer_size % gran) == 0) {
6455 		maxcopy = nvme->n_max_data_transfer_size;
6456 	} else if (gran <= nvme->n_max_data_transfer_size) {
6457 		maxcopy = gran;
6458 	} else {
6459 		(void) nvme_ioctl_error(&fw.fwl_common,
6460 		    NVME_IOCTL_E_FW_LOAD_IMPOS_GRAN, 0, 0);
6461 		goto copyout;
6462 	}
6463 
6464 	while (len > 0) {
6465 		nvme_ioc_cmd_args_t args = { NULL };
6466 		uint64_t copylen = MIN(maxcopy, len);
6467 
6468 		sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
6469 		sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
6470 
6471 		args.ica_sqe = &sqe;
6472 		args.ica_data = (void *)buf;
6473 		args.ica_data_len = copylen;
6474 		args.ica_dma_flags = DDI_DMA_WRITE;
6475 		args.ica_copy_flags = mode;
6476 		args.ica_timeout = nvme_admin_cmd_timeout;
6477 
6478 		if (!nvme_ioc_cmd(nvme, &fw.fwl_common, &args)) {
6479 			break;
6480 		}
6481 
6482 		buf += copylen;
6483 		offset += copylen;
6484 		len -= copylen;
6485 	}
6486 
6487 copyout:
6488 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6489 	    mode & FKIOCTL) != 0) {
6490 		return (EFAULT);
6491 	}
6492 
6493 	return (0);
6494 }
6495 
6496 static int
6497 nvme_ioctl_firmware_commit(nvme_minor_t *minor, intptr_t arg, int mode,
6498     cred_t *cred_p)
6499 {
6500 	nvme_t *const nvme = minor->nm_ctrl;
6501 	nvme_ioctl_fw_commit_t fw;
6502 	nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
6503 	nvme_ioc_cmd_args_t args = { NULL };
6504 	nvme_sqe_t sqe = {
6505 	    .sqe_opc	= NVME_OPC_FW_ACTIVATE
6506 	};
6507 
6508 	if ((mode & FWRITE) == 0)
6509 		return (EBADF);
6510 
6511 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6512 		return (EPERM);
6513 
6514 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6515 	    mode & FKIOCTL) != 0) {
6516 		return (EFAULT);
6517 	}
6518 
6519 	if (!nvme_ioctl_check(minor, &fw.fwc_common, &nvme_check_firmware)) {
6520 		goto copyout;
6521 	}
6522 
6523 	if (!nvme_validate_fw_commit(nvme, &fw)) {
6524 		goto copyout;
6525 	}
6526 
6527 	fc_dw10.b.fc_slot = fw.fwc_slot;
6528 	fc_dw10.b.fc_action = fw.fwc_action;
6529 	sqe.sqe_cdw10 = fc_dw10.r;
6530 
6531 	args.ica_sqe = &sqe;
6532 	args.ica_timeout = nvme_commit_save_cmd_timeout;
6533 
6534 	/*
6535 	 * There are no conditional actions to take based on this succeeding or
6536 	 * failing. A failure is recorded in the ioctl structure returned to the
6537 	 * user.
6538 	 */
6539 	(void) nvme_ioc_cmd(nvme, &fw.fwc_common, &args);
6540 
6541 	/*
6542 	 * Let the DDI UFM subsystem know that the firmware information for
6543 	 * this device has changed. We perform this unconditionally as an
6544 	 * invalidation doesn't particularly hurt us.
6545 	 */
6546 	nvme_ufm_update(nvme);
6547 
6548 copyout:
6549 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6550 	    mode & FKIOCTL) != 0) {
6551 		return (EFAULT);
6552 	}
6553 
6554 	return (0);
6555 }
6556 
6557 /*
6558  * Helper to copy in a passthru command from userspace, handling
6559  * different data models.
6560  */
6561 static int
6562 nvme_passthru_copyin_cmd(const void *buf, nvme_ioctl_passthru_t *cmd, int mode)
6563 {
6564 	switch (ddi_model_convert_from(mode & FMODELS)) {
6565 #ifdef _MULTI_DATAMODEL
6566 	case DDI_MODEL_ILP32: {
6567 		nvme_ioctl_passthru32_t cmd32;
6568 
6569 		if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0)
6570 			return (EFAULT);
6571 
6572 		bzero(cmd, sizeof (nvme_ioctl_passthru_t));
6573 
6574 		cmd->npc_common.nioc_nsid = cmd32.npc_common.nioc_nsid;
6575 		cmd->npc_opcode = cmd32.npc_opcode;
6576 		cmd->npc_timeout = cmd32.npc_timeout;
6577 		cmd->npc_flags = cmd32.npc_flags;
6578 		cmd->npc_impact = cmd32.npc_impact;
6579 		cmd->npc_cdw12 = cmd32.npc_cdw12;
6580 		cmd->npc_cdw13 = cmd32.npc_cdw13;
6581 		cmd->npc_cdw14 = cmd32.npc_cdw14;
6582 		cmd->npc_cdw15 = cmd32.npc_cdw15;
6583 		cmd->npc_buflen = cmd32.npc_buflen;
6584 		cmd->npc_buf = cmd32.npc_buf;
6585 		break;
6586 	}
6587 #endif	/* _MULTI_DATAMODEL */
6588 	case DDI_MODEL_NONE:
6589 		if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_ioctl_passthru_t),
6590 		    mode) != 0) {
6591 			return (EFAULT);
6592 		}
6593 		break;
6594 	default:
6595 		return (ENOTSUP);
6596 	}
6597 
6598 	return (0);
6599 }
6600 
6601 /*
6602  * Helper to copy out a passthru command result to userspace, handling
6603  * different data models.
6604  */
6605 static int
6606 nvme_passthru_copyout_cmd(const nvme_ioctl_passthru_t *cmd, void *buf, int mode)
6607 {
6608 	switch (ddi_model_convert_from(mode & FMODELS)) {
6609 #ifdef _MULTI_DATAMODEL
6610 	case DDI_MODEL_ILP32: {
6611 		nvme_ioctl_passthru32_t cmd32;
6612 
6613 		bzero(&cmd32, sizeof (nvme_ioctl_passthru32_t));
6614 
6615 		cmd32.npc_common = cmd->npc_common;
6616 		cmd32.npc_opcode = cmd->npc_opcode;
6617 		cmd32.npc_timeout = cmd->npc_timeout;
6618 		cmd32.npc_flags = cmd->npc_flags;
6619 		cmd32.npc_impact = cmd->npc_impact;
6620 		cmd32.npc_cdw0 = cmd->npc_cdw0;
6621 		cmd32.npc_cdw12 = cmd->npc_cdw12;
6622 		cmd32.npc_cdw13 = cmd->npc_cdw13;
6623 		cmd32.npc_cdw14 = cmd->npc_cdw14;
6624 		cmd32.npc_cdw15 = cmd->npc_cdw15;
6625 		cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
6626 		cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;
6627 		if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
6628 			return (EFAULT);
6629 		break;
6630 	}
6631 #endif	/* _MULTI_DATAMODEL */
6632 	case DDI_MODEL_NONE:
6633 		if (ddi_copyout(cmd, buf, sizeof (nvme_ioctl_passthru_t),
6634 		    mode) != 0) {
6635 			return (EFAULT);
6636 		}
6637 		break;
6638 	default:
6639 		return (ENOTSUP);
6640 	}
6641 	return (0);
6642 }
6643 
6644 /*
6645  * Run an arbitrary vendor-specific admin command on the device.
6646  */
6647 static int
6648 nvme_ioctl_passthru(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6649 {
6650 	nvme_t *const nvme = minor->nm_ctrl;
6651 	int rv;
6652 	nvme_ioctl_passthru_t pass;
6653 	nvme_sqe_t sqe;
6654 	nvme_ioc_cmd_args_t args = { NULL };
6655 
6656 	/*
6657 	 * Basic checks: permissions, data model, argument size.
6658 	 */
6659 	if ((mode & FWRITE) == 0)
6660 		return (EBADF);
6661 
6662 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6663 		return (EPERM);
6664 
6665 	if ((rv = nvme_passthru_copyin_cmd((void *)(uintptr_t)arg, &pass,
6666 	    mode)) != 0) {
6667 		return (rv);
6668 	}
6669 
6670 	if (!nvme_ioctl_check(minor, &pass.npc_common, &nvme_check_passthru)) {
6671 		goto copyout;
6672 	}
6673 
6674 	if (!nvme_validate_vuc(nvme, &pass)) {
6675 		goto copyout;
6676 	}
6677 
6678 	mutex_enter(&nvme->n_mgmt_mutex);
6679 	if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6680 		/*
6681 		 * We've been told this has ns impact. Right now force that to
6682 		 * be every ns until we have more use cases and reason to trust
6683 		 * the nsid field.
6684 		 */
6685 		if (!nvme_no_blkdev_attached(nvme, NVME_NSID_BCAST)) {
6686 			mutex_exit(&nvme->n_mgmt_mutex);
6687 			(void) nvme_ioctl_error(&pass.npc_common,
6688 			    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6689 			goto copyout;
6690 		}
6691 	}
6692 
6693 	bzero(&sqe, sizeof (sqe));
6694 
6695 	sqe.sqe_opc = pass.npc_opcode;
6696 	sqe.sqe_nsid = pass.npc_common.nioc_nsid;
6697 	sqe.sqe_cdw10 = (uint32_t)(pass.npc_buflen >> NVME_DWORD_SHIFT);
6698 	sqe.sqe_cdw12 = pass.npc_cdw12;
6699 	sqe.sqe_cdw13 = pass.npc_cdw13;
6700 	sqe.sqe_cdw14 = pass.npc_cdw14;
6701 	sqe.sqe_cdw15 = pass.npc_cdw15;
6702 
6703 	args.ica_sqe = &sqe;
6704 	args.ica_data = (void *)pass.npc_buf;
6705 	args.ica_data_len = pass.npc_buflen;
6706 	args.ica_copy_flags = mode;
6707 	args.ica_timeout = pass.npc_timeout;
6708 
6709 	if ((pass.npc_flags & NVME_PASSTHRU_READ) != 0)
6710 		args.ica_dma_flags |= DDI_DMA_READ;
6711 	else if ((pass.npc_flags & NVME_PASSTHRU_WRITE) != 0)
6712 		args.ica_dma_flags |= DDI_DMA_WRITE;
6713 
6714 	if (nvme_ioc_cmd(nvme, &pass.npc_common, &args)) {
6715 		pass.npc_cdw0 = args.ica_cdw0;
6716 		if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6717 			nvme_rescan_ns(nvme, NVME_NSID_BCAST);
6718 		}
6719 	}
6720 	mutex_exit(&nvme->n_mgmt_mutex);
6721 
6722 copyout:
6723 	rv = nvme_passthru_copyout_cmd(&pass, (void *)(uintptr_t)arg,
6724 	    mode);
6725 
6726 	return (rv);
6727 }
6728 
6729 static int
6730 nvme_ioctl_lock(nvme_minor_t *minor, intptr_t arg, int mode,
6731     cred_t *cred_p)
6732 {
6733 	nvme_ioctl_lock_t lock;
6734 	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;
6735 	nvme_t *nvme = minor->nm_ctrl;
6736 
6737 	if ((mode & FWRITE) == 0)
6738 		return (EBADF);
6739 
6740 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6741 		return (EPERM);
6742 
6743 	if (ddi_copyin((void *)(uintptr_t)arg, &lock, sizeof (lock),
6744 	    mode & FKIOCTL) != 0) {
6745 		return (EFAULT);
6746 	}
6747 
6748 	if (lock.nil_ent != NVME_LOCK_E_CTRL &&
6749 	    lock.nil_ent != NVME_LOCK_E_NS) {
6750 		(void) nvme_ioctl_error(&lock.nil_common,
6751 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
6752 		goto copyout;
6753 	}
6754 
6755 	if (lock.nil_level != NVME_LOCK_L_READ &&
6756 	    lock.nil_level != NVME_LOCK_L_WRITE) {
6757 		(void) nvme_ioctl_error(&lock.nil_common,
6758 		    NVME_IOCTL_E_BAD_LOCK_LEVEL, 0, 0);
6759 		goto copyout;
6760 	}
6761 
6762 	if ((lock.nil_flags & ~all_flags) != 0) {
6763 		(void) nvme_ioctl_error(&lock.nil_common,
6764 		    NVME_IOCTL_E_BAD_LOCK_FLAGS, 0, 0);
6765 		goto copyout;
6766 	}
6767 
6768 	if (!nvme_ioctl_check(minor, &lock.nil_common, &nvme_check_locking)) {
6769 		goto copyout;
6770 	}
6771 
6772 	/*
6773 	 * If we're on a namespace, confirm that we're not asking for the
6774 	 * controller.
6775 	 */
6776 	if (lock.nil_common.nioc_nsid != 0 &&
6777 	    lock.nil_ent == NVME_LOCK_E_CTRL) {
6778 		(void) nvme_ioctl_error(&lock.nil_common,
6779 		    NVME_IOCTL_E_NS_CANNOT_LOCK_CTRL, 0, 0);
6780 		goto copyout;
6781 	}
6782 
6783 	/*
6784 	 * We've reached the point where we can no longer actually check things
6785 	 * without serializing state. First, we need to check to make sure that
6786 	 * none of our invariants are being broken for locking:
6787 	 *
6788 	 * 1) The caller isn't already blocking for a lock operation to
6789 	 * complete.
6790 	 *
6791 	 * 2) The caller is attempting to grab a lock that they already have.
6792 	 * While there are other rule violations that this might create, we opt
6793 	 * to check this ahead of it so we can have slightly better error
6794 	 * messages for our callers.
6795 	 *
6796 	 * 3) The caller is trying to grab a controller lock, while holding a
6797 	 * namespace lock.
6798 	 *
6799 	 * 4) The caller has a controller write lock and is trying to get a
6800 	 * namespace lock. For now, we disallow this case. Holding a controller
6801 	 * read lock is allowed, but the write lock allows you to operate on all
6802 	 * namespaces anyways. In addition, this simplifies the locking logic;
6803 	 * however, this constraint may be loosened in the future.
6804 	 *
6805 	 * 5) The caller is trying to acquire a second namespace lock when they
6806 	 * already have one.
6807 	 */
6808 	mutex_enter(&nvme->n_minor_mutex);
6809 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_BLOCKED ||
6810 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_BLOCKED) {
6811 		(void) nvme_ioctl_error(&lock.nil_common,
6812 		    NVME_IOCTL_E_LOCK_PENDING, 0, 0);
6813 		mutex_exit(&nvme->n_minor_mutex);
6814 		goto copyout;
6815 	}
6816 
6817 	if ((lock.nil_ent == NVME_LOCK_E_CTRL &&
6818 	    minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) ||
6819 	    (lock.nil_ent == NVME_LOCK_E_NS &&
6820 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6821 	    minor->nm_ns_lock.nli_ns->ns_id == lock.nil_common.nioc_nsid)) {
6822 		(void) nvme_ioctl_error(&lock.nil_common,
6823 		    NVME_IOCTL_E_LOCK_ALREADY_HELD, 0, 0);
6824 		mutex_exit(&nvme->n_minor_mutex);
6825 		goto copyout;
6826 	}
6827 
6828 	if (lock.nil_ent == NVME_LOCK_E_CTRL &&
6829 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6830 		(void) nvme_ioctl_error(&lock.nil_common,
6831 		    NVME_IOCTL_E_LOCK_NO_CTRL_WITH_NS, 0, 0);
6832 		mutex_exit(&nvme->n_minor_mutex);
6833 		goto copyout;
6834 	}
6835 
6836 	if (lock.nil_ent == NVME_LOCK_E_NS &&
6837 	    (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6838 	    minor->nm_ctrl_lock.nli_curlevel == NVME_LOCK_L_WRITE)) {
6839 		(void) nvme_ioctl_error(&lock.nil_common,
6840 		    NVME_IOCTL_LOCK_NO_NS_WITH_CTRL_WRLOCK, 0, 0);
6841 		mutex_exit(&nvme->n_minor_mutex);
6842 		goto copyout;
6843 	}
6844 
6845 	if (lock.nil_ent == NVME_LOCK_E_NS &&
6846 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6847 		(void) nvme_ioctl_error(&lock.nil_common,
6848 		    NVME_IOCTL_E_LOCK_NO_2ND_NS, 0, 0);
6849 		mutex_exit(&nvme->n_minor_mutex);
6850 		goto copyout;
6851 	}
6852 
6853 
6854 #ifdef	DEBUG
6855 	/*
6856 	 * This is a big block of sanity checks to make sure that we haven't
6857 	 * allowed anything bad to happen.
6858 	 */
6859 	if (lock.nil_ent == NVME_LOCK_E_NS) {
6860 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6861 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6862 		    NVME_LOCK_STATE_UNLOCKED);
6863 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6864 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6865 
6866 		if (minor->nm_ns != NULL) {
6867 			ASSERT3U(minor->nm_ns->ns_id, ==,
6868 			    lock.nil_common.nioc_nsid);
6869 		}
6870 
6871 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6872 	} else {
6873 		ASSERT3P(minor->nm_ctrl_lock.nli_lock, ==, NULL);
6874 		ASSERT3U(minor->nm_ctrl_lock.nli_state, ==,
6875 		    NVME_LOCK_STATE_UNLOCKED);
6876 		ASSERT3U(minor->nm_ctrl_lock.nli_curlevel, ==, 0);
6877 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6878 		ASSERT0(list_link_active(&minor->nm_ctrl_lock.nli_node));
6879 
6880 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6881 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6882 		    NVME_LOCK_STATE_UNLOCKED);
6883 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6884 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6885 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6886 	}
6887 #endif	/* DEBUG */
6888 
6889 	/*
6890 	 * At this point we should actually attempt a locking operation.
6891 	 */
6892 	nvme_rwlock(minor, &lock);
6893 	mutex_exit(&nvme->n_minor_mutex);
6894 
6895 copyout:
6896 	if (ddi_copyout(&lock, (void *)(uintptr_t)arg, sizeof (lock),
6897 	    mode & FKIOCTL) != 0) {
6898 		return (EFAULT);
6899 	}
6900 
6901 	return (0);
6902 }
6903 
6904 static int
6905 nvme_ioctl_unlock(nvme_minor_t *minor, intptr_t arg, int mode,
6906     cred_t *cred_p)
6907 {
6908 	nvme_ioctl_unlock_t unlock;
6909 	nvme_t *const nvme = minor->nm_ctrl;
6910 	boolean_t is_ctrl;
6911 	nvme_lock_t *lock;
6912 	nvme_minor_lock_info_t *info;
6913 
6914 	/*
6915 	 * Note, we explicitly don't check for privileges for unlock. The idea
6916 	 * being that if you have the lock, that's what matters. If you don't
6917 	 * have the lock, it doesn't matter what privileges that you have at
6918 	 * all.
6919 	 */
6920 	if ((mode & FWRITE) == 0)
6921 		return (EBADF);
6922 
6923 	if (ddi_copyin((void *)(uintptr_t)arg, &unlock, sizeof (unlock),
6924 	    mode & FKIOCTL) != 0) {
6925 		return (EFAULT);
6926 	}
6927 
6928 	if (unlock.niu_ent != NVME_LOCK_E_CTRL &&
6929 	    unlock.niu_ent != NVME_LOCK_E_NS) {
6930 		(void) nvme_ioctl_error(&unlock.niu_common,
6931 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
6932 		goto copyout;
6933 	}
6934 
6935 	if (!nvme_ioctl_check(minor, &unlock.niu_common, &nvme_check_locking)) {
6936 		goto copyout;
6937 	}
6938 
6939 	/*
6940 	 * If we're on a namespace, confirm that we're not asking for the
6941 	 * controller.
6942 	 */
6943 	if (unlock.niu_common.nioc_nsid != 0 &&
6944 	    unlock.niu_ent == NVME_LOCK_E_CTRL) {
6945 		(void) nvme_ioctl_error(&unlock.niu_common,
6946 		    NVME_IOCTL_E_NS_CANNOT_UNLOCK_CTRL, 0, 0);
6947 		goto copyout;
6948 	}
6949 
6950 	mutex_enter(&nvme->n_minor_mutex);
6951 	if (unlock.niu_ent == NVME_LOCK_E_CTRL) {
6952 		if (minor->nm_ctrl_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
6953 			mutex_exit(&nvme->n_minor_mutex);
6954 			(void) nvme_ioctl_error(&unlock.niu_common,
6955 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6956 			goto copyout;
6957 		}
6958 	} else {
6959 		if (minor->nm_ns_lock.nli_ns == NULL) {
6960 			mutex_exit(&nvme->n_minor_mutex);
6961 			(void) nvme_ioctl_error(&unlock.niu_common,
6962 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6963 			goto copyout;
6964 		}
6965 
6966 		/*
6967 		 * Check that our unlock request corresponds to the namespace ID
6968 		 * that is currently locked. This could happen if we're using
6969 		 * the controller node and it specified a valid, but not locked,
6970 		 * namespace ID.
6971 		 */
6972 		if (minor->nm_ns_lock.nli_ns->ns_id !=
6973 		    unlock.niu_common.nioc_nsid) {
6974 			mutex_exit(&nvme->n_minor_mutex);
6975 			ASSERT3P(minor->nm_ns, ==, NULL);
6976 			(void) nvme_ioctl_error(&unlock.niu_common,
6977 			    NVME_IOCTL_E_LOCK_WRONG_NS, 0, 0);
6978 			goto copyout;
6979 		}
6980 
6981 		if (minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
6982 			mutex_exit(&nvme->n_minor_mutex);
6983 			(void) nvme_ioctl_error(&unlock.niu_common,
6984 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6985 			goto copyout;
6986 		}
6987 	}
6988 
6989 	/*
6990 	 * Finally, perform the unlock.
6991 	 */
6992 	is_ctrl = unlock.niu_ent == NVME_LOCK_E_CTRL;
6993 	if (is_ctrl) {
6994 		lock = &nvme->n_lock;
6995 		info = &minor->nm_ctrl_lock;
6996 	} else {
6997 		nvme_namespace_t *ns;
6998 		const uint32_t nsid = unlock.niu_common.nioc_nsid;
6999 
7000 		ns = nvme_nsid2ns(nvme, nsid);
7001 		lock = &ns->ns_lock;
7002 		info = &minor->nm_ns_lock;
7003 		VERIFY3P(ns, ==, info->nli_ns);
7004 	}
7005 	nvme_rwunlock(info, lock);
7006 	mutex_exit(&nvme->n_minor_mutex);
7007 	nvme_ioctl_success(&unlock.niu_common);
7008 
7009 copyout:
7010 	if (ddi_copyout(&unlock, (void *)(uintptr_t)arg, sizeof (unlock),
7011 	    mode & FKIOCTL) != 0) {
7012 		return (EFAULT);
7013 	}
7014 
7015 	return (0);
7016 }
7017 
7018 static int
7019 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
7020     int *rval_p)
7021 {
7022 #ifndef __lock_lint
7023 	_NOTE(ARGUNUSED(rval_p));
7024 #endif
7025 	nvme_minor_t *minor;
7026 	nvme_t *nvme;
7027 
7028 	minor = nvme_minor_find_by_dev(dev);
7029 	if (minor == NULL) {
7030 		return (ENXIO);
7031 	}
7032 
7033 	nvme = minor->nm_ctrl;
7034 	if (nvme == NULL)
7035 		return (ENXIO);
7036 
7037 	if (IS_DEVCTL(cmd))
7038 		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));
7039 
7040 	if (nvme->n_dead && (cmd != NVME_IOC_DETACH && cmd !=
7041 	    NVME_IOC_UNLOCK)) {
7042 		if (IS_NVME_IOC(cmd) == 0) {
7043 			return (EIO);
7044 		}
7045 
7046 		return (nvme_ioctl_copyout_error(nvme->n_dead_status, arg,
7047 		    mode));
7048 	}
7049 
7050 	/*
7051 	 * ioctls that are no longer using the original ioctl structure.
7052 	 */
7053 	switch (cmd) {
7054 	case NVME_IOC_CTRL_INFO:
7055 		return (nvme_ioctl_ctrl_info(minor, arg, mode, cred_p));
7056 	case NVME_IOC_IDENTIFY:
7057 		return (nvme_ioctl_identify(minor, arg, mode, cred_p));
7058 	case NVME_IOC_GET_LOGPAGE:
7059 		return (nvme_ioctl_get_logpage(minor, arg, mode, cred_p));
7060 	case NVME_IOC_GET_FEATURE:
7061 		return (nvme_ioctl_get_feature(minor, arg, mode, cred_p));
7062 	case NVME_IOC_DETACH:
7063 		return (nvme_ioctl_detach(minor, arg, mode, cred_p));
7064 	case NVME_IOC_ATTACH:
7065 		return (nvme_ioctl_attach(minor, arg, mode, cred_p));
7066 	case NVME_IOC_FORMAT:
7067 		return (nvme_ioctl_format(minor, arg, mode, cred_p));
7068 	case NVME_IOC_FIRMWARE_DOWNLOAD:
7069 		return (nvme_ioctl_firmware_download(minor, arg, mode,
7070 		    cred_p));
7071 	case NVME_IOC_FIRMWARE_COMMIT:
7072 		return (nvme_ioctl_firmware_commit(minor, arg, mode,
7073 		    cred_p));
7074 	case NVME_IOC_NS_INFO:
7075 		return (nvme_ioctl_ns_info(minor, arg, mode, cred_p));
7076 	case NVME_IOC_PASSTHRU:
7077 		return (nvme_ioctl_passthru(minor, arg, mode, cred_p));
7078 	case NVME_IOC_LOCK:
7079 		return (nvme_ioctl_lock(minor, arg, mode, cred_p));
7080 	case NVME_IOC_UNLOCK:
7081 		return (nvme_ioctl_unlock(minor, arg, mode, cred_p));
7082 	default:
7083 		return (ENOTTY);
7084 	}
7085 }
7086 
7087 /*
7088  * DDI UFM Callbacks
7089  */
7090 static int
7091 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
7092     ddi_ufm_image_t *img)
7093 {
7094 	nvme_t *nvme = arg;
7095 
7096 	if (imgno != 0)
7097 		return (EINVAL);
7098 
7099 	ddi_ufm_image_set_desc(img, "Firmware");
7100 	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);
7101 
7102 	return (0);
7103 }
7104 
7105 /*
7106  * Fill out firmware slot information for the requested slot.  The firmware
7107  * slot information is gathered by requesting the Firmware Slot Information log
7108  * page.  The format of the page is described in section 5.10.1.3.
7109  *
7110  * We lazily cache the log page on the first call and then invalidate the cache
7111  * data after a successful firmware download or firmware commit command.
7112  * The cached data is protected by a mutex as the state can change
7113  * asynchronous to this callback.
7114  */
7115 static int
7116 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
7117     uint_t slotno, ddi_ufm_slot_t *slot)
7118 {
7119 	nvme_t *nvme = arg;
7120 	void *log = NULL;
7121 	size_t bufsize;
7122 	ddi_ufm_attr_t attr = 0;
7123 	char fw_ver[NVME_FWVER_SZ + 1];
7124 
7125 	if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
7126 		return (EINVAL);
7127 
7128 	mutex_enter(&nvme->n_fwslot_mutex);
7129 	if (nvme->n_fwslot == NULL) {
7130 		if (!nvme_get_logpage_int(nvme, B_TRUE, &log, &bufsize,
7131 		    NVME_LOGPAGE_FWSLOT) ||
7132 		    bufsize != sizeof (nvme_fwslot_log_t)) {
7133 			if (log != NULL)
7134 				kmem_free(log, bufsize);
7135 			mutex_exit(&nvme->n_fwslot_mutex);
7136 			return (EIO);
7137 		}
7138 		nvme->n_fwslot = (nvme_fwslot_log_t *)log;
7139 	}
7140 
7141 	/*
7142 	 * NVMe numbers firmware slots starting at 1
7143 	 */
7144 	if (slotno == (nvme->n_fwslot->fw_afi - 1))
7145 		attr |= DDI_UFM_ATTR_ACTIVE;
7146 
7147 	if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
7148 		attr |= DDI_UFM_ATTR_WRITEABLE;
7149 
7150 	if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
7151 		attr |= DDI_UFM_ATTR_EMPTY;
7152 	} else {
7153 		(void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
7154 		    NVME_FWVER_SZ);
7155 		fw_ver[NVME_FWVER_SZ] = '\0';
7156 		ddi_ufm_slot_set_version(slot, fw_ver);
7157 	}
7158 	mutex_exit(&nvme->n_fwslot_mutex);
7159 
7160 	ddi_ufm_slot_set_attrs(slot, attr);
7161 
7162 	return (0);
7163 }
7164 
7165 static int
7166 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
7167 {
7168 	*caps = DDI_UFM_CAP_REPORT;
7169 	return (0);
7170 }
7171 
7172 boolean_t
7173 nvme_ctrl_atleast(nvme_t *nvme, const nvme_version_t *min)
7174 {
7175 	return (nvme_vers_atleast(&nvme->n_version, min) ? B_TRUE : B_FALSE);
7176 }
7177