xref: /illumos-gate/usr/src/uts/common/io/nvme/nvme.c (revision fc910014e8a32a65612105835a10995f2c13d942)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
14  * Copyright 2019 Unix Software Ltd.
15  * Copyright 2020 Joyent, Inc.
16  * Copyright 2020 Racktop Systems.
17  * Copyright 2024 Oxide Computer Company.
18  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
19  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
20  */
21 
22 /*
23  * blkdev driver for NVMe compliant storage devices
24  *
25  * This driver targets and is designed to support all NVMe 1.x and NVMe 2.x
26  * devices. Features are added to the driver as we encounter devices that
27  * require them and our needs, so some commands or log pages may not take
28  * advantage of newer features that devices support at this time. When you
29  * encounter such a case, it is generally fine to add that support to the driver
30  * as long as you take care to ensure that the requisite device version is met
31  * before using it.
32  *
33  * The driver has only been tested on x86 systems and will not work on big-
34  * endian systems without changes to the code accessing registers and data
35  * structures used by the hardware.
36  *
37  *
38  * Interrupt Usage:
39  *
40  * The driver will use a single interrupt while configuring the device as the
41  * specification requires, but contrary to the specification it will try to use
42  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
43  * will switch to multiple-message MSI(-X) if supported. The driver wants to
44  * have one interrupt vector per CPU, but it will work correctly if less are
45  * available. Interrupts can be shared by queues, the interrupt handler will
46  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
47  * the admin queue will share an interrupt with one I/O queue. The interrupt
48  * handler will retrieve completed commands from all queues sharing an interrupt
49  * vector and will post them to a taskq for completion processing.
50  *
51  *
52  * Command Processing:
53  *
54  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
55  * to 65536 I/O commands. The driver will configure one I/O queue pair per
56  * available interrupt vector, with the queue length usually much smaller than
57  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
58  * interrupt vectors will be used.
59  *
60  * Additionally the hardware provides a single special admin queue pair that can
61  * hold up to 4096 admin commands.
62  *
63  * From the hardware perspective both queues of a queue pair are independent,
64  * but they share some driver state: the command array (holding pointers to
65  * commands currently being processed by the hardware) and the active command
66  * counter. Access to a submission queue and the shared state is protected by
67  * nq_mutex; completion queue is protected by ncq_mutex.
68  *
69  * When a command is submitted to a queue pair the active command counter is
70  * incremented and a pointer to the command is stored in the command array. The
71  * array index is used as command identifier (CID) in the submission queue
72  * entry. Some commands may take a very long time to complete, and if the queue
73  * wraps around in that time a submission may find the next array slot to still
74  * be used by a long-running command. In this case the array is sequentially
75  * searched for the next free slot. The length of the command array is the same
76  * as the configured queue length. Queue overrun is prevented by the semaphore,
77  * so a command submission may block if the queue is full.
78  *
79  *
80  * Polled I/O Support:
81  *
82  * For kernel core dump support the driver can do polled I/O. As interrupts are
83  * turned off while dumping the driver will just submit a command in the regular
84  * way, and then repeatedly attempt a command retrieval until it gets the
85  * command back.
86  *
87  *
88  * Namespace Support:
89  *
90  * NVMe devices can have multiple namespaces, each being a independent data
91  * store. The driver supports multiple namespaces and creates a blkdev interface
92  * for each namespace found. Namespaces can have various attributes to support
93  * protection information. This driver does not support any of this and ignores
94  * namespaces that have these attributes.
95  *
96  * As of NVMe 1.1 namespaces can have an 64bit Extended Unique Identifier
97  * (EUI64), and NVMe 1.2 introduced an additional 128bit Namespace Globally
98  * Unique Identifier (NGUID). This driver uses either the NGUID or the EUI64
99  * if present to generate the devid, and passes the EUI64 to blkdev to use it
100  * in the device node names.
101  *
102  * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
103  * single controller. This is an artificial limit imposed by the driver to be
104  * able to address a reasonable number of controllers and namespaces while
105  * fitting within the constraints of MAXMIN32, aka a 32-bit device number which
106  * only has 18-bits for the minor number. See the minor node section for more
107  * information.
108  *
109  *
110  * Minor nodes:
111  *
112  * For each NVMe device the driver exposes one minor node for the controller and
113  * one minor node for each namespace. The only operations supported by those
114  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
115  * primary control interface for the devices. The character device is a private
116  * interface and we attempt stability through libnvme and more so nvmeadm.
117  *
118  * The controller minor node is much more flexible than the namespace minor node
119  * and should be preferred. The controller node allows one to target any
120  * namespace that the device has, while the namespace is limited in what it can
121  * acquire. While the namespace minor exists, it should not be relied upon and
122  * is not by libnvme.
123  *
124  * The minor number space is split in two. We use the lower part to support the
125  * controller and namespaces as described above in the 'Namespace Support'
126  * section. The second set is used for cloning opens. We set aside one million
127  * minors for this purpose. We utilize a cloning open so that way we can have
128  * per-file_t state. This is how we end up implementing and tracking locking
129  * state and related.
130  *
131  * When we have this cloned open, then we allocate a new nvme_minor_t which gets
132  * its minor number from the nvme_open_minors id_space_t and is stored in the
133  * nvme_open_minors_avl. While someone calls open on a controller or namespace
134  * minor, everything else occurs in the context of one of these ephemeral
135  * minors.
136  *
137  *
138  * ioctls, Errors, and Exclusive Access:
139  *
140  * All of the logical commands that one can issue are driven through the
141  * ioctl(9E) interface. All of our ioctls have a similar shape where they
142  * all include the 'nvme_ioctl_common_t' as their first member.
143  *
144  * This common ioctl structure is used to communicate the namespace that should
145  * be targeted. When the namespace is left as 0, then that indicates that it
146  * should target whatever the default is of the minor node. For a namespace
147  * minor, that will be transparently rewritten to the namespace's namespace id.
148  *
149  * In addition, the nvme_ioctl_common_t structure also has a standard error
150  * return. Our goal in our ioctl path is to ensure that we have useful semantic
151  * errors as much as possible. EINVAL, EIO, etc. are all overloaded. Instead as
152  * long as we can copy in our structure, then we will set a semantic error. If
153  * we have an error from the controller, then that will be included there.
154  *
155  * Each command has a specific policy that controls whether or not it is allowed
156  * on the namespace or controller minor, whether the broadcast namespace is
157  * allowed, various settings around what kind of exclusive access is allowed,
158  * and more. Each of these is wrapped up in a bit of policy described by the
159  * 'nvme_ioctl_check_t' structure.
160  *
161  * The device provides a form of exclusion in the form of both a
162  * controller-level and namespace-level read and write lock. Most operations do
163  * not require a lock (e.g. get log page, identify, etc.), but a few do (e.g.
164  * format nvm, firmware related activity, etc.). A read lock guarantees that you
165  * can complete your operation without interference, but read locks are not
166  * required. If you don't take a read lock and someone comes in with a write
167  * lock, then subsequent operations will fail with a semantic error indicating
168  * that you were blocked due to this.
169  *
170  * Here are some of the rules that govern our locks:
171  *
172  * 1. Writers starve readers. Any readers are allowed to finish when there is a
173  *    pending writer; however, all subsequent readers will be blocked upon that
174  *    writer.
175  * 2. A controller write lock takes priority over all other locks. Put
176  *    differently a controller writer not only starves subsequent controller
177  *    readers, but also all namespace read and write locks.
178  * 3. Each namespace lock is independent.
179  * 4. At most a single namespace lock may be owned.
180  * 5. If you own a namespace lock, you may not take a controller lock (to help
181  *    with lock ordering).
182  * 6. In a similar spirit, if you own a controller write lock, you may not take
183  *    any namespace lock. Someone with the controller write lock can perform any
184  *    operations that they need to. However, if you have a controller read lock
185  *    you may take any namespace lock.
186  * 7. There is no ability to upgrade a read lock to a write lock.
187  * 8. There is no recursive locking.
188  *
189  * While there's a lot there to keep track of, the goals of these are to
190  * constrain things so as to avoid deadlock. This is more complex than the
191  * original implementation in the driver which only allowed for an exclusive
192  * open that was tied to the thread. The first issue with tying this to the
193  * thread was that that didn't work well for software that utilized thread
194  * pools, like complex daemons. The second issue is that we want the ability for
195  * daemons, such as a FRU monitor, to be able to retain a file descriptor to the
196  * device without blocking others from taking action except during critical
197  * periods.
198  *
199  * In particular to enable something like libnvme, we didn't want someone to
200  * have to open and close the file descriptor to change what kind of exclusive
201  * access they desired.
202  *
203  * There are two different sets of data structures that we employ for tracking
204  * locking information:
205  *
206  * 1) The nvme_lock_t structure is contained in both the nvme_t and the
207  * nvme_namespace_t and tracks the current writer, readers, and pending writers
208  * and readers. Each of these lists or the writer pointer all refer to our
209  * second data structure.
210  *
211  * When a lock is owned by a single writer, then the nl_writer field is set to a
212  * specific minor's lock data structure. If instead readers are present, then
213  * the nl_readers list_t is not empty. An invariant of the system is that if
214  * nl_writer is non-NULL, nl_readers must be empty and conversely, if nl_readers
215  * is not empty, nl_writer must be NULL.
216  *
217  * 2) The nvme_minor_lock_info_t exists in the nvme_minor_t. There is one
218  * information structure which represents the minor's controller lock and a
219  * second one that represents the minor's namespace lock. The members of this
220  * are broken into tracking what the current lock is and what it targets. It
221  * also several members that are intended for debugging (nli_last_change,
222  * nli_acq_kthread, etc.).
223  *
224  * While the minor has two different lock information structures, our rules
225  * ensure that only one of the two can be pending and that they shouldn't result
226  * in a deadlock. When a lock is pending, the caller is sleeping on the minor's
227  * nm_cv member.
228  *
229  * These relationships are represented in the following image which shows a
230  * controller write lock being held with a pending readers on the controller
231  * lock and pending writers on one of the controller's namespaces.
232  *
233  *  +---------+
234  *  | nvme_t  |
235  *  |         |
236  *  | n_lock -|-------+
237  *  | n_ns -+ |       |                          +-----------------------------+
238  *  +-------|-+   +-----------------+            | nvme_minor_t                |
239  *          |     | nvme_lock_t     |            |                             |
240  *          |     |                 |            |  +------------------------+ |
241  *          |     | writer        --|-------------->| nvme_minor_lock_info_t | |
242  *          |     | reader list     |            |  | nm_ctrl_lock           | |
243  *          |     | pending writers |            |  +------------------------+ |
244  *          |     | pending readers |------+     |  +------------------------+ |
245  *          |     +-----------------+      |     |  | nvme_minor_lock_info_t | |
246  *          |                              |     |  | nm_ns_lock             | |
247  *          |                              |     |  +------------------------+ |
248  *          |                              |     +-----------------------------+
249  *  +------------------+                   |                 +-----------------+
250  *  | nvme_namespace_t |                   |                 | nvme_minor_t    |
251  *  |                  |                   |                 |                 |
252  *  | ns_lock ---+     |                   |                 | +-------------+ |
253  *  +------------|-----+                   +-----------------|>|nm_ctrl_lock | |
254  *               |                                           | +-------------+ |
255  *               v                                           +-----------------+
256  *     +------------------+                                         ...
257  *     | nvme_lock_t      |                                  +-----------------+
258  *     |                  |                                  | nvme_minor_t    |
259  *     | writer           |                                  |                 |
260  *     | reader list      |                                  | +-------------+ |
261  *     | pending writers -|-----------------+                | |nm_ctrl_lock | |
262  *     | pending readers  |                 |                | +-------------+ |
263  *     +------------------+                 |                +-----------------+
264  *         +-----------------------------+  |  +-----------------------------+
265  *         | nvme_minor_t                |  |  | nvme_minor_t                |
266  *         |                             |  |  |                             |
267  *         |  +------------------------+ |  |  |  +------------------------+ |
268  *         |  | nvme_minor_lock_info_t | |  |  |  | nvme_minor_lock_info_t | |
269  *         |  | nm_ctrl_lock           | |  |  |  | nm_ctrl_lock           | |
270  *         |  +------------------------+ |  |  |  +------------------------+ |
271  *         |  +------------------------+ |  v  |  +------------------------+ |
272  *         |  | nvme_minor_lock_info_t |-|-----|->| nvme_minor_lock_info_t | |
273  *         |  | nm_ns_lock             | |     |  | nm_ns_lock             | |
274  *         |  +------------------------+ |     |  +------------------------+ |
275  *         +-----------------------------+     +-----------------------------+
276  *
277  * Blkdev Interface:
278  *
279  * This driver uses blkdev to do all the heavy lifting involved with presenting
280  * a disk device to the system. As a result, the processing of I/O requests is
281  * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
282  * setup, and splitting of transfers into manageable chunks.
283  *
284  * I/O requests coming in from blkdev are turned into NVM commands and posted to
285  * an I/O queue. The queue is selected by taking the CPU id modulo the number of
286  * queues. There is currently no timeout handling of I/O commands.
287  *
288  * Blkdev also supports querying device/media information and generating a
289  * devid. The driver reports the best block size as determined by the namespace
290  * format back to blkdev as physical block size to support partition and block
291  * alignment. The devid is either based on the namespace GUID or EUI64, if
292  * present, or composed using the device vendor ID, model number, serial number,
293  * and the namespace ID.
294  *
295  *
296  * Error Handling:
297  *
298  * Error handling is currently limited to detecting fatal hardware errors,
299  * either by asynchronous events, or synchronously through command status or
300  * admin command timeouts. In case of severe errors the device is fenced off,
301  * all further requests will return EIO. FMA is then called to fault the device.
302  *
303  * The hardware has a limit for outstanding asynchronous event requests. Before
304  * this limit is known the driver assumes it is at least 1 and posts a single
305  * asynchronous request. Later when the limit is known more asynchronous event
306  * requests are posted to allow quicker reception of error information. When an
307  * asynchronous event is posted by the hardware the driver will parse the error
308  * status fields and log information or fault the device, depending on the
309  * severity of the asynchronous event. The asynchronous event request is then
310  * reused and posted to the admin queue again.
311  *
312  * On command completion the command status is checked for errors. In case of
313  * errors indicating a driver bug the driver panics. Almost all other error
314  * status values just cause EIO to be returned.
315  *
316  * Command timeouts are currently detected for all admin commands except
317  * asynchronous event requests. If a command times out and the hardware appears
318  * to be healthy the driver attempts to abort the command. The original command
319  * timeout is also applied to the abort command. If the abort times out too the
320  * driver assumes the device to be dead, fences it off, and calls FMA to retire
321  * it. In all other cases the aborted command should return immediately with a
322  * status indicating it was aborted, and the driver will wait indefinitely for
323  * that to happen. No timeout handling of normal I/O commands is presently done.
324  *
325  * Any command that times out due to the controller dropping dead will be put on
326  * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA
327  * memory being reused by the system and later be written to by a "dead" NVMe
328  * controller.
329  *
330  *
331  * Locking:
332  *
333  * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
334  * when accessing shared state and submission queue registers, ncq_mutex
335  * is held when accessing completion queue state and registers.
336  * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
337  * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
338  * mutexes themselves.
339  *
340  * Each command also has its own nc_mutex, which is associated with the
341  * condition variable nc_cv. It is only used on admin commands which are run
342  * synchronously. In that case it must be held across calls to
343  * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
344  * nvme_admin_cmd(). It must also be held whenever the completion state of the
345  * command is changed or while a admin command timeout is handled.
346  *
347  * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
348  * More than one nc_mutex may only be held when aborting commands. In this case,
349  * the nc_mutex of the command to be aborted must be held across the call to
350  * nvme_abort_cmd() to prevent the command from completing while the abort is in
351  * progress.
352  *
353  * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
354  * acquired first. More than one nq_mutex is never held by a single thread.
355  * The ncq_mutex is only held by nvme_retrieve_cmd() and
356  * nvme_process_iocq(). nvme_process_iocq() is only called from the
357  * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
358  * mutex is non-contentious but is required for implementation completeness
359  * and safety.
360  *
361  * There is one mutex n_minor_mutex which protects all open flags nm_open and
362  * exclusive-open thread pointers nm_oexcl of each minor node associated with a
363  * controller and its namespaces.
364  *
365  * In addition, there is one mutex n_mgmt_mutex which must be held whenever the
366  * driver state for any namespace is changed, especially across calls to
367  * nvme_init_ns(), nvme_attach_ns() and nvme_detach_ns(). Except when detaching
368  * nvme, it should also be held across calls that modify the blkdev handle of a
369  * namespace. Command and queue mutexes may be acquired and released while
370  * n_mgmt_mutex is held, n_minor_mutex should not.
371  *
372  *
373  * Quiesce / Fast Reboot:
374  *
375  * The driver currently does not support fast reboot. A quiesce(9E) entry point
376  * is still provided which is used to send a shutdown notification to the
377  * device.
378  *
379  *
380  * NVMe Hotplug:
381  *
382  * The driver supports hot removal. The driver uses the NDI event framework
383  * to register a callback, nvme_remove_callback, to clean up when a disk is
384  * removed. In particular, the driver will unqueue outstanding I/O commands and
385  * set n_dead on the softstate to true so that other operations, such as ioctls
386  * and command submissions, fail as well.
387  *
388  * While the callback registration relies on the NDI event framework, the
389  * removal event itself is kicked off in the PCIe hotplug framework, when the
390  * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
391  * device was removed from the slot.
392  *
393  * The NVMe driver instance itself will remain until the final close of the
394  * device.
395  *
396  *
397  * DDI UFM Support
398  *
399  * The driver supports the DDI UFM framework for reporting information about
400  * the device's firmware image and slot configuration. This data can be
401  * queried by userland software via ioctls to the ufm driver. For more
402  * information, see ddi_ufm(9E).
403  *
404  *
405  * Driver Configuration:
406  *
407  * The following driver properties can be changed to control some aspects of the
408  * drivers operation:
409  * - strict-version: can be set to 0 to allow devices conforming to newer
410  *   major versions to be used
411  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
412  *   specific command status as a fatal error leading device faulting
413  * - admin-queue-len: the maximum length of the admin queue (16-4096)
414  * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
415  * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
416  * - async-event-limit: the maximum number of asynchronous event requests to be
417  *   posted by the driver
418  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
419  *   cache
420  * - min-phys-block-size: the minimum physical block size to report to blkdev,
421  *   which is among other things the basis for ZFS vdev ashift
422  * - max-submission-queues: the maximum number of I/O submission queues.
423  * - max-completion-queues: the maximum number of I/O completion queues,
424  *   can be less than max-submission-queues, in which case the completion
425  *   queues are shared.
426  *
427  * In addition to the above properties, some device-specific tunables can be
428  * configured using the nvme-config-list global property. The value of this
429  * property is a list of triplets. The formal syntax is:
430  *
431  *   nvme-config-list ::= <triplet> [, <triplet>]* ;
432  *   <triplet>        ::= "<model>" , "<rev-list>" , "<tuple-list>"
433  *   <rev-list>       ::= [ <fwrev> [, <fwrev>]*]
434  *   <tuple-list>     ::= <tunable> [, <tunable>]*
435  *   <tunable>        ::= <name> : <value>
436  *
437  * The <model> and <fwrev> are the strings in nvme_identify_ctrl_t`id_model and
438  * nvme_identify_ctrl_t`id_fwrev, respectively. The remainder of <tuple-list>
439  * contains one or more tunables to apply to all controllers that match the
440  * specified model number and optionally firmware revision. Each <tunable> is a
441  * <name> : <value> pair.  Supported tunables are:
442  *
443  * - ignore-unknown-vendor-status:  can be set to "on" to not handle any vendor
444  *   specific command status as a fatal error leading device faulting
445  *
446  * - min-phys-block-size: the minimum physical block size to report to blkdev,
447  *   which is among other things the basis for ZFS vdev ashift
448  *
449  * - volatile-write-cache: can be set to "on" or "off" to enable or disable the
450  *   volatile write cache, if present
451  *
452  *
453  * TODO:
454  * - figure out sane default for I/O queue depth reported to blkdev
455  * - FMA handling of media errors
456  * - support for devices supporting very large I/O requests using chained PRPs
457  * - support for configuring hardware parameters like interrupt coalescing
458  * - support for media formatting and hard partitioning into namespaces
459  * - support for big-endian systems
460  * - support for fast reboot
461  * - support for NVMe Subsystem Reset (1.1)
462  * - support for Scatter/Gather lists (1.1)
463  * - support for Reservations (1.1)
464  * - support for power management
465  */
466 
467 #include <sys/byteorder.h>
468 #ifdef _BIG_ENDIAN
469 #error nvme driver needs porting for big-endian platforms
470 #endif
471 
472 #include <sys/modctl.h>
473 #include <sys/conf.h>
474 #include <sys/devops.h>
475 #include <sys/ddi.h>
476 #include <sys/ddi_ufm.h>
477 #include <sys/sunddi.h>
478 #include <sys/sunndi.h>
479 #include <sys/bitmap.h>
480 #include <sys/sysmacros.h>
481 #include <sys/param.h>
482 #include <sys/varargs.h>
483 #include <sys/cpuvar.h>
484 #include <sys/disp.h>
485 #include <sys/blkdev.h>
486 #include <sys/atomic.h>
487 #include <sys/archsystm.h>
488 #include <sys/sata/sata_hba.h>
489 #include <sys/stat.h>
490 #include <sys/policy.h>
491 #include <sys/list.h>
492 #include <sys/dkio.h>
493 #include <sys/pci.h>
494 #include <sys/mkdev.h>
495 
496 #include <sys/nvme.h>
497 
498 #ifdef __x86
499 #include <sys/x86_archext.h>
500 #endif
501 
502 #include "nvme_reg.h"
503 #include "nvme_var.h"
504 
505 /*
506  * Assertions to make sure that we've properly captured various aspects of the
507  * packed structures and haven't broken them during updates.
508  */
509 CTASSERT(sizeof (nvme_identify_ctrl_t) == NVME_IDENTIFY_BUFSIZE);
510 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
511 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
512 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
513 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
514 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
515 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
516 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
517 
518 CTASSERT(sizeof (nvme_identify_nsid_t) == NVME_IDENTIFY_BUFSIZE);
519 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
520 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
521 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
522 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
523 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
524 
525 CTASSERT(sizeof (nvme_identify_nsid_list_t) == NVME_IDENTIFY_BUFSIZE);
526 CTASSERT(sizeof (nvme_identify_ctrl_list_t) == NVME_IDENTIFY_BUFSIZE);
527 
528 CTASSERT(sizeof (nvme_identify_primary_caps_t) == NVME_IDENTIFY_BUFSIZE);
529 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
530 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
531 
532 CTASSERT(sizeof (nvme_nschange_list_t) == 4096);
533 
534 
535 /* NVMe spec version supported */
536 static const int nvme_version_major = 2;
537 
538 /* tunable for admin command timeout in seconds, default is 1s */
539 uint32_t nvme_admin_cmd_timeout = 1;
540 
541 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
542 uint32_t nvme_format_cmd_timeout = 600;
543 
544 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
545 uint32_t nvme_commit_save_cmd_timeout = 15;
546 
547 /*
548  * tunable for the size of arbitrary vendor specific admin commands,
549  * default is 16MiB.
550  */
551 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;
552 
553 /*
554  * tunable for the max timeout of arbitary vendor specific admin commands,
555  * default is 60s.
556  */
557 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;
558 
559 /*
560  * This ID space, AVL, and lock are used for keeping track of minor state across
561  * opens between different devices.
562  */
563 static id_space_t *nvme_open_minors;
564 static avl_tree_t nvme_open_minors_avl;
565 kmutex_t nvme_open_minors_mutex;
566 
567 /*
568  * Removal taskq used for n_dead callback processing.
569  */
570 taskq_t *nvme_dead_taskq;
571 
572 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
573 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
574 static int nvme_quiesce(dev_info_t *);
575 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
576 static int nvme_setup_interrupts(nvme_t *, int, int);
577 static void nvme_release_interrupts(nvme_t *);
578 static uint_t nvme_intr(caddr_t, caddr_t);
579 
580 static void nvme_shutdown(nvme_t *, boolean_t);
581 static boolean_t nvme_reset(nvme_t *, boolean_t);
582 static int nvme_init(nvme_t *);
583 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
584 static void nvme_free_cmd(nvme_cmd_t *);
585 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
586     bd_xfer_t *);
587 static void nvme_admin_cmd(nvme_cmd_t *, uint32_t);
588 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
589 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
590 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
591 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
592 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
593 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
594 static void nvme_wakeup_cmd(void *);
595 static void nvme_async_event_task(void *);
596 
597 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
598 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
599 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
600 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
601 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
602 static inline int nvme_check_cmd_status(nvme_cmd_t *);
603 static boolean_t nvme_check_cmd_status_ioctl(nvme_cmd_t *,
604     nvme_ioctl_common_t *);
605 
606 static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
607 static void nvme_async_event(nvme_t *);
608 static boolean_t nvme_format_nvm(nvme_t *, nvme_ioctl_format_t *);
609 static boolean_t nvme_get_logpage_int(nvme_t *, boolean_t, void **, size_t *,
610     uint8_t);
611 static boolean_t nvme_identify(nvme_t *, boolean_t, nvme_ioctl_identify_t *,
612     void **);
613 static boolean_t nvme_identify_int(nvme_t *, uint32_t, uint8_t, void **);
614 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
615     uint32_t *);
616 static int nvme_write_cache_set(nvme_t *, boolean_t);
617 static int nvme_set_nqueues(nvme_t *);
618 
619 static void nvme_free_dma(nvme_dma_t *);
620 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
621     nvme_dma_t **);
622 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
623     nvme_dma_t **);
624 static void nvme_free_qpair(nvme_qpair_t *);
625 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
626 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
627 
628 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
629 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
630 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
631 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
632 
633 static boolean_t nvme_check_regs_hdl(nvme_t *);
634 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
635 
636 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);
637 
638 static void nvme_bd_xfer_done(void *);
639 static void nvme_bd_driveinfo(void *, bd_drive_t *);
640 static int nvme_bd_mediainfo(void *, bd_media_t *);
641 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
642 static int nvme_bd_read(void *, bd_xfer_t *);
643 static int nvme_bd_write(void *, bd_xfer_t *);
644 static int nvme_bd_sync(void *, bd_xfer_t *);
645 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
646 static int nvme_bd_free_space(void *, bd_xfer_t *);
647 
648 static int nvme_prp_dma_constructor(void *, void *, int);
649 static void nvme_prp_dma_destructor(void *, void *);
650 
651 static void nvme_prepare_devid(nvme_t *, uint32_t);
652 
653 /* DDI UFM callbacks */
654 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
655     ddi_ufm_image_t *);
656 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
657     ddi_ufm_slot_t *);
658 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
659 
660 static int nvme_open(dev_t *, int, int, cred_t *);
661 static int nvme_close(dev_t, int, int, cred_t *);
662 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
663 
664 static int nvme_init_ns(nvme_t *, uint32_t);
665 static boolean_t nvme_attach_ns(nvme_t *, nvme_ioctl_common_t *);
666 static boolean_t nvme_detach_ns(nvme_t *, nvme_ioctl_common_t *);
667 
668 static int nvme_minor_comparator(const void *, const void *);
669 
670 static ddi_ufm_ops_t nvme_ufm_ops = {
671 	NULL,
672 	nvme_ufm_fill_image,
673 	nvme_ufm_fill_slot,
674 	nvme_ufm_getcaps
675 };
676 
677 /*
678  * Minor numbers are split amongst those used for controllers and for device
679  * opens. The number of controller minors are limited based open MAXMIN32 per
680  * the theory statement. We allocate 1 million minors as a total guess at a
681  * number that'll probably be enough. The starting point of the open minors can
682  * be shifted to accommodate future expansion of the NVMe device minors.
683  */
684 #define	NVME_MINOR_INST_SHIFT	9
685 #define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
686 #define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
687 #define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
688 #define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
689 #define	NVME_IS_VENDOR_SPECIFIC_CMD(x)	(((x) >= 0xC0) && ((x) <= 0xFF))
690 
691 #define	NVME_OPEN_NMINORS		(1024 * 1024)
692 #define	NVME_OPEN_MINOR_MIN		(MAXMIN32 + 1)
693 #define	NVME_OPEN_MINOR_MAX_EXCL	(NVME_OPEN_MINOR_MIN + \
694     NVME_OPEN_NMINORS)
695 
696 static void *nvme_state;
697 static kmem_cache_t *nvme_cmd_cache;
698 
699 /*
700  * DMA attributes for queue DMA memory
701  *
702  * Queue DMA memory must be page aligned. The maximum length of a queue is
703  * 65536 entries, and an entry can be 64 bytes long.
704  */
705 static const ddi_dma_attr_t nvme_queue_dma_attr = {
706 	.dma_attr_version	= DMA_ATTR_V0,
707 	.dma_attr_addr_lo	= 0,
708 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
709 	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
710 	.dma_attr_align		= 0x1000,
711 	.dma_attr_burstsizes	= 0x7ff,
712 	.dma_attr_minxfer	= 0x1000,
713 	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
714 	.dma_attr_seg		= 0xffffffffffffffffULL,
715 	.dma_attr_sgllen	= 1,
716 	.dma_attr_granular	= 1,
717 	.dma_attr_flags		= 0,
718 };
719 
720 /*
721  * DMA attributes for transfers using Physical Region Page (PRP) entries
722  *
723  * A PRP entry describes one page of DMA memory using the page size specified
724  * in the controller configuration's memory page size register (CC.MPS). It uses
725  * a 64bit base address aligned to this page size. There is no limitation on
726  * chaining PRPs together for arbitrarily large DMA transfers. These DMA
727  * attributes will be copied into the nvme_t during nvme_attach() and the
728  * dma_attr_maxxfer will be updated.
729  */
730 static const ddi_dma_attr_t nvme_prp_dma_attr = {
731 	.dma_attr_version	= DMA_ATTR_V0,
732 	.dma_attr_addr_lo	= 0,
733 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
734 	.dma_attr_count_max	= 0xfff,
735 	.dma_attr_align		= 0x1000,
736 	.dma_attr_burstsizes	= 0x7ff,
737 	.dma_attr_minxfer	= 0x1000,
738 	.dma_attr_maxxfer	= 0x1000,
739 	.dma_attr_seg		= 0xfff,
740 	.dma_attr_sgllen	= -1,
741 	.dma_attr_granular	= 1,
742 	.dma_attr_flags		= 0,
743 };
744 
745 /*
746  * DMA attributes for transfers using scatter/gather lists
747  *
748  * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
749  * 32bit length field. SGL Segment and SGL Last Segment entries require the
750  * length to be a multiple of 16 bytes. While the SGL DMA attributes are copied
751  * into the nvme_t, they are not currently used for any I/O.
752  */
753 static const ddi_dma_attr_t nvme_sgl_dma_attr = {
754 	.dma_attr_version	= DMA_ATTR_V0,
755 	.dma_attr_addr_lo	= 0,
756 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
757 	.dma_attr_count_max	= 0xffffffffUL,
758 	.dma_attr_align		= 1,
759 	.dma_attr_burstsizes	= 0x7ff,
760 	.dma_attr_minxfer	= 0x10,
761 	.dma_attr_maxxfer	= 0xfffffffffULL,
762 	.dma_attr_seg		= 0xffffffffffffffffULL,
763 	.dma_attr_sgllen	= -1,
764 	.dma_attr_granular	= 0x10,
765 	.dma_attr_flags		= 0
766 };
767 
768 static ddi_device_acc_attr_t nvme_reg_acc_attr = {
769 	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
770 	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
771 	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
772 };
773 
774 /*
775  * ioctl validation policies. These are policies that determine which namespaces
776  * are allowed or disallowed for various operations. Note, all policy items
777  * should be explicitly listed here to help make it clear what our intent is.
778  * That is also why some of these are identical or repeated when they cover
779  * different ioctls.
780  */
781 
782 /*
783  * The controller information ioctl generally contains read-only information
784  * about the controller that is sourced from multiple different pieces of
785  * information. This does not operate on a namespace and none are accepted.
786  */
787 static const nvme_ioctl_check_t nvme_check_ctrl_info = {
788 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
789 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
790 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
791 };
792 
793 /*
794  * The kernel namespace information requires a namespace ID to be specified. It
795  * does not allow for the broadcast ID to be specified.
796  */
797 static const nvme_ioctl_check_t nvme_check_ns_info = {
798 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
799 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
800 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_NONE
801 };
802 
803 /*
804  * Identify commands are allowed to operate on a namespace minor. Unfortunately,
805  * the namespace field in identify commands is a bit, weird. In particular, some
806  * commands need a valid namespace, while others are namespace listing
807  * operations, which means illegal namespaces like zero are allowed.
808  */
809 static const nvme_ioctl_check_t nvme_check_identify = {
810 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
811 	.nck_skip_ctrl = B_TRUE, .nck_ctrl_rewrite = B_FALSE,
812 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
813 };
814 
815 /*
816  * The get log page command requires the ability to specify namespaces. When
817  * targeting the controller, one must use the broadcast NSID.
818  */
819 static const nvme_ioctl_check_t nvme_check_get_logpage = {
820 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
821 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
822 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
823 };
824 
825 /*
826  * When getting a feature, we do not want rewriting behavior as most features do
827  * not require a namespace to be specified. Specific instances are checked in
828  * nvme_validate_get_feature().
829  */
830 static const nvme_ioctl_check_t nvme_check_get_feature = {
831 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
832 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
833 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
834 };
835 
836 /*
837  * Format commands must target a namespace. The broadcast namespace must be used
838  * when referring to the controller.
839  */
840 static const nvme_ioctl_check_t nvme_check_format = {
841 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
842 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
843 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_WRITE
844 };
845 
846 /*
847  * Attach and detach must always target a minor. However, the broadcast
848  * namespace is not allowed. We still perform rewriting so that way specifying
849  * the controller node with 0 will be caught.
850  */
851 static const nvme_ioctl_check_t nvme_check_attach_detach = {
852 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
853 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_TRUE,
854 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
855 };
856 
857 /*
858  * Firmware operations must not target a namespace and are only allowed from the
859  * controller.
860  */
861 static const nvme_ioctl_check_t nvme_check_firmware = {
862 	.nck_ns_ok = B_FALSE, .nck_ns_minor_ok = B_FALSE,
863 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
864 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_WRITE
865 };
866 
867 /*
868  * Passthru commands are an odd set. We only allow them from the primary
869  * controller; however, we allow a namespace to be specified in them and allow
870  * the broadcast namespace. We do not perform rewriting because we don't know
871  * what the semantics are. We explicitly exempt passthru commands from needing
872  * an exclusive lock and leave it up to them to tell us the impact of the
873  * command and semantics. As this is a privileged interface and the semantics
874  * are arbitrary, there's not much we can do without some assistance from the
875  * consumer.
876  */
877 static const nvme_ioctl_check_t nvme_check_passthru = {
878 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_FALSE,
879 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
880 	.nck_bcast_ok = B_TRUE, .nck_excl = NVME_IOCTL_EXCL_NONE
881 };
882 
883 /*
884  * Lock operations are allowed to target a namespace, but must not be rewritten.
885  * There is no support for the broadcast namespace. This is the only ioctl that
886  * should skip exclusive checking as it's used to grant it.
887  */
888 static const nvme_ioctl_check_t nvme_check_locking = {
889 	.nck_ns_ok = B_TRUE, .nck_ns_minor_ok = B_TRUE,
890 	.nck_skip_ctrl = B_FALSE, .nck_ctrl_rewrite = B_FALSE,
891 	.nck_bcast_ok = B_FALSE, .nck_excl = NVME_IOCTL_EXCL_SKIP
892 };
893 
894 static struct cb_ops nvme_cb_ops = {
895 	.cb_open	= nvme_open,
896 	.cb_close	= nvme_close,
897 	.cb_strategy	= nodev,
898 	.cb_print	= nodev,
899 	.cb_dump	= nodev,
900 	.cb_read	= nodev,
901 	.cb_write	= nodev,
902 	.cb_ioctl	= nvme_ioctl,
903 	.cb_devmap	= nodev,
904 	.cb_mmap	= nodev,
905 	.cb_segmap	= nodev,
906 	.cb_chpoll	= nochpoll,
907 	.cb_prop_op	= ddi_prop_op,
908 	.cb_str		= 0,
909 	.cb_flag	= D_NEW | D_MP,
910 	.cb_rev		= CB_REV,
911 	.cb_aread	= nodev,
912 	.cb_awrite	= nodev
913 };
914 
915 static struct dev_ops nvme_dev_ops = {
916 	.devo_rev	= DEVO_REV,
917 	.devo_refcnt	= 0,
918 	.devo_getinfo	= ddi_no_info,
919 	.devo_identify	= nulldev,
920 	.devo_probe	= nulldev,
921 	.devo_attach	= nvme_attach,
922 	.devo_detach	= nvme_detach,
923 	.devo_reset	= nodev,
924 	.devo_cb_ops	= &nvme_cb_ops,
925 	.devo_bus_ops	= NULL,
926 	.devo_power	= NULL,
927 	.devo_quiesce	= nvme_quiesce,
928 };
929 
930 static struct modldrv nvme_modldrv = {
931 	.drv_modops	= &mod_driverops,
932 	.drv_linkinfo	= "NVMe driver",
933 	.drv_dev_ops	= &nvme_dev_ops
934 };
935 
936 static struct modlinkage nvme_modlinkage = {
937 	.ml_rev		= MODREV_1,
938 	.ml_linkage	= { &nvme_modldrv, NULL }
939 };
940 
941 static bd_ops_t nvme_bd_ops = {
942 	.o_version	= BD_OPS_CURRENT_VERSION,
943 	.o_drive_info	= nvme_bd_driveinfo,
944 	.o_media_info	= nvme_bd_mediainfo,
945 	.o_devid_init	= nvme_bd_devid,
946 	.o_sync_cache	= nvme_bd_sync,
947 	.o_read		= nvme_bd_read,
948 	.o_write	= nvme_bd_write,
949 	.o_free_space	= nvme_bd_free_space,
950 };
951 
952 /*
953  * This list will hold commands that have timed out and couldn't be aborted.
954  * As we don't know what the hardware may still do with the DMA memory we can't
955  * free them, so we'll keep them forever on this list where we can easily look
956  * at them with mdb.
957  */
958 static struct list nvme_lost_cmds;
959 static kmutex_t nvme_lc_mutex;
960 
961 int
962 _init(void)
963 {
964 	int error;
965 
966 	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
967 	if (error != DDI_SUCCESS)
968 		return (error);
969 
970 	if ((nvme_open_minors = id_space_create("nvme_open_minors",
971 	    NVME_OPEN_MINOR_MIN, NVME_OPEN_MINOR_MAX_EXCL)) == NULL) {
972 		ddi_soft_state_fini(&nvme_state);
973 		return (ENOMEM);
974 	}
975 
976 	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
977 	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
978 
979 	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
980 	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
981 	    offsetof(nvme_cmd_t, nc_list));
982 
983 	mutex_init(&nvme_open_minors_mutex, NULL, MUTEX_DRIVER, NULL);
984 	avl_create(&nvme_open_minors_avl, nvme_minor_comparator,
985 	    sizeof (nvme_minor_t), offsetof(nvme_minor_t, nm_avl));
986 
987 	nvme_dead_taskq = taskq_create("nvme_dead_taskq", 1, minclsyspri, 1, 1,
988 	    TASKQ_PREPOPULATE);
989 
990 	bd_mod_init(&nvme_dev_ops);
991 
992 	error = mod_install(&nvme_modlinkage);
993 	if (error != DDI_SUCCESS) {
994 		ddi_soft_state_fini(&nvme_state);
995 		id_space_destroy(nvme_open_minors);
996 		mutex_destroy(&nvme_lc_mutex);
997 		list_destroy(&nvme_lost_cmds);
998 		bd_mod_fini(&nvme_dev_ops);
999 		mutex_destroy(&nvme_open_minors_mutex);
1000 		avl_destroy(&nvme_open_minors_avl);
1001 		taskq_destroy(nvme_dead_taskq);
1002 	}
1003 
1004 	return (error);
1005 }
1006 
1007 int
1008 _fini(void)
1009 {
1010 	int error;
1011 
1012 	if (!list_is_empty(&nvme_lost_cmds))
1013 		return (DDI_FAILURE);
1014 
1015 	error = mod_remove(&nvme_modlinkage);
1016 	if (error == DDI_SUCCESS) {
1017 		ddi_soft_state_fini(&nvme_state);
1018 		id_space_destroy(nvme_open_minors);
1019 		kmem_cache_destroy(nvme_cmd_cache);
1020 		mutex_destroy(&nvme_lc_mutex);
1021 		list_destroy(&nvme_lost_cmds);
1022 		bd_mod_fini(&nvme_dev_ops);
1023 		mutex_destroy(&nvme_open_minors_mutex);
1024 		avl_destroy(&nvme_open_minors_avl);
1025 		taskq_destroy(nvme_dead_taskq);
1026 	}
1027 
1028 	return (error);
1029 }
1030 
1031 int
1032 _info(struct modinfo *modinfop)
1033 {
1034 	return (mod_info(&nvme_modlinkage, modinfop));
1035 }
1036 
1037 static inline void
1038 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
1039 {
1040 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1041 
1042 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1043 	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
1044 }
1045 
1046 static inline void
1047 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
1048 {
1049 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1050 
1051 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1052 	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
1053 }
1054 
1055 static inline uint64_t
1056 nvme_get64(nvme_t *nvme, uintptr_t reg)
1057 {
1058 	uint64_t val;
1059 
1060 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
1061 
1062 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1063 	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
1064 
1065 	return (val);
1066 }
1067 
1068 static inline uint32_t
1069 nvme_get32(nvme_t *nvme, uintptr_t reg)
1070 {
1071 	uint32_t val;
1072 
1073 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
1074 
1075 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
1076 	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
1077 
1078 	return (val);
1079 }
1080 
1081 /*
1082  * This is a central clearing house for marking an NVMe controller dead and/or
1083  * removed. This takes care of setting the flag, taking care of outstanding
1084  * blocked locks, and sending a DDI FMA impact. This is called from a precarious
1085  * place where locking is suspect. The only guarantee we have is that the nvme_t
1086  * is valid and won't disappear until we return.
1087  *
1088  * This should only be used after attach has been called.
1089  */
1090 static void
1091 nvme_ctrl_mark_dead(nvme_t *nvme, boolean_t removed)
1092 {
1093 	boolean_t was_dead;
1094 
1095 	/*
1096 	 * See if we win the race to set things up here. If someone beat us to
1097 	 * it, we do not do anything.
1098 	 */
1099 	was_dead = atomic_cas_32((volatile uint32_t *)&nvme->n_dead, B_FALSE,
1100 	    B_TRUE);
1101 	if (was_dead) {
1102 		return;
1103 	}
1104 
1105 	/*
1106 	 * If this was removed, there is no reason to change the service impact.
1107 	 * However, then we need to change our default return code that we use
1108 	 * here to indicate that it was gone versus that it is dead.
1109 	 */
1110 	if (removed) {
1111 		nvme->n_dead_status = NVME_IOCTL_E_CTRL_GONE;
1112 	} else {
1113 		ASSERT3U(nvme->n_dead_status, ==, NVME_IOCTL_E_CTRL_DEAD);
1114 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1115 	}
1116 
1117 	taskq_dispatch_ent(nvme_dead_taskq, nvme_rwlock_ctrl_dead, nvme,
1118 	    TQ_NOSLEEP, &nvme->n_dead_tqent);
1119 }
1120 
1121 static boolean_t
1122 nvme_check_regs_hdl(nvme_t *nvme)
1123 {
1124 	ddi_fm_error_t error;
1125 
1126 	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
1127 
1128 	if (error.fme_status != DDI_FM_OK)
1129 		return (B_TRUE);
1130 
1131 	return (B_FALSE);
1132 }
1133 
1134 static boolean_t
1135 nvme_check_dma_hdl(nvme_dma_t *dma)
1136 {
1137 	ddi_fm_error_t error;
1138 
1139 	if (dma == NULL)
1140 		return (B_FALSE);
1141 
1142 	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
1143 
1144 	if (error.fme_status != DDI_FM_OK)
1145 		return (B_TRUE);
1146 
1147 	return (B_FALSE);
1148 }
1149 
1150 static void
1151 nvme_free_dma_common(nvme_dma_t *dma)
1152 {
1153 	if (dma->nd_dmah != NULL)
1154 		(void) ddi_dma_unbind_handle(dma->nd_dmah);
1155 	if (dma->nd_acch != NULL)
1156 		ddi_dma_mem_free(&dma->nd_acch);
1157 	if (dma->nd_dmah != NULL)
1158 		ddi_dma_free_handle(&dma->nd_dmah);
1159 }
1160 
1161 static void
1162 nvme_free_dma(nvme_dma_t *dma)
1163 {
1164 	nvme_free_dma_common(dma);
1165 	kmem_free(dma, sizeof (*dma));
1166 }
1167 
1168 /* ARGSUSED */
1169 static void
1170 nvme_prp_dma_destructor(void *buf, void *private)
1171 {
1172 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1173 
1174 	nvme_free_dma_common(dma);
1175 }
1176 
1177 static int
1178 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
1179     size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
1180 {
1181 	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
1182 	    &dma->nd_dmah) != DDI_SUCCESS) {
1183 		/*
1184 		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
1185 		 * the only other possible error is DDI_DMA_BADATTR which
1186 		 * indicates a driver bug which should cause a panic.
1187 		 */
1188 		dev_err(nvme->n_dip, CE_PANIC,
1189 		    "!failed to get DMA handle, check DMA attributes");
1190 		return (DDI_FAILURE);
1191 	}
1192 
1193 	/*
1194 	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
1195 	 * or the flags are conflicting, which isn't the case here.
1196 	 */
1197 	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
1198 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
1199 	    &dma->nd_len, &dma->nd_acch);
1200 
1201 	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
1202 	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
1203 	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
1204 		dev_err(nvme->n_dip, CE_WARN,
1205 		    "!failed to bind DMA memory");
1206 		atomic_inc_32(&nvme->n_dma_bind_err);
1207 		nvme_free_dma_common(dma);
1208 		return (DDI_FAILURE);
1209 	}
1210 
1211 	return (DDI_SUCCESS);
1212 }
1213 
1214 static int
1215 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
1216     ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
1217 {
1218 	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
1219 
1220 	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
1221 	    DDI_SUCCESS) {
1222 		*ret = NULL;
1223 		kmem_free(dma, sizeof (nvme_dma_t));
1224 		return (DDI_FAILURE);
1225 	}
1226 
1227 	bzero(dma->nd_memp, dma->nd_len);
1228 
1229 	*ret = dma;
1230 	return (DDI_SUCCESS);
1231 }
1232 
1233 /* ARGSUSED */
1234 static int
1235 nvme_prp_dma_constructor(void *buf, void *private, int flags)
1236 {
1237 	nvme_dma_t *dma = (nvme_dma_t *)buf;
1238 	nvme_t *nvme = (nvme_t *)private;
1239 
1240 	dma->nd_dmah = NULL;
1241 	dma->nd_acch = NULL;
1242 
1243 	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
1244 	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
1245 		return (-1);
1246 	}
1247 
1248 	ASSERT(dma->nd_ncookie == 1);
1249 
1250 	dma->nd_cached = B_TRUE;
1251 
1252 	return (0);
1253 }
1254 
1255 static int
1256 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
1257     uint_t flags, nvme_dma_t **dma)
1258 {
1259 	uint32_t len = nentry * qe_len;
1260 	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
1261 
1262 	len = roundup(len, nvme->n_pagesize);
1263 
1264 	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
1265 	    != DDI_SUCCESS) {
1266 		dev_err(nvme->n_dip, CE_WARN,
1267 		    "!failed to get DMA memory for queue");
1268 		goto fail;
1269 	}
1270 
1271 	if ((*dma)->nd_ncookie != 1) {
1272 		dev_err(nvme->n_dip, CE_WARN,
1273 		    "!got too many cookies for queue DMA");
1274 		goto fail;
1275 	}
1276 
1277 	return (DDI_SUCCESS);
1278 
1279 fail:
1280 	if (*dma) {
1281 		nvme_free_dma(*dma);
1282 		*dma = NULL;
1283 	}
1284 
1285 	return (DDI_FAILURE);
1286 }
1287 
1288 static void
1289 nvme_free_cq(nvme_cq_t *cq)
1290 {
1291 	mutex_destroy(&cq->ncq_mutex);
1292 
1293 	if (cq->ncq_cmd_taskq != NULL)
1294 		taskq_destroy(cq->ncq_cmd_taskq);
1295 
1296 	if (cq->ncq_dma != NULL)
1297 		nvme_free_dma(cq->ncq_dma);
1298 
1299 	kmem_free(cq, sizeof (*cq));
1300 }
1301 
1302 static void
1303 nvme_free_qpair(nvme_qpair_t *qp)
1304 {
1305 	int i;
1306 
1307 	mutex_destroy(&qp->nq_mutex);
1308 	sema_destroy(&qp->nq_sema);
1309 
1310 	if (qp->nq_sqdma != NULL)
1311 		nvme_free_dma(qp->nq_sqdma);
1312 
1313 	if (qp->nq_active_cmds > 0)
1314 		for (i = 0; i != qp->nq_nentry; i++)
1315 			if (qp->nq_cmd[i] != NULL)
1316 				nvme_free_cmd(qp->nq_cmd[i]);
1317 
1318 	if (qp->nq_cmd != NULL)
1319 		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
1320 
1321 	kmem_free(qp, sizeof (nvme_qpair_t));
1322 }
1323 
1324 /*
1325  * Destroy the pre-allocated cq array, but only free individual completion
1326  * queues from the given starting index.
1327  */
1328 static void
1329 nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
1330 {
1331 	uint_t i;
1332 
1333 	for (i = start; i < nvme->n_cq_count; i++)
1334 		if (nvme->n_cq[i] != NULL)
1335 			nvme_free_cq(nvme->n_cq[i]);
1336 
1337 	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
1338 }
1339 
1340 static int
1341 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
1342     uint_t nthr)
1343 {
1344 	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
1345 	char name[64];		/* large enough for the taskq name */
1346 
1347 	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
1348 	    DDI_INTR_PRI(nvme->n_intr_pri));
1349 
1350 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
1351 	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
1352 		goto fail;
1353 
1354 	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
1355 	cq->ncq_nentry = nentry;
1356 	cq->ncq_id = idx;
1357 	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
1358 
1359 	/*
1360 	 * Each completion queue has its own command taskq.
1361 	 */
1362 	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
1363 	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
1364 
1365 	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
1366 	    TASKQ_PREPOPULATE);
1367 
1368 	if (cq->ncq_cmd_taskq == NULL) {
1369 		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
1370 		    "taskq for cq %u", idx);
1371 		goto fail;
1372 	}
1373 
1374 	*cqp = cq;
1375 	return (DDI_SUCCESS);
1376 
1377 fail:
1378 	nvme_free_cq(cq);
1379 	*cqp = NULL;
1380 
1381 	return (DDI_FAILURE);
1382 }
1383 
1384 /*
1385  * Create the n_cq array big enough to hold "ncq" completion queues.
1386  * If the array already exists it will be re-sized (but only larger).
1387  * The admin queue is included in this array, which boosts the
1388  * max number of entries to UINT16_MAX + 1.
1389  */
1390 static int
1391 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
1392 {
1393 	nvme_cq_t **cq;
1394 	uint_t i, cq_count;
1395 
1396 	ASSERT3U(ncq, >, nvme->n_cq_count);
1397 
1398 	cq = nvme->n_cq;
1399 	cq_count = nvme->n_cq_count;
1400 
1401 	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
1402 	nvme->n_cq_count = ncq;
1403 
1404 	for (i = 0; i < cq_count; i++)
1405 		nvme->n_cq[i] = cq[i];
1406 
1407 	for (; i < nvme->n_cq_count; i++)
1408 		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
1409 		    DDI_SUCCESS)
1410 			goto fail;
1411 
1412 	if (cq != NULL)
1413 		kmem_free(cq, sizeof (*cq) * cq_count);
1414 
1415 	return (DDI_SUCCESS);
1416 
1417 fail:
1418 	nvme_destroy_cq_array(nvme, cq_count);
1419 	/*
1420 	 * Restore the original array
1421 	 */
1422 	nvme->n_cq_count = cq_count;
1423 	nvme->n_cq = cq;
1424 
1425 	return (DDI_FAILURE);
1426 }
1427 
1428 static int
1429 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
1430     uint_t idx)
1431 {
1432 	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
1433 	uint_t cq_idx;
1434 
1435 	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
1436 	    DDI_INTR_PRI(nvme->n_intr_pri));
1437 
1438 	/*
1439 	 * The NVMe spec defines that a full queue has one empty (unused) slot;
1440 	 * initialize the semaphore accordingly.
1441 	 */
1442 	sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
1443 
1444 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
1445 	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
1446 		goto fail;
1447 
1448 	/*
1449 	 * idx == 0 is adminq, those above 0 are shared io completion queues.
1450 	 */
1451 	cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
1452 	qp->nq_cq = nvme->n_cq[cq_idx];
1453 	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1454 	qp->nq_nentry = nentry;
1455 
1456 	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1457 
1458 	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1459 	qp->nq_next_cmd = 0;
1460 
1461 	*nqp = qp;
1462 	return (DDI_SUCCESS);
1463 
1464 fail:
1465 	nvme_free_qpair(qp);
1466 	*nqp = NULL;
1467 
1468 	return (DDI_FAILURE);
1469 }
1470 
1471 static nvme_cmd_t *
1472 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1473 {
1474 	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1475 
1476 	if (cmd == NULL)
1477 		return (cmd);
1478 
1479 	bzero(cmd, sizeof (nvme_cmd_t));
1480 
1481 	cmd->nc_nvme = nvme;
1482 
1483 	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1484 	    DDI_INTR_PRI(nvme->n_intr_pri));
1485 	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1486 
1487 	return (cmd);
1488 }
1489 
1490 static void
1491 nvme_free_cmd(nvme_cmd_t *cmd)
1492 {
1493 	/* Don't free commands on the lost commands list. */
1494 	if (list_link_active(&cmd->nc_list))
1495 		return;
1496 
1497 	if (cmd->nc_dma) {
1498 		nvme_free_dma(cmd->nc_dma);
1499 		cmd->nc_dma = NULL;
1500 	}
1501 
1502 	if (cmd->nc_prp) {
1503 		kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
1504 		cmd->nc_prp = NULL;
1505 	}
1506 
1507 	cv_destroy(&cmd->nc_cv);
1508 	mutex_destroy(&cmd->nc_mutex);
1509 
1510 	kmem_cache_free(nvme_cmd_cache, cmd);
1511 }
1512 
1513 static void
1514 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1515 {
1516 	sema_p(&qp->nq_sema);
1517 	nvme_submit_cmd_common(qp, cmd);
1518 }
1519 
1520 static int
1521 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1522 {
1523 	if (cmd->nc_nvme->n_dead) {
1524 		return (EIO);
1525 	}
1526 
1527 	if (sema_tryp(&qp->nq_sema) == 0)
1528 		return (EAGAIN);
1529 
1530 	nvme_submit_cmd_common(qp, cmd);
1531 	return (0);
1532 }
1533 
1534 static void
1535 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1536 {
1537 	nvme_reg_sqtdbl_t tail = { 0 };
1538 
1539 	mutex_enter(&qp->nq_mutex);
1540 	cmd->nc_completed = B_FALSE;
1541 
1542 	/*
1543 	 * Now that we hold the queue pair lock, we must check whether or not
1544 	 * the controller has been listed as dead (e.g. was removed due to
1545 	 * hotplug). This is necessary as otherwise we could race with
1546 	 * nvme_remove_callback(). Because this has not been enqueued, we don't
1547 	 * call nvme_unqueue_cmd(), which is why we must manually decrement the
1548 	 * semaphore.
1549 	 */
1550 	if (cmd->nc_nvme->n_dead) {
1551 		taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
1552 		    cmd, TQ_NOSLEEP, &cmd->nc_tqent);
1553 		sema_v(&qp->nq_sema);
1554 		mutex_exit(&qp->nq_mutex);
1555 		return;
1556 	}
1557 
1558 	/*
1559 	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
1560 	 * slot. If the slot is already occupied advance to the next slot and
1561 	 * try again. This can happen for long running commands like async event
1562 	 * requests.
1563 	 */
1564 	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1565 		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1566 	qp->nq_cmd[qp->nq_next_cmd] = cmd;
1567 
1568 	qp->nq_active_cmds++;
1569 
1570 	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1571 	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1572 	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1573 	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
1574 	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1575 	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1576 
1577 	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1578 	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1579 
1580 	mutex_exit(&qp->nq_mutex);
1581 }
1582 
1583 static nvme_cmd_t *
1584 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1585 {
1586 	nvme_cmd_t *cmd;
1587 
1588 	ASSERT(mutex_owned(&qp->nq_mutex));
1589 	ASSERT3S(cid, <, qp->nq_nentry);
1590 
1591 	cmd = qp->nq_cmd[cid];
1592 	/*
1593 	 * Some controllers will erroneously add things to the completion queue
1594 	 * for which there is no matching outstanding command. If this happens,
1595 	 * it is almost certainly a controller firmware bug since nq_mutex
1596 	 * is held across command submission and ringing the queue doorbell,
1597 	 * and is also held in this function.
1598 	 *
1599 	 * If we see such an unexpected command, there is not much we can do.
1600 	 * These will be logged and counted in nvme_get_completed(), but
1601 	 * otherwise ignored.
1602 	 */
1603 	if (cmd == NULL)
1604 		return (NULL);
1605 	qp->nq_cmd[cid] = NULL;
1606 	ASSERT3U(qp->nq_active_cmds, >, 0);
1607 	qp->nq_active_cmds--;
1608 	sema_v(&qp->nq_sema);
1609 
1610 	ASSERT3P(cmd, !=, NULL);
1611 	ASSERT3P(cmd->nc_nvme, ==, nvme);
1612 	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1613 
1614 	return (cmd);
1615 }
1616 
1617 /*
1618  * Get the command tied to the next completed cqe and bump along completion
1619  * queue head counter.
1620  */
1621 static nvme_cmd_t *
1622 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
1623 {
1624 	nvme_qpair_t *qp;
1625 	nvme_cqe_t *cqe;
1626 	nvme_cmd_t *cmd;
1627 
1628 	ASSERT(mutex_owned(&cq->ncq_mutex));
1629 
1630 retry:
1631 	cqe = &cq->ncq_cq[cq->ncq_head];
1632 
1633 	/* Check phase tag of CQE. Hardware inverts it for new entries. */
1634 	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
1635 		return (NULL);
1636 
1637 	qp = nvme->n_ioq[cqe->cqe_sqid];
1638 
1639 	mutex_enter(&qp->nq_mutex);
1640 	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
1641 	mutex_exit(&qp->nq_mutex);
1642 
1643 	qp->nq_sqhead = cqe->cqe_sqhd;
1644 	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
1645 
1646 	/* Toggle phase on wrap-around. */
1647 	if (cq->ncq_head == 0)
1648 		cq->ncq_phase = cq->ncq_phase != 0 ? 0 : 1;
1649 
1650 	if (cmd == NULL) {
1651 		dev_err(nvme->n_dip, CE_WARN,
1652 		    "!received completion for unknown cid 0x%x", cqe->cqe_cid);
1653 		atomic_inc_32(&nvme->n_unknown_cid);
1654 		/*
1655 		 * We want to ignore this unexpected completion entry as it
1656 		 * is most likely a result of a bug in the controller firmware.
1657 		 * However, if we return NULL, then callers will assume there
1658 		 * are no more pending commands for this wakeup. Retry to keep
1659 		 * enumerating commands until the phase tag indicates there are
1660 		 * no more and we are really done.
1661 		 */
1662 		goto retry;
1663 	}
1664 
1665 	ASSERT3U(cmd->nc_sqid, ==, cqe->cqe_sqid);
1666 	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
1667 
1668 	return (cmd);
1669 }
1670 
1671 /*
1672  * Process all completed commands on the io completion queue.
1673  */
1674 static uint_t
1675 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
1676 {
1677 	nvme_reg_cqhdbl_t head = { 0 };
1678 	nvme_cmd_t *cmd;
1679 	uint_t completed = 0;
1680 
1681 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1682 	    DDI_SUCCESS)
1683 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1684 		    __func__);
1685 
1686 	mutex_enter(&cq->ncq_mutex);
1687 
1688 	while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1689 		taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
1690 		    TQ_NOSLEEP, &cmd->nc_tqent);
1691 
1692 		completed++;
1693 	}
1694 
1695 	if (completed > 0) {
1696 		/*
1697 		 * Update the completion queue head doorbell.
1698 		 */
1699 		head.b.cqhdbl_cqh = cq->ncq_head;
1700 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1701 	}
1702 
1703 	mutex_exit(&cq->ncq_mutex);
1704 
1705 	return (completed);
1706 }
1707 
1708 static nvme_cmd_t *
1709 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
1710 {
1711 	nvme_cq_t *cq = qp->nq_cq;
1712 	nvme_reg_cqhdbl_t head = { 0 };
1713 	nvme_cmd_t *cmd;
1714 
1715 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1716 	    DDI_SUCCESS)
1717 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1718 		    __func__);
1719 
1720 	mutex_enter(&cq->ncq_mutex);
1721 
1722 	if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1723 		head.b.cqhdbl_cqh = cq->ncq_head;
1724 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1725 	}
1726 
1727 	mutex_exit(&cq->ncq_mutex);
1728 
1729 	return (cmd);
1730 }
1731 
1732 static int
1733 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
1734 {
1735 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1736 
1737 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1738 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1739 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1740 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1741 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1742 
1743 	if (cmd->nc_xfer != NULL)
1744 		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1745 
1746 	if (cmd->nc_nvme->n_strict_version) {
1747 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1748 	}
1749 
1750 	return (EIO);
1751 }
1752 
1753 static int
1754 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
1755 {
1756 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1757 
1758 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1759 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1760 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1761 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1762 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1763 	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
1764 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1765 	}
1766 
1767 	return (EIO);
1768 }
1769 
1770 static int
1771 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
1772 {
1773 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1774 
1775 	switch (cqe->cqe_sf.sf_sc) {
1776 	case NVME_CQE_SC_INT_NVM_WRITE:
1777 		/* write fail */
1778 		/* TODO: post ereport */
1779 		if (cmd->nc_xfer != NULL)
1780 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1781 		return (EIO);
1782 
1783 	case NVME_CQE_SC_INT_NVM_READ:
1784 		/* read fail */
1785 		/* TODO: post ereport */
1786 		if (cmd->nc_xfer != NULL)
1787 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1788 		return (EIO);
1789 
1790 	default:
1791 		return (nvme_check_unknown_cmd_status(cmd));
1792 	}
1793 }
1794 
1795 static int
1796 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
1797 {
1798 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1799 
1800 	switch (cqe->cqe_sf.sf_sc) {
1801 	case NVME_CQE_SC_GEN_SUCCESS:
1802 		return (0);
1803 
1804 	/*
1805 	 * Errors indicating a bug in the driver should cause a panic.
1806 	 */
1807 	case NVME_CQE_SC_GEN_INV_OPC:
1808 		/* Invalid Command Opcode */
1809 		if (!cmd->nc_dontpanic)
1810 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1811 			    "programming error: invalid opcode in cmd %p",
1812 			    (void *)cmd);
1813 		return (EINVAL);
1814 
1815 	case NVME_CQE_SC_GEN_INV_FLD:
1816 		/* Invalid Field in Command */
1817 		if (!cmd->nc_dontpanic)
1818 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1819 			    "programming error: invalid field in cmd %p",
1820 			    (void *)cmd);
1821 		return (EIO);
1822 
1823 	case NVME_CQE_SC_GEN_ID_CNFL:
1824 		/* Command ID Conflict */
1825 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1826 		    "cmd ID conflict in cmd %p", (void *)cmd);
1827 		return (0);
1828 
1829 	case NVME_CQE_SC_GEN_INV_NS:
1830 		/* Invalid Namespace or Format */
1831 		if (!cmd->nc_dontpanic)
1832 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1833 			    "programming error: invalid NS/format in cmd %p",
1834 			    (void *)cmd);
1835 		return (EINVAL);
1836 
1837 	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1838 		/* LBA Out Of Range */
1839 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1840 		    "LBA out of range in cmd %p", (void *)cmd);
1841 		return (0);
1842 
1843 	/*
1844 	 * Non-fatal errors, handle gracefully.
1845 	 */
1846 	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1847 		/* Data Transfer Error (DMA) */
1848 		/* TODO: post ereport */
1849 		atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1850 		if (cmd->nc_xfer != NULL)
1851 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1852 		return (EIO);
1853 
1854 	case NVME_CQE_SC_GEN_INTERNAL_ERR:
1855 		/*
1856 		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
1857 		 * detailed error information is returned as async event,
1858 		 * so we pretty much ignore the error here and handle it
1859 		 * in the async event handler.
1860 		 */
1861 		atomic_inc_32(&cmd->nc_nvme->n_internal_err);
1862 		if (cmd->nc_xfer != NULL)
1863 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1864 		return (EIO);
1865 
1866 	case NVME_CQE_SC_GEN_ABORT_REQUEST:
1867 		/*
1868 		 * Command Abort Requested. This normally happens only when a
1869 		 * command times out.
1870 		 */
1871 		/* TODO: post ereport or change blkdev to handle this? */
1872 		atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
1873 		return (ECANCELED);
1874 
1875 	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
1876 		/* Command Aborted due to Power Loss Notification */
1877 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
1878 		return (EIO);
1879 
1880 	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
1881 		/* Command Aborted due to SQ Deletion */
1882 		atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
1883 		return (EIO);
1884 
1885 	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
1886 		/* Capacity Exceeded */
1887 		atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
1888 		if (cmd->nc_xfer != NULL)
1889 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1890 		return (EIO);
1891 
1892 	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
1893 		/* Namespace Not Ready */
1894 		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
1895 		if (cmd->nc_xfer != NULL)
1896 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1897 		return (EIO);
1898 
1899 	case NVME_CQE_SC_GEN_NVM_FORMATTING:
1900 		/* Format in progress (1.2) */
1901 		if (!NVME_VERSION_ATLEAST(&cmd->nc_nvme->n_version, 1, 2))
1902 			return (nvme_check_unknown_cmd_status(cmd));
1903 		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_formatting);
1904 		if (cmd->nc_xfer != NULL)
1905 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1906 		return (EIO);
1907 
1908 	default:
1909 		return (nvme_check_unknown_cmd_status(cmd));
1910 	}
1911 }
1912 
1913 static int
1914 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
1915 {
1916 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1917 
1918 	switch (cqe->cqe_sf.sf_sc) {
1919 	case NVME_CQE_SC_SPC_INV_CQ:
1920 		/* Completion Queue Invalid */
1921 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
1922 		atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
1923 		return (EINVAL);
1924 
1925 	case NVME_CQE_SC_SPC_INV_QID:
1926 		/* Invalid Queue Identifier */
1927 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1928 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
1929 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
1930 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1931 		atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
1932 		return (EINVAL);
1933 
1934 	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
1935 		/* Max Queue Size Exceeded */
1936 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1937 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1938 		atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
1939 		return (EINVAL);
1940 
1941 	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
1942 		/* Abort Command Limit Exceeded */
1943 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
1944 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1945 		    "abort command limit exceeded in cmd %p", (void *)cmd);
1946 		return (0);
1947 
1948 	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
1949 		/* Async Event Request Limit Exceeded */
1950 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
1951 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1952 		    "async event request limit exceeded in cmd %p",
1953 		    (void *)cmd);
1954 		return (0);
1955 
1956 	case NVME_CQE_SC_SPC_INV_INT_VECT:
1957 		/* Invalid Interrupt Vector */
1958 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1959 		atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
1960 		return (EINVAL);
1961 
1962 	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
1963 		/* Invalid Log Page */
1964 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
1965 		atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
1966 		return (EINVAL);
1967 
1968 	case NVME_CQE_SC_SPC_INV_FORMAT:
1969 		/* Invalid Format */
1970 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
1971 		atomic_inc_32(&cmd->nc_nvme->n_inv_format);
1972 		if (cmd->nc_xfer != NULL)
1973 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1974 		return (EINVAL);
1975 
1976 	case NVME_CQE_SC_SPC_INV_Q_DEL:
1977 		/* Invalid Queue Deletion */
1978 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1979 		atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
1980 		return (EINVAL);
1981 
1982 	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
1983 		/* Conflicting Attributes */
1984 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
1985 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1986 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1987 		atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
1988 		if (cmd->nc_xfer != NULL)
1989 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1990 		return (EINVAL);
1991 
1992 	case NVME_CQE_SC_SPC_NVM_INV_PROT:
1993 		/* Invalid Protection Information */
1994 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
1995 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1996 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1997 		atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
1998 		if (cmd->nc_xfer != NULL)
1999 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2000 		return (EINVAL);
2001 
2002 	case NVME_CQE_SC_SPC_NVM_READONLY:
2003 		/* Write to Read Only Range */
2004 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
2005 		atomic_inc_32(&cmd->nc_nvme->n_readonly);
2006 		if (cmd->nc_xfer != NULL)
2007 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
2008 		return (EROFS);
2009 
2010 	case NVME_CQE_SC_SPC_INV_FW_SLOT:
2011 		/* Invalid Firmware Slot */
2012 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2013 		return (EINVAL);
2014 
2015 	case NVME_CQE_SC_SPC_INV_FW_IMG:
2016 		/* Invalid Firmware Image */
2017 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2018 		return (EINVAL);
2019 
2020 	case NVME_CQE_SC_SPC_FW_RESET:
2021 		/* Conventional Reset Required */
2022 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2023 		return (0);
2024 
2025 	case NVME_CQE_SC_SPC_FW_NSSR:
2026 		/* NVMe Subsystem Reset Required */
2027 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2028 		return (0);
2029 
2030 	case NVME_CQE_SC_SPC_FW_NEXT_RESET:
2031 		/* Activation Requires Reset */
2032 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2033 		return (0);
2034 
2035 	case NVME_CQE_SC_SPC_FW_MTFA:
2036 		/* Activation Requires Maximum Time Violation */
2037 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2038 		return (EAGAIN);
2039 
2040 	case NVME_CQE_SC_SPC_FW_PROHIBITED:
2041 		/* Activation Prohibited */
2042 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
2043 		return (EINVAL);
2044 
2045 	case NVME_CQE_SC_SPC_FW_OVERLAP:
2046 		/* Overlapping Firmware Ranges */
2047 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD);
2048 		return (EINVAL);
2049 
2050 	default:
2051 		return (nvme_check_unknown_cmd_status(cmd));
2052 	}
2053 }
2054 
2055 static inline int
2056 nvme_check_cmd_status(nvme_cmd_t *cmd)
2057 {
2058 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2059 
2060 	/*
2061 	 * Take a shortcut if the controller is dead, or if
2062 	 * command status indicates no error.
2063 	 */
2064 	if (cmd->nc_nvme->n_dead)
2065 		return (EIO);
2066 
2067 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2068 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2069 		return (0);
2070 
2071 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
2072 		return (nvme_check_generic_cmd_status(cmd));
2073 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
2074 		return (nvme_check_specific_cmd_status(cmd));
2075 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
2076 		return (nvme_check_integrity_cmd_status(cmd));
2077 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
2078 		return (nvme_check_vendor_cmd_status(cmd));
2079 
2080 	return (nvme_check_unknown_cmd_status(cmd));
2081 }
2082 
2083 /*
2084  * Check the command status as used by an ioctl path and do not convert it to an
2085  * errno. We still allow all the command status checking to occur, but otherwise
2086  * will pass back the controller error as is.
2087  */
2088 static boolean_t
2089 nvme_check_cmd_status_ioctl(nvme_cmd_t *cmd, nvme_ioctl_common_t *ioc)
2090 {
2091 	nvme_cqe_t *cqe = &cmd->nc_cqe;
2092 	nvme_t *nvme = cmd->nc_nvme;
2093 
2094 	if (nvme->n_dead) {
2095 		return (nvme_ioctl_error(ioc, nvme->n_dead_status, 0, 0));
2096 	}
2097 
2098 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2099 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
2100 		return (B_TRUE);
2101 
2102 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC) {
2103 		(void) nvme_check_generic_cmd_status(cmd);
2104 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC) {
2105 		(void) nvme_check_specific_cmd_status(cmd);
2106 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY) {
2107 		(void) nvme_check_integrity_cmd_status(cmd);
2108 	} else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR) {
2109 		(void) nvme_check_vendor_cmd_status(cmd);
2110 	} else {
2111 		(void) nvme_check_unknown_cmd_status(cmd);
2112 	}
2113 
2114 	return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_ERROR,
2115 	    cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_sc));
2116 }
2117 
2118 static int
2119 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
2120 {
2121 	nvme_t *nvme = abort_cmd->nc_nvme;
2122 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2123 	nvme_abort_cmd_t ac = { 0 };
2124 	int ret = 0;
2125 
2126 	sema_p(&nvme->n_abort_sema);
2127 
2128 	ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
2129 	ac.b.ac_sqid = abort_cmd->nc_sqid;
2130 
2131 	cmd->nc_sqid = 0;
2132 	cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
2133 	cmd->nc_callback = nvme_wakeup_cmd;
2134 	cmd->nc_sqe.sqe_cdw10 = ac.r;
2135 
2136 	/*
2137 	 * Send the ABORT to the hardware. The ABORT command will return _after_
2138 	 * the aborted command has completed (aborted or otherwise), but since
2139 	 * we still hold the aborted command's mutex its callback hasn't been
2140 	 * processed yet.
2141 	 */
2142 	nvme_admin_cmd(cmd, sec);
2143 	sema_v(&nvme->n_abort_sema);
2144 
2145 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2146 		dev_err(nvme->n_dip, CE_WARN,
2147 		    "!ABORT failed with sct = %x, sc = %x",
2148 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2149 		atomic_inc_32(&nvme->n_abort_failed);
2150 	} else {
2151 		dev_err(nvme->n_dip, CE_WARN,
2152 		    "!ABORT of command %d/%d %ssuccessful",
2153 		    abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
2154 		    cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
2155 		if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
2156 			atomic_inc_32(&nvme->n_cmd_aborted);
2157 	}
2158 
2159 	nvme_free_cmd(cmd);
2160 	return (ret);
2161 }
2162 
2163 /*
2164  * nvme_wait_cmd -- wait for command completion or timeout
2165  *
2166  * In case of a serious error or a timeout of the abort command the hardware
2167  * will be declared dead and FMA will be notified.
2168  */
2169 static void
2170 nvme_wait_cmd(nvme_cmd_t *cmd, uint32_t sec)
2171 {
2172 	clock_t timeout = ddi_get_lbolt() + drv_usectohz((long)sec * MICROSEC);
2173 	nvme_t *nvme = cmd->nc_nvme;
2174 	nvme_reg_csts_t csts;
2175 	nvme_qpair_t *qp;
2176 
2177 	ASSERT(mutex_owned(&cmd->nc_mutex));
2178 
2179 	while (!cmd->nc_completed) {
2180 		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
2181 			break;
2182 	}
2183 
2184 	if (cmd->nc_completed)
2185 		return;
2186 
2187 	/*
2188 	 * The command timed out.
2189 	 *
2190 	 * Check controller for fatal status, any errors associated with the
2191 	 * register or DMA handle, or for a double timeout (abort command timed
2192 	 * out). If necessary log a warning and call FMA.
2193 	 */
2194 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2195 	dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
2196 	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
2197 	    cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
2198 	atomic_inc_32(&nvme->n_cmd_timeout);
2199 
2200 	if (csts.b.csts_cfs ||
2201 	    nvme_check_regs_hdl(nvme) ||
2202 	    nvme_check_dma_hdl(cmd->nc_dma) ||
2203 	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
2204 		nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2205 	} else if (nvme_abort_cmd(cmd, sec) == 0) {
2206 		/*
2207 		 * If the abort succeeded the command should complete
2208 		 * immediately with an appropriate status.
2209 		 */
2210 		while (!cmd->nc_completed)
2211 			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
2212 
2213 		return;
2214 	}
2215 
2216 	qp = nvme->n_ioq[cmd->nc_sqid];
2217 
2218 	mutex_enter(&qp->nq_mutex);
2219 	(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
2220 	mutex_exit(&qp->nq_mutex);
2221 
2222 	/*
2223 	 * As we don't know what the presumed dead hardware might still do with
2224 	 * the DMA memory, we'll put the command on the lost commands list if it
2225 	 * has any DMA memory.
2226 	 */
2227 	if (cmd->nc_dma != NULL) {
2228 		mutex_enter(&nvme_lc_mutex);
2229 		list_insert_head(&nvme_lost_cmds, cmd);
2230 		mutex_exit(&nvme_lc_mutex);
2231 	}
2232 }
2233 
2234 static void
2235 nvme_wakeup_cmd(void *arg)
2236 {
2237 	nvme_cmd_t *cmd = arg;
2238 
2239 	mutex_enter(&cmd->nc_mutex);
2240 	cmd->nc_completed = B_TRUE;
2241 	cv_signal(&cmd->nc_cv);
2242 	mutex_exit(&cmd->nc_mutex);
2243 }
2244 
2245 static void
2246 nvme_async_event_task(void *arg)
2247 {
2248 	nvme_cmd_t *cmd = arg;
2249 	nvme_t *nvme = cmd->nc_nvme;
2250 	nvme_error_log_entry_t *error_log = NULL;
2251 	nvme_health_log_t *health_log = NULL;
2252 	nvme_nschange_list_t *nslist = NULL;
2253 	size_t logsize = 0;
2254 	nvme_async_event_t event;
2255 
2256 	/*
2257 	 * Check for errors associated with the async request itself. The only
2258 	 * command-specific error is "async event limit exceeded", which
2259 	 * indicates a programming error in the driver and causes a panic in
2260 	 * nvme_check_cmd_status().
2261 	 *
2262 	 * Other possible errors are various scenarios where the async request
2263 	 * was aborted, or internal errors in the device. Internal errors are
2264 	 * reported to FMA, the command aborts need no special handling here.
2265 	 *
2266 	 * And finally, at least qemu nvme does not support async events,
2267 	 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
2268 	 * will avoid posting async events.
2269 	 */
2270 
2271 	if (nvme_check_cmd_status(cmd) != 0) {
2272 		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
2273 		    "!async event request returned failure, sct = 0x%x, "
2274 		    "sc = 0x%x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
2275 		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
2276 		    cmd->nc_cqe.cqe_sf.sf_m);
2277 
2278 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2279 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
2280 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2281 		}
2282 
2283 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2284 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
2285 		    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
2286 			nvme->n_async_event_supported = B_FALSE;
2287 		}
2288 
2289 		nvme_free_cmd(cmd);
2290 		return;
2291 	}
2292 
2293 	event.r = cmd->nc_cqe.cqe_dw0;
2294 
2295 	/* Clear CQE and re-submit the async request. */
2296 	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
2297 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2298 	cmd = NULL;	/* cmd can no longer be used after resubmission */
2299 
2300 	switch (event.b.ae_type) {
2301 	case NVME_ASYNC_TYPE_ERROR:
2302 		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
2303 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2304 			    (void **)&error_log, &logsize,
2305 			    NVME_LOGPAGE_ERROR)) {
2306 				return;
2307 			}
2308 		} else {
2309 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2310 			    "async event reply: type=0x%x logpage=0x%x",
2311 			    event.b.ae_type, event.b.ae_logpage);
2312 			atomic_inc_32(&nvme->n_wrong_logpage);
2313 			return;
2314 		}
2315 
2316 		switch (event.b.ae_info) {
2317 		case NVME_ASYNC_ERROR_INV_SQ:
2318 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2319 			    "invalid submission queue");
2320 			return;
2321 
2322 		case NVME_ASYNC_ERROR_INV_DBL:
2323 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
2324 			    "invalid doorbell write value");
2325 			return;
2326 
2327 		case NVME_ASYNC_ERROR_DIAGFAIL:
2328 			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
2329 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2330 			atomic_inc_32(&nvme->n_diagfail_event);
2331 			break;
2332 
2333 		case NVME_ASYNC_ERROR_PERSISTENT:
2334 			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
2335 			    "device error");
2336 			nvme_ctrl_mark_dead(cmd->nc_nvme, B_FALSE);
2337 			atomic_inc_32(&nvme->n_persistent_event);
2338 			break;
2339 
2340 		case NVME_ASYNC_ERROR_TRANSIENT:
2341 			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
2342 			    "device error");
2343 			/* TODO: send ereport */
2344 			atomic_inc_32(&nvme->n_transient_event);
2345 			break;
2346 
2347 		case NVME_ASYNC_ERROR_FW_LOAD:
2348 			dev_err(nvme->n_dip, CE_WARN,
2349 			    "!firmware image load error");
2350 			atomic_inc_32(&nvme->n_fw_load_event);
2351 			break;
2352 		}
2353 		break;
2354 
2355 	case NVME_ASYNC_TYPE_HEALTH:
2356 		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
2357 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2358 			    (void **)&health_log, &logsize,
2359 			    NVME_LOGPAGE_HEALTH)) {
2360 				return;
2361 			}
2362 		} else {
2363 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
2364 			    "type=0x%x logpage=0x%x", event.b.ae_type,
2365 			    event.b.ae_logpage);
2366 			atomic_inc_32(&nvme->n_wrong_logpage);
2367 			return;
2368 		}
2369 
2370 		switch (event.b.ae_info) {
2371 		case NVME_ASYNC_HEALTH_RELIABILITY:
2372 			dev_err(nvme->n_dip, CE_WARN,
2373 			    "!device reliability compromised");
2374 			/* TODO: send ereport */
2375 			atomic_inc_32(&nvme->n_reliability_event);
2376 			break;
2377 
2378 		case NVME_ASYNC_HEALTH_TEMPERATURE:
2379 			dev_err(nvme->n_dip, CE_WARN,
2380 			    "!temperature above threshold");
2381 			/* TODO: send ereport */
2382 			atomic_inc_32(&nvme->n_temperature_event);
2383 			break;
2384 
2385 		case NVME_ASYNC_HEALTH_SPARE:
2386 			dev_err(nvme->n_dip, CE_WARN,
2387 			    "!spare space below threshold");
2388 			/* TODO: send ereport */
2389 			atomic_inc_32(&nvme->n_spare_event);
2390 			break;
2391 		}
2392 		break;
2393 
2394 	case NVME_ASYNC_TYPE_NOTICE:
2395 		switch (event.b.ae_info) {
2396 		case NVME_ASYNC_NOTICE_NS_CHANGE:
2397 			if (event.b.ae_logpage != NVME_LOGPAGE_NSCHANGE) {
2398 				dev_err(nvme->n_dip, CE_WARN,
2399 				    "!wrong logpage in async event reply: "
2400 				    "type=0x%x logpage=0x%x",
2401 				    event.b.ae_type, event.b.ae_logpage);
2402 				atomic_inc_32(&nvme->n_wrong_logpage);
2403 				break;
2404 			}
2405 
2406 			dev_err(nvme->n_dip, CE_NOTE,
2407 			    "namespace attribute change event, "
2408 			    "logpage = 0x%x", event.b.ae_logpage);
2409 			atomic_inc_32(&nvme->n_notice_event);
2410 
2411 			if (!nvme_get_logpage_int(nvme, B_FALSE,
2412 			    (void **)&nslist, &logsize,
2413 			    NVME_LOGPAGE_NSCHANGE)) {
2414 				break;
2415 			}
2416 
2417 			if (nslist->nscl_ns[0] == UINT32_MAX) {
2418 				dev_err(nvme->n_dip, CE_CONT,
2419 				    "more than %u namespaces have changed.\n",
2420 				    NVME_NSCHANGE_LIST_SIZE);
2421 				break;
2422 			}
2423 
2424 			mutex_enter(&nvme->n_mgmt_mutex);
2425 			for (uint_t i = 0; i < NVME_NSCHANGE_LIST_SIZE; i++) {
2426 				uint32_t nsid = nslist->nscl_ns[i];
2427 
2428 				if (nsid == 0)	/* end of list */
2429 					break;
2430 
2431 				dev_err(nvme->n_dip, CE_NOTE,
2432 				    "!namespace nvme%d/%u has changed.",
2433 				    ddi_get_instance(nvme->n_dip), nsid);
2434 
2435 
2436 				if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
2437 					continue;
2438 
2439 				bd_state_change(nvme_nsid2ns(nvme,
2440 				    nsid)->ns_bd_hdl);
2441 			}
2442 			mutex_exit(&nvme->n_mgmt_mutex);
2443 
2444 			break;
2445 
2446 		case NVME_ASYNC_NOTICE_FW_ACTIVATE:
2447 			dev_err(nvme->n_dip, CE_NOTE,
2448 			    "firmware activation starting, "
2449 			    "logpage = 0x%x", event.b.ae_logpage);
2450 			atomic_inc_32(&nvme->n_notice_event);
2451 			break;
2452 
2453 		case NVME_ASYNC_NOTICE_TELEMETRY:
2454 			dev_err(nvme->n_dip, CE_NOTE,
2455 			    "telemetry log changed, "
2456 			    "logpage = 0x%x", event.b.ae_logpage);
2457 			atomic_inc_32(&nvme->n_notice_event);
2458 			break;
2459 
2460 		case NVME_ASYNC_NOTICE_NS_ASYMM:
2461 			dev_err(nvme->n_dip, CE_NOTE,
2462 			    "asymmetric namespace access change, "
2463 			    "logpage = 0x%x", event.b.ae_logpage);
2464 			atomic_inc_32(&nvme->n_notice_event);
2465 			break;
2466 
2467 		case NVME_ASYNC_NOTICE_LATENCYLOG:
2468 			dev_err(nvme->n_dip, CE_NOTE,
2469 			    "predictable latency event aggregate log change, "
2470 			    "logpage = 0x%x", event.b.ae_logpage);
2471 			atomic_inc_32(&nvme->n_notice_event);
2472 			break;
2473 
2474 		case NVME_ASYNC_NOTICE_LBASTATUS:
2475 			dev_err(nvme->n_dip, CE_NOTE,
2476 			    "LBA status information alert, "
2477 			    "logpage = 0x%x", event.b.ae_logpage);
2478 			atomic_inc_32(&nvme->n_notice_event);
2479 			break;
2480 
2481 		case NVME_ASYNC_NOTICE_ENDURANCELOG:
2482 			dev_err(nvme->n_dip, CE_NOTE,
2483 			    "endurance group event aggregate log page change, "
2484 			    "logpage = 0x%x", event.b.ae_logpage);
2485 			atomic_inc_32(&nvme->n_notice_event);
2486 			break;
2487 
2488 		default:
2489 			dev_err(nvme->n_dip, CE_WARN,
2490 			    "!unknown notice async event received, "
2491 			    "info = 0x%x, logpage = 0x%x", event.b.ae_info,
2492 			    event.b.ae_logpage);
2493 			atomic_inc_32(&nvme->n_unknown_event);
2494 			break;
2495 		}
2496 		break;
2497 
2498 	case NVME_ASYNC_TYPE_VENDOR:
2499 		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
2500 		    "received, info = 0x%x, logpage = 0x%x", event.b.ae_info,
2501 		    event.b.ae_logpage);
2502 		atomic_inc_32(&nvme->n_vendor_event);
2503 		break;
2504 
2505 	default:
2506 		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
2507 		    "type = 0x%x, info = 0x%x, logpage = 0x%x", event.b.ae_type,
2508 		    event.b.ae_info, event.b.ae_logpage);
2509 		atomic_inc_32(&nvme->n_unknown_event);
2510 		break;
2511 	}
2512 
2513 	if (error_log != NULL)
2514 		kmem_free(error_log, logsize);
2515 
2516 	if (health_log != NULL)
2517 		kmem_free(health_log, logsize);
2518 
2519 	if (nslist != NULL)
2520 		kmem_free(nslist, logsize);
2521 }
2522 
2523 static void
2524 nvme_admin_cmd(nvme_cmd_t *cmd, uint32_t sec)
2525 {
2526 	mutex_enter(&cmd->nc_mutex);
2527 	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
2528 	nvme_wait_cmd(cmd, sec);
2529 	mutex_exit(&cmd->nc_mutex);
2530 }
2531 
2532 static void
2533 nvme_async_event(nvme_t *nvme)
2534 {
2535 	nvme_cmd_t *cmd;
2536 
2537 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2538 	cmd->nc_sqid = 0;
2539 	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
2540 	cmd->nc_callback = nvme_async_event_task;
2541 	cmd->nc_dontpanic = B_TRUE;
2542 
2543 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
2544 }
2545 
2546 /*
2547  * There are commands such as format or vendor unique commands that are going to
2548  * manipulate the data in a namespace or destroy them, we make sure that none of
2549  * the ones that will be impacted are actually attached.
2550  */
2551 static boolean_t
2552 nvme_no_blkdev_attached(nvme_t *nvme, uint32_t nsid)
2553 {
2554 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
2555 	ASSERT3U(nsid, !=, 0);
2556 
2557 	if (nsid != NVME_NSID_BCAST) {
2558 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
2559 		return (!ns->ns_attached);
2560 	}
2561 
2562 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
2563 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
2564 
2565 		if (ns->ns_attached) {
2566 			return (B_FALSE);
2567 		}
2568 	}
2569 
2570 	return (B_TRUE);
2571 }
2572 
2573 static boolean_t
2574 nvme_format_nvm(nvme_t *nvme, nvme_ioctl_format_t *ioc)
2575 {
2576 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2577 	nvme_format_nvm_t format_nvm = { 0 };
2578 	boolean_t ret;
2579 
2580 	format_nvm.b.fm_lbaf = bitx32(ioc->nif_lbaf, 3, 0);
2581 	format_nvm.b.fm_ses = bitx32(ioc->nif_ses, 2, 0);
2582 
2583 	cmd->nc_sqid = 0;
2584 	cmd->nc_callback = nvme_wakeup_cmd;
2585 	cmd->nc_sqe.sqe_nsid = ioc->nif_common.nioc_nsid;
2586 	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
2587 	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
2588 
2589 	/*
2590 	 * We don't want to panic on any format commands. There are two reasons
2591 	 * for this:
2592 	 *
2593 	 * 1) All format commands are initiated by users. We don't want to panic
2594 	 * on user commands.
2595 	 *
2596 	 * 2) Several devices like the Samsung SM951 don't allow formatting of
2597 	 * all namespaces in one command and we'd prefer to handle that
2598 	 * gracefully.
2599 	 */
2600 	cmd->nc_dontpanic = B_TRUE;
2601 
2602 	nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
2603 
2604 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nif_common) != 0) {
2605 		dev_err(nvme->n_dip, CE_WARN,
2606 		    "!FORMAT failed with sct = %x, sc = %x",
2607 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2608 		ret = B_FALSE;
2609 		goto fail;
2610 	}
2611 
2612 	ret = B_TRUE;
2613 fail:
2614 	nvme_free_cmd(cmd);
2615 	return (ret);
2616 }
2617 
2618 /*
2619  * Retrieve a specific log page. The contents of the log page request should
2620  * have already been validated by the system.
2621  */
2622 static boolean_t
2623 nvme_get_logpage(nvme_t *nvme, boolean_t user, nvme_ioctl_get_logpage_t *log,
2624     void **buf)
2625 {
2626 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2627 	nvme_getlogpage_dw10_t dw10;
2628 	uint32_t offlo, offhi;
2629 	nvme_getlogpage_dw11_t dw11;
2630 	nvme_getlogpage_dw14_t dw14;
2631 	uint32_t ndw;
2632 	boolean_t ret = B_FALSE;
2633 
2634 	bzero(&dw10, sizeof (dw10));
2635 	bzero(&dw11, sizeof (dw11));
2636 	bzero(&dw14, sizeof (dw14));
2637 
2638 	cmd->nc_sqid = 0;
2639 	cmd->nc_callback = nvme_wakeup_cmd;
2640 	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
2641 	cmd->nc_sqe.sqe_nsid = log->nigl_common.nioc_nsid;
2642 
2643 	if (user)
2644 		cmd->nc_dontpanic = B_TRUE;
2645 
2646 	/*
2647 	 * The size field is the number of double words, but is a zeros based
2648 	 * value. We need to store our actual value minus one.
2649 	 */
2650 	ndw = (uint32_t)(log->nigl_len / 4);
2651 	ASSERT3U(ndw, >, 0);
2652 	ndw--;
2653 
2654 	dw10.b.lp_lid = bitx32(log->nigl_lid, 7, 0);
2655 	dw10.b.lp_lsp = bitx32(log->nigl_lsp, 6, 0);
2656 	dw10.b.lp_rae = bitx32(log->nigl_lsp, 0, 0);
2657 	dw10.b.lp_lnumdl = bitx32(ndw, 15, 0);
2658 
2659 	dw11.b.lp_numdu = bitx32(ndw, 31, 16);
2660 	dw11.b.lp_lsi = bitx32(log->nigl_lsi, 15, 0);
2661 
2662 	offlo = bitx64(log->nigl_offset, 31, 0);
2663 	offhi = bitx64(log->nigl_offset, 63, 32);
2664 
2665 	dw14.b.lp_csi = bitx32(log->nigl_csi, 7, 0);
2666 
2667 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2668 	cmd->nc_sqe.sqe_cdw11 = dw11.r;
2669 	cmd->nc_sqe.sqe_cdw12 = offlo;
2670 	cmd->nc_sqe.sqe_cdw13 = offhi;
2671 	cmd->nc_sqe.sqe_cdw14 = dw14.r;
2672 
2673 	if (nvme_zalloc_dma(nvme, log->nigl_len, DDI_DMA_READ,
2674 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2675 		dev_err(nvme->n_dip, CE_WARN,
2676 		    "!nvme_zalloc_dma failed for GET LOG PAGE");
2677 		ret = nvme_ioctl_error(&log->nigl_common,
2678 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2679 		goto fail;
2680 	}
2681 
2682 	if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
2683 		ret = nvme_ioctl_error(&log->nigl_common,
2684 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2685 		goto fail;
2686 	}
2687 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2688 
2689 	if (!nvme_check_cmd_status_ioctl(cmd, &log->nigl_common)) {
2690 		if (!user) {
2691 			dev_err(nvme->n_dip, CE_WARN,
2692 			    "!GET LOG PAGE failed with sct = %x, sc = %x",
2693 			    cmd->nc_cqe.cqe_sf.sf_sct,
2694 			    cmd->nc_cqe.cqe_sf.sf_sc);
2695 		}
2696 		ret = B_FALSE;
2697 		goto fail;
2698 	}
2699 
2700 	*buf = kmem_alloc(log->nigl_len, KM_SLEEP);
2701 	bcopy(cmd->nc_dma->nd_memp, *buf, log->nigl_len);
2702 
2703 	ret = B_TRUE;
2704 fail:
2705 	nvme_free_cmd(cmd);
2706 
2707 	return (ret);
2708 }
2709 
2710 /*
2711  * This is an internal wrapper for when the kernel wants to get a log page.
2712  * Currently this assumes that the only thing that is required is the log page
2713  * ID. If more information is required, we'll be better served to just use the
2714  * general ioctl interface.
2715  */
2716 static boolean_t
2717 nvme_get_logpage_int(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
2718     uint8_t lid)
2719 {
2720 	const nvme_log_page_info_t *info = NULL;
2721 	nvme_ioctl_get_logpage_t log;
2722 	nvme_valid_ctrl_data_t data;
2723 	boolean_t bret;
2724 	bool var;
2725 
2726 	for (size_t i = 0; i < nvme_std_log_npages; i++) {
2727 		if (nvme_std_log_pages[i].nlpi_lid == lid &&
2728 		    nvme_std_log_pages[i].nlpi_csi == NVME_CSI_NVM) {
2729 			info = &nvme_std_log_pages[i];
2730 			break;
2731 		}
2732 	}
2733 
2734 	if (info == NULL) {
2735 		return (B_FALSE);
2736 	}
2737 
2738 	data.vcd_vers = &nvme->n_version;
2739 	data.vcd_id = nvme->n_idctl;
2740 	bzero(&log, sizeof (log));
2741 	log.nigl_common.nioc_nsid = NVME_NSID_BCAST;
2742 	log.nigl_csi = info->nlpi_csi;
2743 	log.nigl_lid = info->nlpi_lid;
2744 	log.nigl_len = nvme_log_page_info_size(info, &data, &var);
2745 
2746 	/*
2747 	 * We only support getting standard fixed-length log pages through the
2748 	 * kernel interface at this time. If a log page either has an unknown
2749 	 * size or has a variable length, then we cannot get it.
2750 	 */
2751 	if (log.nigl_len == 0 || var) {
2752 		return (B_FALSE);
2753 	}
2754 
2755 	bret = nvme_get_logpage(nvme, user, &log, buf);
2756 	if (!bret) {
2757 		return (B_FALSE);
2758 	}
2759 
2760 	*bufsize = log.nigl_len;
2761 	return (B_TRUE);
2762 }
2763 
2764 static boolean_t
2765 nvme_identify(nvme_t *nvme, boolean_t user, nvme_ioctl_identify_t *ioc,
2766     void **buf)
2767 {
2768 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2769 	boolean_t ret = B_FALSE;
2770 	nvme_identify_dw10_t dw10;
2771 
2772 	ASSERT3P(buf, !=, NULL);
2773 
2774 	bzero(&dw10, sizeof (dw10));
2775 
2776 	cmd->nc_sqid = 0;
2777 	cmd->nc_callback = nvme_wakeup_cmd;
2778 	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
2779 	cmd->nc_sqe.sqe_nsid = ioc->nid_common.nioc_nsid;
2780 
2781 	dw10.b.id_cns = bitx32(ioc->nid_cns, 7, 0);
2782 	dw10.b.id_cntid = bitx32(ioc->nid_ctrlid, 15, 0);
2783 
2784 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2785 
2786 	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
2787 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2788 		dev_err(nvme->n_dip, CE_WARN,
2789 		    "!nvme_zalloc_dma failed for IDENTIFY");
2790 		ret = nvme_ioctl_error(&ioc->nid_common,
2791 		    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
2792 		goto fail;
2793 	}
2794 
2795 	if (cmd->nc_dma->nd_ncookie > 2) {
2796 		dev_err(nvme->n_dip, CE_WARN,
2797 		    "!too many DMA cookies for IDENTIFY");
2798 		atomic_inc_32(&nvme->n_too_many_cookies);
2799 		ret = nvme_ioctl_error(&ioc->nid_common,
2800 		    NVME_IOCTL_E_BAD_PRP, 0, 0);
2801 		goto fail;
2802 	}
2803 
2804 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2805 	if (cmd->nc_dma->nd_ncookie > 1) {
2806 		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2807 		    &cmd->nc_dma->nd_cookie);
2808 		cmd->nc_sqe.sqe_dptr.d_prp[1] =
2809 		    cmd->nc_dma->nd_cookie.dmac_laddress;
2810 	}
2811 
2812 	if (user)
2813 		cmd->nc_dontpanic = B_TRUE;
2814 
2815 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2816 
2817 	if (!nvme_check_cmd_status_ioctl(cmd, &ioc->nid_common)) {
2818 		dev_err(nvme->n_dip, CE_WARN,
2819 		    "!IDENTIFY failed with sct = %x, sc = %x",
2820 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2821 		ret = B_FALSE;
2822 		goto fail;
2823 	}
2824 
2825 	*buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
2826 	bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
2827 	ret = B_TRUE;
2828 
2829 fail:
2830 	nvme_free_cmd(cmd);
2831 
2832 	return (ret);
2833 }
2834 
2835 static boolean_t
2836 nvme_identify_int(nvme_t *nvme, uint32_t nsid, uint8_t cns, void **buf)
2837 {
2838 	nvme_ioctl_identify_t id;
2839 
2840 	bzero(&id, sizeof (nvme_ioctl_identify_t));
2841 	id.nid_common.nioc_nsid = nsid;
2842 	id.nid_cns = cns;
2843 
2844 	return (nvme_identify(nvme, B_FALSE, &id, buf));
2845 }
2846 
2847 static int
2848 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2849     uint32_t val, uint32_t *res)
2850 {
2851 	_NOTE(ARGUNUSED(nsid));
2852 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2853 	int ret = EINVAL;
2854 
2855 	ASSERT(res != NULL);
2856 
2857 	cmd->nc_sqid = 0;
2858 	cmd->nc_callback = nvme_wakeup_cmd;
2859 	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
2860 	cmd->nc_sqe.sqe_cdw10 = feature;
2861 	cmd->nc_sqe.sqe_cdw11 = val;
2862 
2863 	if (user)
2864 		cmd->nc_dontpanic = B_TRUE;
2865 
2866 	switch (feature) {
2867 	case NVME_FEAT_WRITE_CACHE:
2868 		if (!nvme->n_write_cache_present)
2869 			goto fail;
2870 		break;
2871 
2872 	case NVME_FEAT_NQUEUES:
2873 		break;
2874 
2875 	default:
2876 		goto fail;
2877 	}
2878 
2879 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2880 
2881 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2882 		dev_err(nvme->n_dip, CE_WARN,
2883 		    "!SET FEATURES %d failed with sct = %x, sc = %x",
2884 		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
2885 		    cmd->nc_cqe.cqe_sf.sf_sc);
2886 		goto fail;
2887 	}
2888 
2889 	*res = cmd->nc_cqe.cqe_dw0;
2890 
2891 fail:
2892 	nvme_free_cmd(cmd);
2893 	return (ret);
2894 }
2895 
2896 static int
2897 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2898 {
2899 	nvme_write_cache_t nwc = { 0 };
2900 
2901 	if (enable)
2902 		nwc.b.wc_wce = 1;
2903 
2904 	return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
2905 	    nwc.r, &nwc.r));
2906 }
2907 
2908 static int
2909 nvme_set_nqueues(nvme_t *nvme)
2910 {
2911 	nvme_nqueues_t nq = { 0 };
2912 	int ret;
2913 
2914 	/*
2915 	 * The default is to allocate one completion queue per vector.
2916 	 */
2917 	if (nvme->n_completion_queues == -1)
2918 		nvme->n_completion_queues = nvme->n_intr_cnt;
2919 
2920 	/*
2921 	 * There is no point in having more completion queues than
2922 	 * interrupt vectors.
2923 	 */
2924 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2925 	    nvme->n_intr_cnt);
2926 
2927 	/*
2928 	 * The default is to use one submission queue per completion queue.
2929 	 */
2930 	if (nvme->n_submission_queues == -1)
2931 		nvme->n_submission_queues = nvme->n_completion_queues;
2932 
2933 	/*
2934 	 * There is no point in having more completion queues than
2935 	 * submission queues.
2936 	 */
2937 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2938 	    nvme->n_submission_queues);
2939 
2940 	ASSERT(nvme->n_submission_queues > 0);
2941 	ASSERT(nvme->n_completion_queues > 0);
2942 
2943 	nq.b.nq_nsq = nvme->n_submission_queues - 1;
2944 	nq.b.nq_ncq = nvme->n_completion_queues - 1;
2945 
2946 	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2947 	    &nq.r);
2948 
2949 	if (ret == 0) {
2950 		/*
2951 		 * Never use more than the requested number of queues.
2952 		 */
2953 		nvme->n_submission_queues = MIN(nvme->n_submission_queues,
2954 		    nq.b.nq_nsq + 1);
2955 		nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2956 		    nq.b.nq_ncq + 1);
2957 	}
2958 
2959 	return (ret);
2960 }
2961 
2962 static int
2963 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
2964 {
2965 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2966 	nvme_create_queue_dw10_t dw10 = { 0 };
2967 	nvme_create_cq_dw11_t c_dw11 = { 0 };
2968 	int ret;
2969 
2970 	dw10.b.q_qid = cq->ncq_id;
2971 	dw10.b.q_qsize = cq->ncq_nentry - 1;
2972 
2973 	c_dw11.b.cq_pc = 1;
2974 	c_dw11.b.cq_ien = 1;
2975 	c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
2976 
2977 	cmd->nc_sqid = 0;
2978 	cmd->nc_callback = nvme_wakeup_cmd;
2979 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
2980 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2981 	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2982 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
2983 
2984 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2985 
2986 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2987 		dev_err(nvme->n_dip, CE_WARN,
2988 		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
2989 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2990 	}
2991 
2992 	nvme_free_cmd(cmd);
2993 
2994 	return (ret);
2995 }
2996 
2997 static int
2998 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
2999 {
3000 	nvme_cq_t *cq = qp->nq_cq;
3001 	nvme_cmd_t *cmd;
3002 	nvme_create_queue_dw10_t dw10 = { 0 };
3003 	nvme_create_sq_dw11_t s_dw11 = { 0 };
3004 	int ret;
3005 
3006 	/*
3007 	 * It is possible to have more qpairs than completion queues,
3008 	 * and when the idx > ncq_id, that completion queue is shared
3009 	 * and has already been created.
3010 	 */
3011 	if (idx <= cq->ncq_id &&
3012 	    nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
3013 		return (DDI_FAILURE);
3014 
3015 	dw10.b.q_qid = idx;
3016 	dw10.b.q_qsize = qp->nq_nentry - 1;
3017 
3018 	s_dw11.b.sq_pc = 1;
3019 	s_dw11.b.sq_cqid = cq->ncq_id;
3020 
3021 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
3022 	cmd->nc_sqid = 0;
3023 	cmd->nc_callback = nvme_wakeup_cmd;
3024 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
3025 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
3026 	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
3027 	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
3028 
3029 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
3030 
3031 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
3032 		dev_err(nvme->n_dip, CE_WARN,
3033 		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
3034 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
3035 	}
3036 
3037 	nvme_free_cmd(cmd);
3038 
3039 	return (ret);
3040 }
3041 
3042 static boolean_t
3043 nvme_reset(nvme_t *nvme, boolean_t quiesce)
3044 {
3045 	nvme_reg_csts_t csts;
3046 	int i;
3047 
3048 	nvme_put32(nvme, NVME_REG_CC, 0);
3049 
3050 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3051 	if (csts.b.csts_rdy == 1) {
3052 		nvme_put32(nvme, NVME_REG_CC, 0);
3053 
3054 		/*
3055 		 * The timeout value is from the Controller Capabilities
3056 		 * register (CAP.TO, section 3.1.1). This is the worst case
3057 		 * time to wait for CSTS.RDY to transition from 1 to 0 after
3058 		 * CC.EN transitions from 1 to 0.
3059 		 *
3060 		 * The timeout units are in 500 ms units, and we are delaying
3061 		 * in 50ms chunks, hence counting to n_timeout * 10.
3062 		 */
3063 		for (i = 0; i < nvme->n_timeout * 10; i++) {
3064 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3065 			if (csts.b.csts_rdy == 0)
3066 				break;
3067 
3068 			/*
3069 			 * Quiescing drivers should not use locks or timeouts,
3070 			 * so if this is the quiesce path, use a quiesce-safe
3071 			 * delay.
3072 			 */
3073 			if (quiesce) {
3074 				drv_usecwait(50000);
3075 			} else {
3076 				delay(drv_usectohz(50000));
3077 			}
3078 		}
3079 	}
3080 
3081 	nvme_put32(nvme, NVME_REG_AQA, 0);
3082 	nvme_put32(nvme, NVME_REG_ASQ, 0);
3083 	nvme_put32(nvme, NVME_REG_ACQ, 0);
3084 
3085 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3086 	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
3087 }
3088 
3089 static void
3090 nvme_shutdown(nvme_t *nvme, boolean_t quiesce)
3091 {
3092 	nvme_reg_cc_t cc;
3093 	nvme_reg_csts_t csts;
3094 	int i;
3095 
3096 	cc.r = nvme_get32(nvme, NVME_REG_CC);
3097 	cc.b.cc_shn = NVME_CC_SHN_NORMAL;
3098 	nvme_put32(nvme, NVME_REG_CC, cc.r);
3099 
3100 	for (i = 0; i < 10; i++) {
3101 		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3102 		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
3103 			break;
3104 
3105 		if (quiesce) {
3106 			drv_usecwait(100000);
3107 		} else {
3108 			delay(drv_usectohz(100000));
3109 		}
3110 	}
3111 }
3112 
3113 /*
3114  * Return length of string without trailing spaces.
3115  */
3116 static int
3117 nvme_strlen(const char *str, int len)
3118 {
3119 	if (len <= 0)
3120 		return (0);
3121 
3122 	while (str[--len] == ' ')
3123 		;
3124 
3125 	return (++len);
3126 }
3127 
3128 static void
3129 nvme_config_min_block_size(nvme_t *nvme, char *model, char *val)
3130 {
3131 	ulong_t bsize = 0;
3132 	char *msg = "";
3133 
3134 	if (ddi_strtoul(val, NULL, 0, &bsize) != 0)
3135 		goto err;
3136 
3137 	if (!ISP2(bsize)) {
3138 		msg = ": not a power of 2";
3139 		goto err;
3140 	}
3141 
3142 	if (bsize < NVME_DEFAULT_MIN_BLOCK_SIZE) {
3143 		msg = ": too low";
3144 		goto err;
3145 	}
3146 
3147 	nvme->n_min_block_size = bsize;
3148 	return;
3149 
3150 err:
3151 	dev_err(nvme->n_dip, CE_WARN,
3152 	    "!nvme-config-list: ignoring invalid min-phys-block-size '%s' "
3153 	    "for model '%s'%s", val, model, msg);
3154 
3155 	nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3156 }
3157 
3158 static void
3159 nvme_config_boolean(nvme_t *nvme, char *model, char *name, char *val,
3160     boolean_t *b)
3161 {
3162 	if (strcmp(val, "on") == 0 ||
3163 	    strcmp(val, "true") == 0)
3164 		*b = B_TRUE;
3165 	else if (strcmp(val, "off") == 0 ||
3166 	    strcmp(val, "false") == 0)
3167 		*b = B_FALSE;
3168 	else
3169 		dev_err(nvme->n_dip, CE_WARN,
3170 		    "!nvme-config-list: invalid value for %s '%s'"
3171 		    " for model '%s', ignoring", name, val, model);
3172 }
3173 
3174 static void
3175 nvme_config_list(nvme_t *nvme)
3176 {
3177 	char	**config_list;
3178 	uint_t	nelem;
3179 	int	rv, i;
3180 
3181 	/*
3182 	 * We're following the pattern of 'sd-config-list' here, but extend it.
3183 	 * Instead of two we have three separate strings for "model", "fwrev",
3184 	 * and "name-value-list".
3185 	 */
3186 	rv = ddi_prop_lookup_string_array(DDI_DEV_T_ANY, nvme->n_dip,
3187 	    DDI_PROP_DONTPASS, "nvme-config-list", &config_list, &nelem);
3188 
3189 	if (rv != DDI_PROP_SUCCESS) {
3190 		if (rv == DDI_PROP_CANNOT_DECODE) {
3191 			dev_err(nvme->n_dip, CE_WARN,
3192 			    "!nvme-config-list: cannot be decoded");
3193 		}
3194 
3195 		return;
3196 	}
3197 
3198 	if ((nelem % 3) != 0) {
3199 		dev_err(nvme->n_dip, CE_WARN, "!nvme-config-list: must be "
3200 		    "triplets of <model>/<fwrev>/<name-value-list> strings ");
3201 		goto out;
3202 	}
3203 
3204 	for (i = 0; i < nelem; i += 3) {
3205 		char	*model = config_list[i];
3206 		char	*fwrev = config_list[i + 1];
3207 		char	*nvp, *save_nv;
3208 		int	id_model_len, id_fwrev_len;
3209 
3210 		id_model_len = nvme_strlen(nvme->n_idctl->id_model,
3211 		    sizeof (nvme->n_idctl->id_model));
3212 
3213 		if (strlen(model) != id_model_len)
3214 			continue;
3215 
3216 		if (strncmp(model, nvme->n_idctl->id_model, id_model_len) != 0)
3217 			continue;
3218 
3219 		id_fwrev_len = nvme_strlen(nvme->n_idctl->id_fwrev,
3220 		    sizeof (nvme->n_idctl->id_fwrev));
3221 
3222 		if (strlen(fwrev) != 0) {
3223 			boolean_t match = B_FALSE;
3224 			char *fwr, *last_fw;
3225 
3226 			for (fwr = strtok_r(fwrev, ",", &last_fw);
3227 			    fwr != NULL;
3228 			    fwr = strtok_r(NULL, ",", &last_fw)) {
3229 				if (strlen(fwr) != id_fwrev_len)
3230 					continue;
3231 
3232 				if (strncmp(fwr, nvme->n_idctl->id_fwrev,
3233 				    id_fwrev_len) == 0)
3234 					match = B_TRUE;
3235 			}
3236 
3237 			if (!match)
3238 				continue;
3239 		}
3240 
3241 		/*
3242 		 * We should now have a comma-separated list of name:value
3243 		 * pairs.
3244 		 */
3245 		for (nvp = strtok_r(config_list[i + 2], ",", &save_nv);
3246 		    nvp != NULL; nvp = strtok_r(NULL, ",", &save_nv)) {
3247 			char	*name = nvp;
3248 			char	*val = strchr(nvp, ':');
3249 
3250 			if (val == NULL || name == val) {
3251 				dev_err(nvme->n_dip, CE_WARN,
3252 				    "!nvme-config-list: <name-value-list> "
3253 				    "for model '%s' is malformed", model);
3254 				goto out;
3255 			}
3256 
3257 			/*
3258 			 * Null-terminate 'name', move 'val' past ':' sep.
3259 			 */
3260 			*val++ = '\0';
3261 
3262 			/*
3263 			 * Process the name:val pairs that we know about.
3264 			 */
3265 			if (strcmp(name, "ignore-unknown-vendor-status") == 0) {
3266 				nvme_config_boolean(nvme, model, name, val,
3267 				    &nvme->n_ignore_unknown_vendor_status);
3268 			} else if (strcmp(name, "min-phys-block-size") == 0) {
3269 				nvme_config_min_block_size(nvme, model, val);
3270 			} else if (strcmp(name, "volatile-write-cache") == 0) {
3271 				nvme_config_boolean(nvme, model, name, val,
3272 				    &nvme->n_write_cache_enabled);
3273 			} else {
3274 				/*
3275 				 * Unknown 'name'.
3276 				 */
3277 				dev_err(nvme->n_dip, CE_WARN,
3278 				    "!nvme-config-list: unknown config '%s' "
3279 				    "for model '%s', ignoring", name, model);
3280 			}
3281 		}
3282 	}
3283 
3284 out:
3285 	ddi_prop_free(config_list);
3286 }
3287 
3288 static void
3289 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
3290 {
3291 	/*
3292 	 * Section 7.7 of the spec describes how to get a unique ID for
3293 	 * the controller: the vendor ID, the model name and the serial
3294 	 * number shall be unique when combined.
3295 	 *
3296 	 * If a namespace has no EUI64 we use the above and add the hex
3297 	 * namespace ID to get a unique ID for the namespace.
3298 	 */
3299 	char model[sizeof (nvme->n_idctl->id_model) + 1];
3300 	char serial[sizeof (nvme->n_idctl->id_serial) + 1];
3301 
3302 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
3303 	bcopy(nvme->n_idctl->id_serial, serial,
3304 	    sizeof (nvme->n_idctl->id_serial));
3305 
3306 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
3307 	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
3308 
3309 	nvme_nsid2ns(nvme, nsid)->ns_devid = kmem_asprintf("%4X-%s-%s-%X",
3310 	    nvme->n_idctl->id_vid, model, serial, nsid);
3311 }
3312 
3313 static nvme_identify_nsid_list_t *
3314 nvme_update_nsid_list(nvme_t *nvme, int cns)
3315 {
3316 	nvme_identify_nsid_list_t *nslist;
3317 
3318 	/*
3319 	 * We currently don't handle cases where there are more than
3320 	 * 1024 active namespaces, requiring several IDENTIFY commands.
3321 	 */
3322 	if (nvme_identify_int(nvme, 0, cns, (void **)&nslist))
3323 		return (nslist);
3324 
3325 	return (NULL);
3326 }
3327 
3328 nvme_namespace_t *
3329 nvme_nsid2ns(nvme_t *nvme, uint32_t nsid)
3330 {
3331 	ASSERT3U(nsid, !=, 0);
3332 	ASSERT3U(nsid, <=, nvme->n_namespace_count);
3333 	return (&nvme->n_ns[nsid - 1]);
3334 }
3335 
3336 static boolean_t
3337 nvme_allocated_ns(nvme_namespace_t *ns)
3338 {
3339 	nvme_t *nvme = ns->ns_nvme;
3340 	uint32_t i;
3341 
3342 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3343 
3344 	/*
3345 	 * If supported, update the list of allocated namespace IDs.
3346 	 */
3347 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2) &&
3348 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
3349 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
3350 		    NVME_IDENTIFY_NSID_ALLOC_LIST);
3351 		boolean_t found = B_FALSE;
3352 
3353 		/*
3354 		 * When namespace management is supported, this really shouldn't
3355 		 * be NULL. Treat all namespaces as allocated if it is.
3356 		 */
3357 		if (nslist == NULL)
3358 			return (B_TRUE);
3359 
3360 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
3361 			if (ns->ns_id == 0)
3362 				break;
3363 
3364 			if (ns->ns_id == nslist->nl_nsid[i])
3365 				found = B_TRUE;
3366 		}
3367 
3368 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
3369 		return (found);
3370 	} else {
3371 		/*
3372 		 * If namespace management isn't supported, report all
3373 		 * namespaces as allocated.
3374 		 */
3375 		return (B_TRUE);
3376 	}
3377 }
3378 
3379 static boolean_t
3380 nvme_active_ns(nvme_namespace_t *ns)
3381 {
3382 	nvme_t *nvme = ns->ns_nvme;
3383 	uint64_t *ptr;
3384 	uint32_t i;
3385 
3386 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3387 
3388 	/*
3389 	 * If supported, update the list of active namespace IDs.
3390 	 */
3391 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) {
3392 		nvme_identify_nsid_list_t *nslist = nvme_update_nsid_list(nvme,
3393 		    NVME_IDENTIFY_NSID_LIST);
3394 		boolean_t found = B_FALSE;
3395 
3396 		/*
3397 		 * When namespace management is supported, this really shouldn't
3398 		 * be NULL. Treat all namespaces as allocated if it is.
3399 		 */
3400 		if (nslist == NULL)
3401 			return (B_TRUE);
3402 
3403 		for (i = 0; i < ARRAY_SIZE(nslist->nl_nsid); i++) {
3404 			if (ns->ns_id == 0)
3405 				break;
3406 
3407 			if (ns->ns_id == nslist->nl_nsid[i])
3408 				found = B_TRUE;
3409 		}
3410 
3411 		kmem_free(nslist, NVME_IDENTIFY_BUFSIZE);
3412 		return (found);
3413 	}
3414 
3415 	/*
3416 	 * Workaround for revision 1.0:
3417 	 * Check whether the IDENTIFY NAMESPACE data is zero-filled.
3418 	 */
3419 	for (ptr = (uint64_t *)ns->ns_idns;
3420 	    ptr != (uint64_t *)(ns->ns_idns + 1);
3421 	    ptr++) {
3422 		if (*ptr != 0) {
3423 			return (B_TRUE);
3424 		}
3425 	}
3426 
3427 	return (B_FALSE);
3428 }
3429 
3430 static int
3431 nvme_init_ns(nvme_t *nvme, uint32_t nsid)
3432 {
3433 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
3434 	nvme_identify_nsid_t *idns;
3435 	boolean_t was_ignored;
3436 	int last_rp;
3437 
3438 	ns->ns_nvme = nvme;
3439 
3440 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3441 
3442 	/*
3443 	 * Because we might rescan a namespace and this will fail after boot
3444 	 * that'd leave us in a bad spot. We need to do something about this
3445 	 * longer term, but it's not clear how exactly we would recover right
3446 	 * now.
3447 	 */
3448 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
3449 	    (void **)&idns)) {
3450 		dev_err(nvme->n_dip, CE_WARN,
3451 		    "!failed to identify namespace %d", nsid);
3452 		return (DDI_FAILURE);
3453 	}
3454 
3455 	if (ns->ns_idns != NULL)
3456 		kmem_free(ns->ns_idns, sizeof (nvme_identify_nsid_t));
3457 
3458 	ns->ns_idns = idns;
3459 	ns->ns_id = nsid;
3460 
3461 	was_ignored = ns->ns_ignore;
3462 
3463 	ns->ns_allocated = nvme_allocated_ns(ns);
3464 	ns->ns_active = nvme_active_ns(ns);
3465 
3466 	ns->ns_block_count = idns->id_nsize;
3467 	ns->ns_block_size =
3468 	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
3469 	ns->ns_best_block_size = ns->ns_block_size;
3470 
3471 	/*
3472 	 * Get the EUI64 if present.
3473 	 */
3474 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
3475 		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
3476 
3477 	/*
3478 	 * Get the NGUID if present.
3479 	 */
3480 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2))
3481 		bcopy(idns->id_nguid, ns->ns_nguid, sizeof (ns->ns_nguid));
3482 
3483 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
3484 	if (*(uint64_t *)ns->ns_eui64 == 0)
3485 		nvme_prepare_devid(nvme, ns->ns_id);
3486 
3487 	(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%u", ns->ns_id);
3488 
3489 	/*
3490 	 * Find the LBA format with no metadata and the best relative
3491 	 * performance. A value of 3 means "degraded", 0 is best.
3492 	 */
3493 	last_rp = 3;
3494 	for (int j = 0; j <= idns->id_nlbaf; j++) {
3495 		if (idns->id_lbaf[j].lbaf_lbads == 0)
3496 			break;
3497 		if (idns->id_lbaf[j].lbaf_ms != 0)
3498 			continue;
3499 		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
3500 			continue;
3501 		last_rp = idns->id_lbaf[j].lbaf_rp;
3502 		ns->ns_best_block_size =
3503 		    1 << idns->id_lbaf[j].lbaf_lbads;
3504 	}
3505 
3506 	if (ns->ns_best_block_size < nvme->n_min_block_size)
3507 		ns->ns_best_block_size = nvme->n_min_block_size;
3508 
3509 	was_ignored = ns->ns_ignore;
3510 
3511 	/*
3512 	 * We currently don't support namespaces that are inactive, or use
3513 	 * either:
3514 	 * - protection information
3515 	 * - illegal block size (< 512)
3516 	 */
3517 	if (!ns->ns_active) {
3518 		ns->ns_ignore = B_TRUE;
3519 	} else if (idns->id_dps.dp_pinfo) {
3520 		dev_err(nvme->n_dip, CE_WARN,
3521 		    "!ignoring namespace %d, unsupported feature: "
3522 		    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
3523 		ns->ns_ignore = B_TRUE;
3524 	} else if (ns->ns_block_size < 512) {
3525 		dev_err(nvme->n_dip, CE_WARN,
3526 		    "!ignoring namespace %d, unsupported block size %"PRIu64,
3527 		    nsid, (uint64_t)ns->ns_block_size);
3528 		ns->ns_ignore = B_TRUE;
3529 	} else {
3530 		ns->ns_ignore = B_FALSE;
3531 	}
3532 
3533 	/*
3534 	 * Keep a count of namespaces which are attachable.
3535 	 * See comments in nvme_bd_driveinfo() to understand its effect.
3536 	 */
3537 	if (was_ignored) {
3538 		/*
3539 		 * Previously ignored, but now not. Count it.
3540 		 */
3541 		if (!ns->ns_ignore)
3542 			nvme->n_namespaces_attachable++;
3543 	} else {
3544 		/*
3545 		 * Wasn't ignored previously, but now needs to be.
3546 		 * Discount it.
3547 		 */
3548 		if (ns->ns_ignore)
3549 			nvme->n_namespaces_attachable--;
3550 	}
3551 
3552 	return (DDI_SUCCESS);
3553 }
3554 
3555 static boolean_t
3556 nvme_attach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
3557 {
3558 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
3559 
3560 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3561 
3562 	if (ns->ns_ignore) {
3563 		return (nvme_ioctl_error(com, NVME_IOCTL_E_UNSUP_ATTACH_NS,
3564 		    0, 0));
3565 	}
3566 
3567 	if (ns->ns_bd_hdl == NULL) {
3568 		bd_ops_t ops = nvme_bd_ops;
3569 
3570 		if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
3571 			ops.o_free_space = NULL;
3572 
3573 		ns->ns_bd_hdl = bd_alloc_handle(ns, &ops, &nvme->n_prp_dma_attr,
3574 		    KM_SLEEP);
3575 
3576 		if (ns->ns_bd_hdl == NULL) {
3577 			dev_err(nvme->n_dip, CE_WARN, "!Failed to get blkdev "
3578 			    "handle for namespace id %u", com->nioc_nsid);
3579 			return (nvme_ioctl_error(com,
3580 			    NVME_IOCTL_E_BLKDEV_ATTACH, 0, 0));
3581 		}
3582 	}
3583 
3584 	if (bd_attach_handle(nvme->n_dip, ns->ns_bd_hdl) != DDI_SUCCESS) {
3585 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_ATTACH,
3586 		    0, 0));
3587 	}
3588 
3589 	ns->ns_attached = B_TRUE;
3590 
3591 	return (B_TRUE);
3592 }
3593 
3594 static boolean_t
3595 nvme_detach_ns(nvme_t *nvme, nvme_ioctl_common_t *com)
3596 {
3597 	nvme_namespace_t *ns = nvme_nsid2ns(nvme, com->nioc_nsid);
3598 
3599 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3600 
3601 	if (ns->ns_ignore || !ns->ns_attached)
3602 		return (B_TRUE);
3603 
3604 	ASSERT3P(ns->ns_bd_hdl, !=, NULL);
3605 	if (bd_detach_handle(ns->ns_bd_hdl) != DDI_SUCCESS) {
3606 		return (nvme_ioctl_error(com, NVME_IOCTL_E_BLKDEV_DETACH, 0,
3607 		    0));
3608 	}
3609 
3610 	ns->ns_attached = B_FALSE;
3611 	return (B_TRUE);
3612 
3613 }
3614 
3615 /*
3616  * Rescan the namespace information associated with the namespaces indicated by
3617  * ioc. They should not be attached to blkdev right now.
3618  */
3619 static void
3620 nvme_rescan_ns(nvme_t *nvme, uint32_t nsid)
3621 {
3622 	ASSERT(MUTEX_HELD(&nvme->n_mgmt_mutex));
3623 	ASSERT3U(nsid, !=, 0);
3624 
3625 	if (nsid != NVME_NSID_BCAST) {
3626 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, nsid);
3627 
3628 		ASSERT3U(ns->ns_attached, ==, B_FALSE);
3629 		(void) nvme_init_ns(nvme, nsid);
3630 		return;
3631 	}
3632 
3633 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
3634 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
3635 
3636 		ASSERT3U(ns->ns_attached, ==, B_FALSE);
3637 		(void) nvme_init_ns(nvme, i);
3638 	}
3639 }
3640 
3641 typedef struct nvme_quirk_table {
3642 	uint16_t nq_vendor_id;
3643 	uint16_t nq_device_id;
3644 	nvme_quirk_t nq_quirks;
3645 } nvme_quirk_table_t;
3646 
3647 static const nvme_quirk_table_t nvme_quirks[] = {
3648 	{ 0x1987, 0x5018, NVME_QUIRK_START_CID },	/* Phison E18 */
3649 };
3650 
3651 static void
3652 nvme_detect_quirks(nvme_t *nvme)
3653 {
3654 	for (uint_t i = 0; i < ARRAY_SIZE(nvme_quirks); i++) {
3655 		const nvme_quirk_table_t *nqt = &nvme_quirks[i];
3656 
3657 		if (nqt->nq_vendor_id == nvme->n_vendor_id &&
3658 		    nqt->nq_device_id == nvme->n_device_id) {
3659 			nvme->n_quirks = nqt->nq_quirks;
3660 			return;
3661 		}
3662 	}
3663 }
3664 
3665 static int
3666 nvme_init(nvme_t *nvme)
3667 {
3668 	nvme_reg_cc_t cc = { 0 };
3669 	nvme_reg_aqa_t aqa = { 0 };
3670 	nvme_reg_asq_t asq = { 0 };
3671 	nvme_reg_acq_t acq = { 0 };
3672 	nvme_reg_cap_t cap;
3673 	nvme_reg_vs_t vs;
3674 	nvme_reg_csts_t csts;
3675 	int i = 0;
3676 	uint16_t nqueues;
3677 	uint_t tq_threads;
3678 	char model[sizeof (nvme->n_idctl->id_model) + 1];
3679 	char *vendor, *product;
3680 	uint32_t nsid;
3681 
3682 	/* Check controller version */
3683 	vs.r = nvme_get32(nvme, NVME_REG_VS);
3684 	nvme->n_version.v_major = vs.b.vs_mjr;
3685 	nvme->n_version.v_minor = vs.b.vs_mnr;
3686 	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
3687 	    nvme->n_version.v_major, nvme->n_version.v_minor);
3688 
3689 	if (nvme->n_version.v_major > nvme_version_major) {
3690 		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
3691 		    nvme_version_major);
3692 		if (nvme->n_strict_version)
3693 			goto fail;
3694 	}
3695 
3696 	/* retrieve controller configuration */
3697 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
3698 
3699 	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
3700 		dev_err(nvme->n_dip, CE_WARN,
3701 		    "!NVM command set not supported by hardware");
3702 		goto fail;
3703 	}
3704 
3705 	nvme->n_nssr_supported = cap.b.cap_nssrs;
3706 	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
3707 	nvme->n_timeout = cap.b.cap_to;
3708 	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
3709 	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
3710 	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
3711 
3712 	/*
3713 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
3714 	 * the base page size of 4k (1<<12), so add 12 here to get the real
3715 	 * page size value.
3716 	 */
3717 	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
3718 	    cap.b.cap_mpsmax + 12);
3719 	nvme->n_pagesize = 1UL << (nvme->n_pageshift);
3720 
3721 	/*
3722 	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
3723 	 */
3724 	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
3725 	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
3726 
3727 	/*
3728 	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
3729 	 * Maxxfer may be increased after we identified the controller limits.
3730 	 */
3731 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
3732 	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
3733 	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
3734 	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;
3735 
3736 	/*
3737 	 * Reset controller if it's still in ready state.
3738 	 */
3739 	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
3740 		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
3741 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
3742 		nvme->n_dead = B_TRUE;
3743 		goto fail;
3744 	}
3745 
3746 	/*
3747 	 * Create the cq array with one completion queue to be assigned
3748 	 * to the admin queue pair and a limited number of taskqs (4).
3749 	 */
3750 	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
3751 	    DDI_SUCCESS) {
3752 		dev_err(nvme->n_dip, CE_WARN,
3753 		    "!failed to pre-allocate admin completion queue");
3754 		goto fail;
3755 	}
3756 	/*
3757 	 * Create the admin queue pair.
3758 	 */
3759 	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
3760 	    != DDI_SUCCESS) {
3761 		dev_err(nvme->n_dip, CE_WARN,
3762 		    "!unable to allocate admin qpair");
3763 		goto fail;
3764 	}
3765 	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
3766 	nvme->n_ioq[0] = nvme->n_adminq;
3767 
3768 	if (nvme->n_quirks & NVME_QUIRK_START_CID)
3769 		nvme->n_adminq->nq_next_cmd++;
3770 
3771 	nvme->n_progress |= NVME_ADMIN_QUEUE;
3772 
3773 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3774 	    "admin-queue-len", nvme->n_admin_queue_len);
3775 
3776 	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
3777 	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
3778 	acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
3779 
3780 	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
3781 	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
3782 
3783 	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
3784 	nvme_put64(nvme, NVME_REG_ASQ, asq);
3785 	nvme_put64(nvme, NVME_REG_ACQ, acq);
3786 
3787 	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
3788 	cc.b.cc_css = 0;	/* use NVM command set */
3789 	cc.b.cc_mps = nvme->n_pageshift - 12;
3790 	cc.b.cc_shn = 0;	/* no shutdown in progress */
3791 	cc.b.cc_en = 1;		/* enable controller */
3792 	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
3793 	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */
3794 
3795 	nvme_put32(nvme, NVME_REG_CC, cc.r);
3796 
3797 	/*
3798 	 * Wait for the controller to become ready.
3799 	 */
3800 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3801 	if (csts.b.csts_rdy == 0) {
3802 		for (i = 0; i != nvme->n_timeout * 10; i++) {
3803 			delay(drv_usectohz(50000));
3804 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
3805 
3806 			if (csts.b.csts_cfs == 1) {
3807 				dev_err(nvme->n_dip, CE_WARN,
3808 				    "!controller fatal status at init");
3809 				ddi_fm_service_impact(nvme->n_dip,
3810 				    DDI_SERVICE_LOST);
3811 				nvme->n_dead = B_TRUE;
3812 				goto fail;
3813 			}
3814 
3815 			if (csts.b.csts_rdy == 1)
3816 				break;
3817 		}
3818 	}
3819 
3820 	if (csts.b.csts_rdy == 0) {
3821 		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
3822 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
3823 		nvme->n_dead = B_TRUE;
3824 		goto fail;
3825 	}
3826 
3827 	/*
3828 	 * Assume an abort command limit of 1. We'll destroy and re-init
3829 	 * that later when we know the true abort command limit.
3830 	 */
3831 	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
3832 
3833 	/*
3834 	 * Set up initial interrupt for admin queue.
3835 	 */
3836 	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
3837 	    != DDI_SUCCESS) &&
3838 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
3839 	    != DDI_SUCCESS) &&
3840 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
3841 	    != DDI_SUCCESS)) {
3842 		dev_err(nvme->n_dip, CE_WARN,
3843 		    "!failed to set up initial interrupt");
3844 		goto fail;
3845 	}
3846 
3847 	/*
3848 	 * Post an asynchronous event command to catch errors.
3849 	 * We assume the asynchronous events are supported as required by
3850 	 * specification (Figure 40 in section 5 of NVMe 1.2).
3851 	 * However, since at least qemu does not follow the specification,
3852 	 * we need a mechanism to protect ourselves.
3853 	 */
3854 	nvme->n_async_event_supported = B_TRUE;
3855 	nvme_async_event(nvme);
3856 
3857 	/*
3858 	 * Identify Controller
3859 	 */
3860 	if (!nvme_identify_int(nvme, 0, NVME_IDENTIFY_CTRL,
3861 	    (void **)&nvme->n_idctl)) {
3862 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify controller");
3863 		goto fail;
3864 	}
3865 
3866 	/*
3867 	 * Get the common namespace information if available. If not, we use the
3868 	 * information for nsid 1.
3869 	 */
3870 	if (nvme_ctrl_atleast(nvme, &nvme_vers_1v2) &&
3871 	    nvme->n_idctl->id_oacs.oa_nsmgmt != 0) {
3872 		nsid = NVME_NSID_BCAST;
3873 	} else {
3874 		nsid = 1;
3875 	}
3876 
3877 	if (!nvme_identify_int(nvme, nsid, NVME_IDENTIFY_NSID,
3878 	    (void **)&nvme->n_idcomns)) {
3879 		dev_err(nvme->n_dip, CE_WARN, "!failed to identify common "
3880 		    "namespace information");
3881 		goto fail;
3882 	}
3883 	/*
3884 	 * Process nvme-config-list (if present) in nvme.conf.
3885 	 */
3886 	nvme_config_list(nvme);
3887 
3888 	/*
3889 	 * Get Vendor & Product ID
3890 	 */
3891 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
3892 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
3893 	sata_split_model(model, &vendor, &product);
3894 
3895 	if (vendor == NULL)
3896 		nvme->n_vendor = strdup("NVMe");
3897 	else
3898 		nvme->n_vendor = strdup(vendor);
3899 
3900 	nvme->n_product = strdup(product);
3901 
3902 	/*
3903 	 * Get controller limits.
3904 	 */
3905 	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
3906 	    MIN(nvme->n_admin_queue_len / 10,
3907 	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
3908 
3909 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3910 	    "async-event-limit", nvme->n_async_event_limit);
3911 
3912 	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
3913 
3914 	/*
3915 	 * Reinitialize the semaphore with the true abort command limit
3916 	 * supported by the hardware. It's not necessary to disable interrupts
3917 	 * as only command aborts use the semaphore, and no commands are
3918 	 * executed or aborted while we're here.
3919 	 */
3920 	sema_destroy(&nvme->n_abort_sema);
3921 	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
3922 	    SEMA_DRIVER, NULL);
3923 
3924 	nvme->n_progress |= NVME_CTRL_LIMITS;
3925 
3926 	if (nvme->n_idctl->id_mdts == 0)
3927 		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
3928 	else
3929 		nvme->n_max_data_transfer_size =
3930 		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
3931 
3932 	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
3933 
3934 	/*
3935 	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
3936 	 * Chained PRPs are currently unsupported.
3937 	 *
3938 	 * This is a no-op on hardware which doesn't support a transfer size
3939 	 * big enough to require chained PRPs.
3940 	 */
3941 	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
3942 	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
3943 
3944 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
3945 
3946 	/*
3947 	 * Make sure the minimum/maximum queue entry sizes are not
3948 	 * larger/smaller than the default.
3949 	 */
3950 
3951 	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
3952 	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
3953 	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
3954 	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
3955 		goto fail;
3956 
3957 	/*
3958 	 * Check for the presence of a Volatile Write Cache. If present,
3959 	 * enable or disable based on the value of the property
3960 	 * volatile-write-cache-enable (default is enabled).
3961 	 */
3962 	nvme->n_write_cache_present =
3963 	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
3964 
3965 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3966 	    "volatile-write-cache-present",
3967 	    nvme->n_write_cache_present ? 1 : 0);
3968 
3969 	if (!nvme->n_write_cache_present) {
3970 		nvme->n_write_cache_enabled = B_FALSE;
3971 	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
3972 	    != 0) {
3973 		dev_err(nvme->n_dip, CE_WARN,
3974 		    "!failed to %sable volatile write cache",
3975 		    nvme->n_write_cache_enabled ? "en" : "dis");
3976 		/*
3977 		 * Assume the cache is (still) enabled.
3978 		 */
3979 		nvme->n_write_cache_enabled = B_TRUE;
3980 	}
3981 
3982 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
3983 	    "volatile-write-cache-enable",
3984 	    nvme->n_write_cache_enabled ? 1 : 0);
3985 
3986 	/*
3987 	 * Get number of supported namespaces and allocate namespace array.
3988 	 */
3989 	nvme->n_namespace_count = nvme->n_idctl->id_nn;
3990 
3991 	if (nvme->n_namespace_count == 0) {
3992 		dev_err(nvme->n_dip, CE_WARN,
3993 		    "!controllers without namespaces are not supported");
3994 		goto fail;
3995 	}
3996 
3997 	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
3998 		dev_err(nvme->n_dip, CE_WARN,
3999 		    "!too many namespaces: %d, limiting to %d\n",
4000 		    nvme->n_namespace_count, NVME_MINOR_MAX);
4001 		nvme->n_namespace_count = NVME_MINOR_MAX;
4002 	}
4003 
4004 	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
4005 	    nvme->n_namespace_count, KM_SLEEP);
4006 
4007 	/*
4008 	 * Try to set up MSI/MSI-X interrupts.
4009 	 */
4010 	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
4011 	    != 0) {
4012 		nvme_release_interrupts(nvme);
4013 
4014 		nqueues = MIN(UINT16_MAX, ncpus);
4015 
4016 		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
4017 		    nqueues) != DDI_SUCCESS) &&
4018 		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
4019 		    nqueues) != DDI_SUCCESS)) {
4020 			dev_err(nvme->n_dip, CE_WARN,
4021 			    "!failed to set up MSI/MSI-X interrupts");
4022 			goto fail;
4023 		}
4024 	}
4025 
4026 	/*
4027 	 * Create I/O queue pairs.
4028 	 */
4029 
4030 	if (nvme_set_nqueues(nvme) != 0) {
4031 		dev_err(nvme->n_dip, CE_WARN,
4032 		    "!failed to set number of I/O queues to %d",
4033 		    nvme->n_intr_cnt);
4034 		goto fail;
4035 	}
4036 
4037 	/*
4038 	 * Reallocate I/O queue array
4039 	 */
4040 	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
4041 	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
4042 	    (nvme->n_submission_queues + 1), KM_SLEEP);
4043 	nvme->n_ioq[0] = nvme->n_adminq;
4044 
4045 	/*
4046 	 * There should always be at least as many submission queues
4047 	 * as completion queues.
4048 	 */
4049 	ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
4050 
4051 	nvme->n_ioq_count = nvme->n_submission_queues;
4052 
4053 	nvme->n_io_squeue_len =
4054 	    MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
4055 
4056 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
4057 	    nvme->n_io_squeue_len);
4058 
4059 	/*
4060 	 * Pre-allocate completion queues.
4061 	 * When there are the same number of submission and completion
4062 	 * queues there is no value in having a larger completion
4063 	 * queue length.
4064 	 */
4065 	if (nvme->n_submission_queues == nvme->n_completion_queues)
4066 		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4067 		    nvme->n_io_squeue_len);
4068 
4069 	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
4070 	    nvme->n_max_queue_entries);
4071 
4072 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
4073 	    nvme->n_io_cqueue_len);
4074 
4075 	/*
4076 	 * Assign the equal quantity of taskq threads to each completion
4077 	 * queue, capping the total number of threads to the number
4078 	 * of CPUs.
4079 	 */
4080 	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
4081 
4082 	/*
4083 	 * In case the calculation above is zero, we need at least one
4084 	 * thread per completion queue.
4085 	 */
4086 	tq_threads = MAX(1, tq_threads);
4087 
4088 	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
4089 	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
4090 		dev_err(nvme->n_dip, CE_WARN,
4091 		    "!failed to pre-allocate completion queues");
4092 		goto fail;
4093 	}
4094 
4095 	/*
4096 	 * If we use less completion queues than interrupt vectors return
4097 	 * some of the interrupt vectors back to the system.
4098 	 */
4099 	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
4100 		nvme_release_interrupts(nvme);
4101 
4102 		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
4103 		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
4104 			dev_err(nvme->n_dip, CE_WARN,
4105 			    "!failed to reduce number of interrupts");
4106 			goto fail;
4107 		}
4108 	}
4109 
4110 	/*
4111 	 * Alloc & register I/O queue pairs
4112 	 */
4113 
4114 	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
4115 		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
4116 		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
4117 			dev_err(nvme->n_dip, CE_WARN,
4118 			    "!unable to allocate I/O qpair %d", i);
4119 			goto fail;
4120 		}
4121 
4122 		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
4123 			dev_err(nvme->n_dip, CE_WARN,
4124 			    "!unable to create I/O qpair %d", i);
4125 			goto fail;
4126 		}
4127 	}
4128 
4129 	/*
4130 	 * Post more asynchronous events commands to reduce event reporting
4131 	 * latency as suggested by the spec.
4132 	 */
4133 	if (nvme->n_async_event_supported) {
4134 		for (i = 1; i != nvme->n_async_event_limit; i++)
4135 			nvme_async_event(nvme);
4136 	}
4137 
4138 	return (DDI_SUCCESS);
4139 
4140 fail:
4141 	(void) nvme_reset(nvme, B_FALSE);
4142 	return (DDI_FAILURE);
4143 }
4144 
4145 static uint_t
4146 nvme_intr(caddr_t arg1, caddr_t arg2)
4147 {
4148 	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
4149 	nvme_t *nvme = (nvme_t *)arg1;
4150 	int inum = (int)(uintptr_t)arg2;
4151 	int ccnt = 0;
4152 	int qnum;
4153 
4154 	if (inum >= nvme->n_intr_cnt)
4155 		return (DDI_INTR_UNCLAIMED);
4156 
4157 	if (nvme->n_dead)
4158 		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
4159 		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
4160 
4161 	/*
4162 	 * The interrupt vector a queue uses is calculated as queue_idx %
4163 	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
4164 	 * in steps of n_intr_cnt to process all queues using this vector.
4165 	 */
4166 	for (qnum = inum;
4167 	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
4168 	    qnum += nvme->n_intr_cnt) {
4169 		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
4170 	}
4171 
4172 	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
4173 }
4174 
4175 static void
4176 nvme_release_interrupts(nvme_t *nvme)
4177 {
4178 	int i;
4179 
4180 	for (i = 0; i < nvme->n_intr_cnt; i++) {
4181 		if (nvme->n_inth[i] == NULL)
4182 			break;
4183 
4184 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4185 			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
4186 		else
4187 			(void) ddi_intr_disable(nvme->n_inth[i]);
4188 
4189 		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
4190 		(void) ddi_intr_free(nvme->n_inth[i]);
4191 	}
4192 
4193 	kmem_free(nvme->n_inth, nvme->n_inth_sz);
4194 	nvme->n_inth = NULL;
4195 	nvme->n_inth_sz = 0;
4196 
4197 	nvme->n_progress &= ~NVME_INTERRUPTS;
4198 }
4199 
4200 static int
4201 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
4202 {
4203 	int nintrs, navail, count;
4204 	int ret;
4205 	int i;
4206 
4207 	if (nvme->n_intr_types == 0) {
4208 		ret = ddi_intr_get_supported_types(nvme->n_dip,
4209 		    &nvme->n_intr_types);
4210 		if (ret != DDI_SUCCESS) {
4211 			dev_err(nvme->n_dip, CE_WARN,
4212 			    "!%s: ddi_intr_get_supported types failed",
4213 			    __func__);
4214 			return (ret);
4215 		}
4216 #ifdef __x86
4217 		if (get_hwenv() == HW_VMWARE)
4218 			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
4219 #endif
4220 	}
4221 
4222 	if ((nvme->n_intr_types & intr_type) == 0)
4223 		return (DDI_FAILURE);
4224 
4225 	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
4226 	if (ret != DDI_SUCCESS) {
4227 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
4228 		    __func__);
4229 		return (ret);
4230 	}
4231 
4232 	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
4233 	if (ret != DDI_SUCCESS) {
4234 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
4235 		    __func__);
4236 		return (ret);
4237 	}
4238 
4239 	/* We want at most one interrupt per queue pair. */
4240 	if (navail > nqpairs)
4241 		navail = nqpairs;
4242 
4243 	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
4244 	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
4245 
4246 	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
4247 	    &count, 0);
4248 	if (ret != DDI_SUCCESS) {
4249 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
4250 		    __func__);
4251 		goto fail;
4252 	}
4253 
4254 	nvme->n_intr_cnt = count;
4255 
4256 	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
4257 	if (ret != DDI_SUCCESS) {
4258 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
4259 		    __func__);
4260 		goto fail;
4261 	}
4262 
4263 	for (i = 0; i < count; i++) {
4264 		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
4265 		    (void *)nvme, (void *)(uintptr_t)i);
4266 		if (ret != DDI_SUCCESS) {
4267 			dev_err(nvme->n_dip, CE_WARN,
4268 			    "!%s: ddi_intr_add_handler failed", __func__);
4269 			goto fail;
4270 		}
4271 	}
4272 
4273 	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
4274 
4275 	for (i = 0; i < count; i++) {
4276 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
4277 			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
4278 		else
4279 			ret = ddi_intr_enable(nvme->n_inth[i]);
4280 
4281 		if (ret != DDI_SUCCESS) {
4282 			dev_err(nvme->n_dip, CE_WARN,
4283 			    "!%s: enabling interrupt %d failed", __func__, i);
4284 			goto fail;
4285 		}
4286 	}
4287 
4288 	nvme->n_intr_type = intr_type;
4289 
4290 	nvme->n_progress |= NVME_INTERRUPTS;
4291 
4292 	return (DDI_SUCCESS);
4293 
4294 fail:
4295 	nvme_release_interrupts(nvme);
4296 
4297 	return (ret);
4298 }
4299 
4300 static int
4301 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
4302 {
4303 	_NOTE(ARGUNUSED(arg));
4304 
4305 	pci_ereport_post(dip, fm_error, NULL);
4306 	return (fm_error->fme_status);
4307 }
4308 
4309 static void
4310 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
4311     void *b)
4312 {
4313 	nvme_t *nvme = a;
4314 
4315 	nvme_ctrl_mark_dead(nvme, B_TRUE);
4316 
4317 	/*
4318 	 * Fail all outstanding commands, including those in the admin queue
4319 	 * (queue 0).
4320 	 */
4321 	for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
4322 		nvme_qpair_t *qp = nvme->n_ioq[i];
4323 
4324 		mutex_enter(&qp->nq_mutex);
4325 		for (size_t j = 0; j < qp->nq_nentry; j++) {
4326 			nvme_cmd_t *cmd = qp->nq_cmd[j];
4327 			nvme_cmd_t *u_cmd;
4328 
4329 			if (cmd == NULL) {
4330 				continue;
4331 			}
4332 
4333 			/*
4334 			 * Since we have the queue lock held the entire time we
4335 			 * iterate over it, it's not possible for the queue to
4336 			 * change underneath us. Thus, we don't need to check
4337 			 * that the return value of nvme_unqueue_cmd matches the
4338 			 * requested cmd to unqueue.
4339 			 */
4340 			u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
4341 			taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
4342 			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
4343 
4344 			ASSERT3P(u_cmd, ==, cmd);
4345 		}
4346 		mutex_exit(&qp->nq_mutex);
4347 	}
4348 }
4349 
4350 /*
4351  * Open minor management
4352  */
4353 static int
4354 nvme_minor_comparator(const void *l, const void *r)
4355 {
4356 	const nvme_minor_t *lm = l;
4357 	const nvme_minor_t *rm = r;
4358 
4359 	if (lm->nm_minor > rm->nm_minor) {
4360 		return (1);
4361 	} else if (lm->nm_minor < rm->nm_minor) {
4362 		return (-1);
4363 	} else {
4364 		return (0);
4365 	}
4366 }
4367 
4368 static void
4369 nvme_minor_free(nvme_minor_t *minor)
4370 {
4371 	if (minor->nm_minor > 0) {
4372 		ASSERT3S(minor->nm_minor, >=, NVME_OPEN_MINOR_MIN);
4373 		id_free(nvme_open_minors, minor->nm_minor);
4374 		minor->nm_minor = 0;
4375 	}
4376 	VERIFY0(list_link_active(&minor->nm_ctrl_lock.nli_node));
4377 	VERIFY0(list_link_active(&minor->nm_ns_lock.nli_node));
4378 	cv_destroy(&minor->nm_cv);
4379 	kmem_free(minor, sizeof (nvme_minor_t));
4380 }
4381 
4382 static nvme_minor_t *
4383 nvme_minor_find_by_dev(dev_t dev)
4384 {
4385 	id_t id = (id_t)getminor(dev);
4386 	nvme_minor_t search = { .nm_minor = id };
4387 	nvme_minor_t *ret;
4388 
4389 	mutex_enter(&nvme_open_minors_mutex);
4390 	ret = avl_find(&nvme_open_minors_avl, &search, NULL);
4391 	mutex_exit(&nvme_open_minors_mutex);
4392 
4393 	return (ret);
4394 }
4395 
4396 static int
4397 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
4398 {
4399 	nvme_t *nvme;
4400 	int instance;
4401 	int nregs;
4402 	off_t regsize;
4403 	char name[32];
4404 	boolean_t attached_ns;
4405 
4406 	if (cmd != DDI_ATTACH)
4407 		return (DDI_FAILURE);
4408 
4409 	instance = ddi_get_instance(dip);
4410 
4411 	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
4412 		return (DDI_FAILURE);
4413 
4414 	nvme = ddi_get_soft_state(nvme_state, instance);
4415 	ddi_set_driver_private(dip, nvme);
4416 	nvme->n_dip = dip;
4417 
4418 	/*
4419 	 * Map PCI config space
4420 	 */
4421 	if (pci_config_setup(dip, &nvme->n_pcicfg_handle) != DDI_SUCCESS) {
4422 		dev_err(dip, CE_WARN, "!failed to map PCI config space");
4423 		goto fail;
4424 	}
4425 	nvme->n_progress |= NVME_PCI_CONFIG;
4426 
4427 	/*
4428 	 * Get the various PCI IDs from config space
4429 	 */
4430 	nvme->n_vendor_id =
4431 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_VENID);
4432 	nvme->n_device_id =
4433 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_DEVID);
4434 	nvme->n_revision_id =
4435 	    pci_config_get8(nvme->n_pcicfg_handle, PCI_CONF_REVID);
4436 	nvme->n_subsystem_device_id =
4437 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBSYSID);
4438 	nvme->n_subsystem_vendor_id =
4439 	    pci_config_get16(nvme->n_pcicfg_handle, PCI_CONF_SUBVENID);
4440 
4441 	nvme_detect_quirks(nvme);
4442 
4443 	/*
4444 	 * Set up event handlers for hot removal. While npe(4D) supports the hot
4445 	 * removal event being injected for devices, the same is not true of all
4446 	 * of our possible parents (i.e. pci(4D) as of this writing). The most
4447 	 * common case this shows up is in some virtualization environments. We
4448 	 * should treat this as non-fatal so that way devices work but leave
4449 	 * this set up in such a way that if a nexus does grow support for this
4450 	 * we're good to go.
4451 	 */
4452 	if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
4453 	    &nvme->n_rm_cookie) == DDI_SUCCESS) {
4454 		if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
4455 		    nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
4456 		    DDI_SUCCESS) {
4457 			goto fail;
4458 		}
4459 	} else {
4460 		nvme->n_ev_rm_cb_id = NULL;
4461 	}
4462 
4463 	mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL);
4464 	nvme->n_progress |= NVME_MUTEX_INIT;
4465 
4466 	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4467 	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
4468 	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
4469 	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
4470 	    B_TRUE : B_FALSE;
4471 	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4472 	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
4473 	nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4474 	    DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
4475 	/*
4476 	 * Double up the default for completion queues in case of
4477 	 * queue sharing.
4478 	 */
4479 	nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4480 	    DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
4481 	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4482 	    DDI_PROP_DONTPASS, "async-event-limit",
4483 	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
4484 	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4485 	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
4486 	    B_TRUE : B_FALSE;
4487 	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4488 	    DDI_PROP_DONTPASS, "min-phys-block-size",
4489 	    NVME_DEFAULT_MIN_BLOCK_SIZE);
4490 	nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4491 	    DDI_PROP_DONTPASS, "max-submission-queues", -1);
4492 	nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4493 	    DDI_PROP_DONTPASS, "max-completion-queues", -1);
4494 
4495 	if (!ISP2(nvme->n_min_block_size) ||
4496 	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
4497 		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
4498 		    "using default %d", ISP2(nvme->n_min_block_size) ?
4499 		    "too low" : "not a power of 2",
4500 		    NVME_DEFAULT_MIN_BLOCK_SIZE);
4501 		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
4502 	}
4503 
4504 	if (nvme->n_submission_queues != -1 &&
4505 	    (nvme->n_submission_queues < 1 ||
4506 	    nvme->n_submission_queues > UINT16_MAX)) {
4507 		dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
4508 		    "valid. Must be [1..%d]", nvme->n_submission_queues,
4509 		    UINT16_MAX);
4510 		nvme->n_submission_queues = -1;
4511 	}
4512 
4513 	if (nvme->n_completion_queues != -1 &&
4514 	    (nvme->n_completion_queues < 1 ||
4515 	    nvme->n_completion_queues > UINT16_MAX)) {
4516 		dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
4517 		    "valid. Must be [1..%d]", nvme->n_completion_queues,
4518 		    UINT16_MAX);
4519 		nvme->n_completion_queues = -1;
4520 	}
4521 
4522 	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
4523 		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
4524 	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
4525 		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
4526 
4527 	if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
4528 		nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
4529 	if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
4530 		nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
4531 
4532 	if (nvme->n_async_event_limit < 1)
4533 		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
4534 
4535 	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
4536 	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
4537 	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
4538 	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
4539 
4540 	/*
4541 	 * Set up FMA support.
4542 	 */
4543 	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
4544 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
4545 	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
4546 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
4547 
4548 	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
4549 
4550 	if (nvme->n_fm_cap) {
4551 		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
4552 			nvme->n_reg_acc_attr.devacc_attr_access =
4553 			    DDI_FLAGERR_ACC;
4554 
4555 		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
4556 			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
4557 			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
4558 		}
4559 
4560 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
4561 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4562 			pci_ereport_setup(dip);
4563 
4564 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4565 			ddi_fm_handler_register(dip, nvme_fm_errcb,
4566 			    (void *)nvme);
4567 	}
4568 
4569 	nvme->n_progress |= NVME_FMA_INIT;
4570 
4571 	/*
4572 	 * The spec defines several register sets. Only the controller
4573 	 * registers (set 1) are currently used.
4574 	 */
4575 	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
4576 	    nregs < 2 ||
4577 	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
4578 		goto fail;
4579 
4580 	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
4581 	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
4582 		dev_err(dip, CE_WARN, "!failed to map regset 1");
4583 		goto fail;
4584 	}
4585 
4586 	nvme->n_progress |= NVME_REGS_MAPPED;
4587 
4588 	/*
4589 	 * Create PRP DMA cache
4590 	 */
4591 	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
4592 	    ddi_driver_name(dip), ddi_get_instance(dip));
4593 	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
4594 	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
4595 	    NULL, (void *)nvme, NULL, 0);
4596 
4597 	if (nvme_init(nvme) != DDI_SUCCESS)
4598 		goto fail;
4599 
4600 	/*
4601 	 * Initialize the driver with the UFM subsystem
4602 	 */
4603 	if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
4604 	    &nvme->n_ufmh, nvme) != 0) {
4605 		dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
4606 		goto fail;
4607 	}
4608 	mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
4609 	ddi_ufm_update(nvme->n_ufmh);
4610 	nvme->n_progress |= NVME_UFM_INIT;
4611 
4612 	mutex_init(&nvme->n_mgmt_mutex, NULL, MUTEX_DRIVER, NULL);
4613 	nvme_lock_init(&nvme->n_lock);
4614 	nvme->n_progress |= NVME_MGMT_INIT;
4615 	nvme->n_dead_status = NVME_IOCTL_E_CTRL_DEAD;
4616 
4617 
4618 	/*
4619 	 * Identify namespaces.
4620 	 */
4621 	mutex_enter(&nvme->n_mgmt_mutex);
4622 
4623 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4624 		nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
4625 
4626 		nvme_lock_init(&ns->ns_lock);
4627 		ns->ns_progress |= NVME_NS_LOCK;
4628 
4629 		/*
4630 		 * Namespaces start out ignored. When nvme_init_ns() checks
4631 		 * their properties and finds they can be used, it will set
4632 		 * ns_ignore to B_FALSE. It will also use this state change
4633 		 * to keep an accurate count of attachable namespaces.
4634 		 */
4635 		ns->ns_ignore = B_TRUE;
4636 		if (nvme_init_ns(nvme, i) != 0) {
4637 			mutex_exit(&nvme->n_mgmt_mutex);
4638 			goto fail;
4639 		}
4640 
4641 		if (ddi_create_minor_node(nvme->n_dip, ns->ns_name, S_IFCHR,
4642 		    NVME_MINOR(ddi_get_instance(nvme->n_dip), i),
4643 		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
4644 			mutex_exit(&nvme->n_mgmt_mutex);
4645 			dev_err(dip, CE_WARN,
4646 			    "!failed to create minor node for namespace %d", i);
4647 			goto fail;
4648 		}
4649 	}
4650 
4651 	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
4652 	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
4653 	    != DDI_SUCCESS) {
4654 		mutex_exit(&nvme->n_mgmt_mutex);
4655 		dev_err(dip, CE_WARN, "nvme_attach: "
4656 		    "cannot create devctl minor node");
4657 		goto fail;
4658 	}
4659 
4660 	attached_ns = B_FALSE;
4661 	for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4662 		nvme_ioctl_common_t com = { .nioc_nsid = i };
4663 
4664 		if (nvme_attach_ns(nvme, &com)) {
4665 			attached_ns = B_TRUE;
4666 		} else if (com.nioc_drv_err != NVME_IOCTL_E_UNSUP_ATTACH_NS) {
4667 			dev_err(nvme->n_dip, CE_WARN, "!failed to attach "
4668 			    "namespace %d due to blkdev error", i);
4669 			/*
4670 			 * Once we have successfully attached a namespace we
4671 			 * can no longer fail the driver attach as there is now
4672 			 * a blkdev child node linked to this device, and
4673 			 * our node is not yet in the attached state.
4674 			 */
4675 			if (!attached_ns) {
4676 				mutex_exit(&nvme->n_mgmt_mutex);
4677 				goto fail;
4678 			}
4679 		}
4680 	}
4681 
4682 	mutex_exit(&nvme->n_mgmt_mutex);
4683 
4684 	return (DDI_SUCCESS);
4685 
4686 fail:
4687 	/* attach successful anyway so that FMA can retire the device */
4688 	if (nvme->n_dead)
4689 		return (DDI_SUCCESS);
4690 
4691 	(void) nvme_detach(dip, DDI_DETACH);
4692 
4693 	return (DDI_FAILURE);
4694 }
4695 
4696 static int
4697 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4698 {
4699 	int instance;
4700 	nvme_t *nvme;
4701 
4702 	if (cmd != DDI_DETACH)
4703 		return (DDI_FAILURE);
4704 
4705 	instance = ddi_get_instance(dip);
4706 
4707 	nvme = ddi_get_soft_state(nvme_state, instance);
4708 
4709 	if (nvme == NULL)
4710 		return (DDI_FAILURE);
4711 
4712 	/*
4713 	 * Remove all minor nodes from the device regardless of the source in
4714 	 * one swoop.
4715 	 */
4716 	ddi_remove_minor_node(dip, NULL);
4717 
4718 	/*
4719 	 * We need to remove the event handler as one of the first things that
4720 	 * we do. If we proceed with other teardown without removing the event
4721 	 * handler, we could end up in a very unfortunate race with ourselves.
4722 	 * The DDI does not serialize these with detach (just like timeout(9F)
4723 	 * and others).
4724 	 */
4725 	if (nvme->n_ev_rm_cb_id != NULL) {
4726 		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
4727 	}
4728 	nvme->n_ev_rm_cb_id = NULL;
4729 
4730 	/*
4731 	 * If the controller was marked dead, there is a slight chance that we
4732 	 * are asynchronusly processing the removal taskq. Because we have
4733 	 * removed the callback handler above and all minor nodes and commands
4734 	 * are closed, there is no other way to get in here. As such, we wait on
4735 	 * the nvme_dead_taskq to complete so we can avoid tracking if it's
4736 	 * running or not.
4737 	 */
4738 	taskq_wait(nvme_dead_taskq);
4739 
4740 	if (nvme->n_ns) {
4741 		for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
4742 			nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
4743 
4744 			if (ns->ns_bd_hdl) {
4745 				(void) bd_detach_handle(ns->ns_bd_hdl);
4746 				bd_free_handle(ns->ns_bd_hdl);
4747 			}
4748 
4749 			if (ns->ns_idns)
4750 				kmem_free(ns->ns_idns,
4751 				    sizeof (nvme_identify_nsid_t));
4752 			if (ns->ns_devid)
4753 				strfree(ns->ns_devid);
4754 
4755 			if ((ns->ns_progress & NVME_NS_LOCK) != 0)
4756 				nvme_lock_fini(&ns->ns_lock);
4757 		}
4758 
4759 		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
4760 		    nvme->n_namespace_count);
4761 	}
4762 
4763 	if (nvme->n_progress & NVME_MGMT_INIT) {
4764 		nvme_lock_fini(&nvme->n_lock);
4765 		mutex_destroy(&nvme->n_mgmt_mutex);
4766 	}
4767 
4768 	if (nvme->n_progress & NVME_UFM_INIT) {
4769 		ddi_ufm_fini(nvme->n_ufmh);
4770 		mutex_destroy(&nvme->n_fwslot_mutex);
4771 	}
4772 
4773 	if (nvme->n_progress & NVME_INTERRUPTS)
4774 		nvme_release_interrupts(nvme);
4775 
4776 	for (uint_t i = 0; i < nvme->n_cq_count; i++) {
4777 		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
4778 			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
4779 	}
4780 
4781 	if (nvme->n_progress & NVME_MUTEX_INIT) {
4782 		mutex_destroy(&nvme->n_minor_mutex);
4783 	}
4784 
4785 	if (nvme->n_ioq_count > 0) {
4786 		for (uint_t i = 1; i != nvme->n_ioq_count + 1; i++) {
4787 			if (nvme->n_ioq[i] != NULL) {
4788 				/* TODO: send destroy queue commands */
4789 				nvme_free_qpair(nvme->n_ioq[i]);
4790 			}
4791 		}
4792 
4793 		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
4794 		    (nvme->n_ioq_count + 1));
4795 	}
4796 
4797 	if (nvme->n_prp_cache != NULL) {
4798 		kmem_cache_destroy(nvme->n_prp_cache);
4799 	}
4800 
4801 	if (nvme->n_progress & NVME_REGS_MAPPED) {
4802 		nvme_shutdown(nvme, B_FALSE);
4803 		(void) nvme_reset(nvme, B_FALSE);
4804 	}
4805 
4806 	if (nvme->n_progress & NVME_CTRL_LIMITS)
4807 		sema_destroy(&nvme->n_abort_sema);
4808 
4809 	if (nvme->n_progress & NVME_ADMIN_QUEUE)
4810 		nvme_free_qpair(nvme->n_adminq);
4811 
4812 	if (nvme->n_cq_count > 0) {
4813 		nvme_destroy_cq_array(nvme, 0);
4814 		nvme->n_cq = NULL;
4815 		nvme->n_cq_count = 0;
4816 	}
4817 
4818 	if (nvme->n_idcomns)
4819 		kmem_free(nvme->n_idcomns, NVME_IDENTIFY_BUFSIZE);
4820 
4821 	if (nvme->n_idctl)
4822 		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
4823 
4824 	if (nvme->n_progress & NVME_REGS_MAPPED)
4825 		ddi_regs_map_free(&nvme->n_regh);
4826 
4827 	if (nvme->n_progress & NVME_FMA_INIT) {
4828 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4829 			ddi_fm_handler_unregister(nvme->n_dip);
4830 
4831 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
4832 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
4833 			pci_ereport_teardown(nvme->n_dip);
4834 
4835 		ddi_fm_fini(nvme->n_dip);
4836 	}
4837 
4838 	if (nvme->n_progress & NVME_PCI_CONFIG)
4839 		pci_config_teardown(&nvme->n_pcicfg_handle);
4840 
4841 	if (nvme->n_vendor != NULL)
4842 		strfree(nvme->n_vendor);
4843 
4844 	if (nvme->n_product != NULL)
4845 		strfree(nvme->n_product);
4846 
4847 	ddi_soft_state_free(nvme_state, instance);
4848 
4849 	return (DDI_SUCCESS);
4850 }
4851 
4852 static int
4853 nvme_quiesce(dev_info_t *dip)
4854 {
4855 	int instance;
4856 	nvme_t *nvme;
4857 
4858 	instance = ddi_get_instance(dip);
4859 
4860 	nvme = ddi_get_soft_state(nvme_state, instance);
4861 
4862 	if (nvme == NULL)
4863 		return (DDI_FAILURE);
4864 
4865 	nvme_shutdown(nvme, B_TRUE);
4866 
4867 	(void) nvme_reset(nvme, B_TRUE);
4868 
4869 	return (DDI_SUCCESS);
4870 }
4871 
4872 static int
4873 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
4874 {
4875 	nvme_t *nvme = cmd->nc_nvme;
4876 	uint_t nprp_per_page, nprp;
4877 	uint64_t *prp;
4878 	const ddi_dma_cookie_t *cookie;
4879 	uint_t idx;
4880 	uint_t ncookies = ddi_dma_ncookies(dma);
4881 
4882 	if (ncookies == 0)
4883 		return (DDI_FAILURE);
4884 
4885 	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
4886 		return (DDI_FAILURE);
4887 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;
4888 
4889 	if (ncookies == 1) {
4890 		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
4891 		return (DDI_SUCCESS);
4892 	} else if (ncookies == 2) {
4893 		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
4894 			return (DDI_FAILURE);
4895 		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
4896 		return (DDI_SUCCESS);
4897 	}
4898 
4899 	/*
4900 	 * At this point, we're always operating on cookies at
4901 	 * index >= 1 and writing the addresses of those cookies
4902 	 * into a new page. The address of that page is stored
4903 	 * as the second PRP entry.
4904 	 */
4905 	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
4906 	ASSERT(nprp_per_page > 0);
4907 
4908 	/*
4909 	 * We currently don't support chained PRPs and set up our DMA
4910 	 * attributes to reflect that. If we still get an I/O request
4911 	 * that needs a chained PRP something is very wrong. Account
4912 	 * for the first cookie here, which we've placed in d_prp[0].
4913 	 */
4914 	nprp = howmany(ncookies - 1, nprp_per_page);
4915 	VERIFY(nprp == 1);
4916 
4917 	/*
4918 	 * Allocate a page of pointers, in which we'll write the
4919 	 * addresses of cookies 1 to `ncookies`.
4920 	 */
4921 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
4922 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
4923 	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;
4924 
4925 	prp = (uint64_t *)cmd->nc_prp->nd_memp;
4926 	for (idx = 1; idx < ncookies; idx++) {
4927 		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
4928 			return (DDI_FAILURE);
4929 		*prp++ = cookie->dmac_laddress;
4930 	}
4931 
4932 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
4933 	    DDI_DMA_SYNC_FORDEV);
4934 	return (DDI_SUCCESS);
4935 }
4936 
4937 /*
4938  * The maximum number of requests supported for a deallocate request is
4939  * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
4940  * unchanged through at least 1.4a). The definition of nvme_range_t is also
4941  * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
4942  * a deallocate request will fit into the smallest supported namespace page
4943  * (4k).
4944  */
4945 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);
4946 
4947 static int
4948 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
4949     int allocflag)
4950 {
4951 	const dkioc_free_list_t *dfl = xfer->x_dfl;
4952 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
4953 	nvme_t *nvme = cmd->nc_nvme;
4954 	nvme_range_t *ranges = NULL;
4955 	uint_t i;
4956 
4957 	/*
4958 	 * The number of ranges in the request is 0s based (that is
4959 	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
4960 	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
4961 	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
4962 	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
4963 	 * in blkdev.
4964 	 */
4965 	VERIFY3U(dfl->dfl_num_exts, >, 0);
4966 	VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
4967 	cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
4968 
4969 	cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
4970 
4971 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
4972 	if (cmd->nc_prp == NULL)
4973 		return (DDI_FAILURE);
4974 
4975 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
4976 	ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;
4977 
4978 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
4979 	cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
4980 
4981 	for (i = 0; i < dfl->dfl_num_exts; i++) {
4982 		uint64_t lba, len;
4983 
4984 		lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
4985 		len = exts[i].dfle_length / blocksize;
4986 
4987 		VERIFY3U(len, <=, UINT32_MAX);
4988 
4989 		/* No context attributes for a deallocate request */
4990 		ranges[i].nr_ctxattr = 0;
4991 		ranges[i].nr_len = len;
4992 		ranges[i].nr_lba = lba;
4993 	}
4994 
4995 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
4996 	    DDI_DMA_SYNC_FORDEV);
4997 
4998 	return (DDI_SUCCESS);
4999 }
5000 
5001 static nvme_cmd_t *
5002 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
5003 {
5004 	nvme_t *nvme = ns->ns_nvme;
5005 	nvme_cmd_t *cmd;
5006 	int allocflag;
5007 
5008 	/*
5009 	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
5010 	 */
5011 	allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
5012 	cmd = nvme_alloc_cmd(nvme, allocflag);
5013 
5014 	if (cmd == NULL)
5015 		return (NULL);
5016 
5017 	cmd->nc_sqe.sqe_opc = opc;
5018 	cmd->nc_callback = nvme_bd_xfer_done;
5019 	cmd->nc_xfer = xfer;
5020 
5021 	switch (opc) {
5022 	case NVME_OPC_NVM_WRITE:
5023 	case NVME_OPC_NVM_READ:
5024 		VERIFY(xfer->x_nblks <= 0x10000);
5025 
5026 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5027 
5028 		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
5029 		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
5030 		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
5031 
5032 		if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
5033 			goto fail;
5034 		break;
5035 
5036 	case NVME_OPC_NVM_FLUSH:
5037 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5038 		break;
5039 
5040 	case NVME_OPC_NVM_DSET_MGMT:
5041 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
5042 
5043 		if (nvme_fill_ranges(cmd, xfer,
5044 		    (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
5045 			goto fail;
5046 		break;
5047 
5048 	default:
5049 		goto fail;
5050 	}
5051 
5052 	return (cmd);
5053 
5054 fail:
5055 	nvme_free_cmd(cmd);
5056 	return (NULL);
5057 }
5058 
5059 static void
5060 nvme_bd_xfer_done(void *arg)
5061 {
5062 	nvme_cmd_t *cmd = arg;
5063 	bd_xfer_t *xfer = cmd->nc_xfer;
5064 	int error = 0;
5065 
5066 	error = nvme_check_cmd_status(cmd);
5067 	nvme_free_cmd(cmd);
5068 
5069 	bd_xfer_done(xfer, error);
5070 }
5071 
5072 static void
5073 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
5074 {
5075 	nvme_namespace_t *ns = arg;
5076 	nvme_t *nvme = ns->ns_nvme;
5077 	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
5078 	boolean_t mutex_exit_needed = B_TRUE;
5079 
5080 	/*
5081 	 * nvme_bd_driveinfo is called by blkdev in two situations:
5082 	 * - during bd_attach_handle(), which we call with the mutex held
5083 	 * - during bd_attach(), which may be called with or without the
5084 	 *   mutex held
5085 	 */
5086 	if (mutex_owned(&nvme->n_mgmt_mutex))
5087 		mutex_exit_needed = B_FALSE;
5088 	else
5089 		mutex_enter(&nvme->n_mgmt_mutex);
5090 
5091 	/*
5092 	 * Set the blkdev qcount to the number of submission queues.
5093 	 * It will then create one waitq/runq pair for each submission
5094 	 * queue and spread I/O requests across the queues.
5095 	 */
5096 	drive->d_qcount = nvme->n_ioq_count;
5097 
5098 	/*
5099 	 * I/O activity to individual namespaces is distributed across
5100 	 * each of the d_qcount blkdev queues (which has been set to
5101 	 * the number of nvme submission queues). d_qsize is the number
5102 	 * of submitted and not completed I/Os within each queue that blkdev
5103 	 * will allow before it starts holding them in the waitq.
5104 	 *
5105 	 * Each namespace will create a child blkdev instance, for each one
5106 	 * we try and set the d_qsize so that each namespace gets an
5107 	 * equal portion of the submission queue.
5108 	 *
5109 	 * If post instantiation of the nvme drive, n_namespaces_attachable
5110 	 * changes and a namespace is attached it could calculate a
5111 	 * different d_qsize. It may even be that the sum of the d_qsizes is
5112 	 * now beyond the submission queue size. Should that be the case
5113 	 * and the I/O rate is such that blkdev attempts to submit more
5114 	 * I/Os than the size of the submission queue, the excess I/Os
5115 	 * will be held behind the semaphore nq_sema.
5116 	 */
5117 	drive->d_qsize = nvme->n_io_squeue_len / ns_count;
5118 
5119 	/*
5120 	 * Don't let the queue size drop below the minimum, though.
5121 	 */
5122 	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
5123 
5124 	/*
5125 	 * d_maxxfer is not set, which means the value is taken from the DMA
5126 	 * attributes specified to bd_alloc_handle.
5127 	 */
5128 
5129 	drive->d_removable = B_FALSE;
5130 	drive->d_hotpluggable = B_FALSE;
5131 
5132 	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
5133 	drive->d_target = ns->ns_id;
5134 	drive->d_lun = 0;
5135 
5136 	drive->d_model = nvme->n_idctl->id_model;
5137 	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
5138 	drive->d_vendor = nvme->n_vendor;
5139 	drive->d_vendor_len = strlen(nvme->n_vendor);
5140 	drive->d_product = nvme->n_product;
5141 	drive->d_product_len = strlen(nvme->n_product);
5142 	drive->d_serial = nvme->n_idctl->id_serial;
5143 	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
5144 	drive->d_revision = nvme->n_idctl->id_fwrev;
5145 	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
5146 
5147 	/*
5148 	 * If we support the dataset management command, the only restrictions
5149 	 * on a discard request are the maximum number of ranges (segments)
5150 	 * per single request.
5151 	 */
5152 	if (nvme->n_idctl->id_oncs.on_dset_mgmt)
5153 		drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
5154 
5155 	if (mutex_exit_needed)
5156 		mutex_exit(&nvme->n_mgmt_mutex);
5157 }
5158 
5159 static int
5160 nvme_bd_mediainfo(void *arg, bd_media_t *media)
5161 {
5162 	nvme_namespace_t *ns = arg;
5163 	nvme_t *nvme = ns->ns_nvme;
5164 	boolean_t mutex_exit_needed = B_TRUE;
5165 
5166 	if (nvme->n_dead) {
5167 		return (EIO);
5168 	}
5169 
5170 	/*
5171 	 * nvme_bd_mediainfo is called by blkdev in various situations,
5172 	 * most of them out of our control. There's one exception though:
5173 	 * When we call bd_state_change() in response to "namespace change"
5174 	 * notification, where the mutex is already being held by us.
5175 	 */
5176 	if (mutex_owned(&nvme->n_mgmt_mutex))
5177 		mutex_exit_needed = B_FALSE;
5178 	else
5179 		mutex_enter(&nvme->n_mgmt_mutex);
5180 
5181 	media->m_nblks = ns->ns_block_count;
5182 	media->m_blksize = ns->ns_block_size;
5183 	media->m_readonly = B_FALSE;
5184 	media->m_solidstate = B_TRUE;
5185 
5186 	media->m_pblksize = ns->ns_best_block_size;
5187 
5188 	if (mutex_exit_needed)
5189 		mutex_exit(&nvme->n_mgmt_mutex);
5190 
5191 	return (0);
5192 }
5193 
5194 static int
5195 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
5196 {
5197 	nvme_t *nvme = ns->ns_nvme;
5198 	nvme_cmd_t *cmd;
5199 	nvme_qpair_t *ioq;
5200 	boolean_t poll;
5201 	int ret;
5202 
5203 	if (nvme->n_dead) {
5204 		return (EIO);
5205 	}
5206 
5207 	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
5208 	if (cmd == NULL)
5209 		return (ENOMEM);
5210 
5211 	cmd->nc_sqid = xfer->x_qnum + 1;
5212 	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
5213 	ioq = nvme->n_ioq[cmd->nc_sqid];
5214 
5215 	/*
5216 	 * Get the polling flag before submitting the command. The command may
5217 	 * complete immediately after it was submitted, which means we must
5218 	 * treat both cmd and xfer as if they have been freed already.
5219 	 */
5220 	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
5221 
5222 	ret = nvme_submit_io_cmd(ioq, cmd);
5223 
5224 	if (ret != 0)
5225 		return (ret);
5226 
5227 	if (!poll)
5228 		return (0);
5229 
5230 	do {
5231 		cmd = nvme_retrieve_cmd(nvme, ioq);
5232 		if (cmd != NULL)
5233 			cmd->nc_callback(cmd);
5234 		else
5235 			drv_usecwait(10);
5236 	} while (ioq->nq_active_cmds != 0);
5237 
5238 	return (0);
5239 }
5240 
5241 static int
5242 nvme_bd_read(void *arg, bd_xfer_t *xfer)
5243 {
5244 	nvme_namespace_t *ns = arg;
5245 
5246 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
5247 }
5248 
5249 static int
5250 nvme_bd_write(void *arg, bd_xfer_t *xfer)
5251 {
5252 	nvme_namespace_t *ns = arg;
5253 
5254 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
5255 }
5256 
5257 static int
5258 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
5259 {
5260 	nvme_namespace_t *ns = arg;
5261 
5262 	if (ns->ns_nvme->n_dead)
5263 		return (EIO);
5264 
5265 	/*
5266 	 * If the volatile write cache is not present or not enabled the FLUSH
5267 	 * command is a no-op, so we can take a shortcut here.
5268 	 */
5269 	if (!ns->ns_nvme->n_write_cache_present) {
5270 		bd_xfer_done(xfer, ENOTSUP);
5271 		return (0);
5272 	}
5273 
5274 	if (!ns->ns_nvme->n_write_cache_enabled) {
5275 		bd_xfer_done(xfer, 0);
5276 		return (0);
5277 	}
5278 
5279 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
5280 }
5281 
5282 static int
5283 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
5284 {
5285 	nvme_namespace_t *ns = arg;
5286 	nvme_t *nvme = ns->ns_nvme;
5287 
5288 	if (nvme->n_dead) {
5289 		return (EIO);
5290 	}
5291 
5292 	if (*(uint64_t *)ns->ns_nguid != 0 ||
5293 	    *(uint64_t *)(ns->ns_nguid + 8) != 0) {
5294 		return (ddi_devid_init(devinfo, DEVID_NVME_NGUID,
5295 		    sizeof (ns->ns_nguid), ns->ns_nguid, devid));
5296 	} else if (*(uint64_t *)ns->ns_eui64 != 0) {
5297 		return (ddi_devid_init(devinfo, DEVID_NVME_EUI64,
5298 		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
5299 	} else {
5300 		return (ddi_devid_init(devinfo, DEVID_NVME_NSID,
5301 		    strlen(ns->ns_devid), ns->ns_devid, devid));
5302 	}
5303 }
5304 
5305 static int
5306 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
5307 {
5308 	nvme_namespace_t *ns = arg;
5309 
5310 	if (xfer->x_dfl == NULL)
5311 		return (EINVAL);
5312 
5313 	if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
5314 		return (ENOTSUP);
5315 
5316 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
5317 }
5318 
5319 static int
5320 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
5321 {
5322 #ifndef __lock_lint
5323 	_NOTE(ARGUNUSED(cred_p));
5324 #endif
5325 	nvme_t *nvme;
5326 	nvme_minor_t *minor = NULL;
5327 	uint32_t nsid;
5328 	minor_t m = getminor(*devp);
5329 	int rv = 0;
5330 
5331 	if (otyp != OTYP_CHR)
5332 		return (EINVAL);
5333 
5334 	if (m >= NVME_OPEN_MINOR_MIN)
5335 		return (ENXIO);
5336 
5337 	nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(m));
5338 	nsid = NVME_MINOR_NSID(m);
5339 
5340 	if (nvme == NULL)
5341 		return (ENXIO);
5342 
5343 	if (nsid > nvme->n_namespace_count)
5344 		return (ENXIO);
5345 
5346 	if (nvme->n_dead)
5347 		return (EIO);
5348 
5349 	/*
5350 	 * At this point, we're going to allow an open to proceed on this
5351 	 * device. We need to allocate a new instance for this (presuming one is
5352 	 * available).
5353 	 */
5354 	minor = kmem_zalloc(sizeof (nvme_minor_t), KM_NOSLEEP_LAZY);
5355 	if (minor == NULL) {
5356 		return (ENOMEM);
5357 	}
5358 
5359 	cv_init(&minor->nm_cv, NULL, CV_DRIVER, NULL);
5360 	list_link_init(&minor->nm_ctrl_lock.nli_node);
5361 	minor->nm_ctrl_lock.nli_nvme = nvme;
5362 	minor->nm_ctrl_lock.nli_minor = minor;
5363 	list_link_init(&minor->nm_ns_lock.nli_node);
5364 	minor->nm_ns_lock.nli_nvme = nvme;
5365 	minor->nm_ns_lock.nli_minor = minor;
5366 	minor->nm_minor = id_alloc_nosleep(nvme_open_minors);
5367 	if (minor->nm_minor == -1) {
5368 		nvme_minor_free(minor);
5369 		return (ENOSPC);
5370 	}
5371 
5372 	minor->nm_ctrl = nvme;
5373 	if (nsid != 0) {
5374 		minor->nm_ns = nvme_nsid2ns(nvme, nsid);
5375 	}
5376 
5377 	/*
5378 	 * Before we check for exclusive access and attempt a lock if requested,
5379 	 * ensure that this minor is persisted.
5380 	 */
5381 	mutex_enter(&nvme_open_minors_mutex);
5382 	avl_add(&nvme_open_minors_avl, minor);
5383 	mutex_exit(&nvme_open_minors_mutex);
5384 
5385 	/*
5386 	 * A request for opening this FEXCL, is translated into a non-blocking
5387 	 * write lock of the appropriate entity. This honors the original
5388 	 * semantics here. In the future, we should see if we can remove this
5389 	 * and turn a request for FEXCL at open into ENOTSUP.
5390 	 */
5391 	mutex_enter(&nvme->n_minor_mutex);
5392 	if ((flag & FEXCL) != 0) {
5393 		nvme_ioctl_lock_t lock = {
5394 			.nil_level = NVME_LOCK_L_WRITE,
5395 			.nil_flags = NVME_LOCK_F_DONT_BLOCK
5396 		};
5397 
5398 		if (minor->nm_ns != NULL) {
5399 			lock.nil_ent = NVME_LOCK_E_NS;
5400 			lock.nil_common.nioc_nsid = nsid;
5401 		} else {
5402 			lock.nil_ent = NVME_LOCK_E_CTRL;
5403 		}
5404 		nvme_rwlock(minor, &lock);
5405 		if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
5406 			mutex_exit(&nvme->n_minor_mutex);
5407 
5408 			mutex_enter(&nvme_open_minors_mutex);
5409 			avl_remove(&nvme_open_minors_avl, minor);
5410 			mutex_exit(&nvme_open_minors_mutex);
5411 
5412 			nvme_minor_free(minor);
5413 			return (EBUSY);
5414 		}
5415 	}
5416 	mutex_exit(&nvme->n_minor_mutex);
5417 
5418 	*devp = makedevice(getmajor(*devp), (minor_t)minor->nm_minor);
5419 	return (rv);
5420 
5421 }
5422 
5423 static int
5424 nvme_close(dev_t dev, int flag __unused, int otyp, cred_t *cred_p __unused)
5425 {
5426 	nvme_minor_t *minor;
5427 	nvme_t *nvme;
5428 
5429 	if (otyp != OTYP_CHR) {
5430 		return (ENXIO);
5431 	}
5432 
5433 	minor = nvme_minor_find_by_dev(dev);
5434 	if (minor == NULL) {
5435 		return (ENXIO);
5436 	}
5437 
5438 	mutex_enter(&nvme_open_minors_mutex);
5439 	avl_remove(&nvme_open_minors_avl, minor);
5440 	mutex_exit(&nvme_open_minors_mutex);
5441 
5442 	/*
5443 	 * When this device is being closed, we must ensure that any locks held
5444 	 * by this are dealt with.
5445 	 */
5446 	nvme = minor->nm_ctrl;
5447 	mutex_enter(&nvme->n_minor_mutex);
5448 	ASSERT3U(minor->nm_ctrl_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5449 	ASSERT3U(minor->nm_ns_lock.nli_state, !=, NVME_LOCK_STATE_BLOCKED);
5450 
5451 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5452 		VERIFY3P(minor->nm_ctrl_lock.nli_lock, !=, NULL);
5453 		nvme_rwunlock(&minor->nm_ctrl_lock,
5454 		    minor->nm_ctrl_lock.nli_lock);
5455 	}
5456 
5457 	if (minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) {
5458 		VERIFY3P(minor->nm_ns_lock.nli_lock, !=, NULL);
5459 		nvme_rwunlock(&minor->nm_ns_lock, minor->nm_ns_lock.nli_lock);
5460 	}
5461 	mutex_exit(&nvme->n_minor_mutex);
5462 
5463 	nvme_minor_free(minor);
5464 
5465 	return (0);
5466 }
5467 
5468 void
5469 nvme_ioctl_success(nvme_ioctl_common_t *ioc)
5470 {
5471 	ioc->nioc_drv_err = NVME_IOCTL_E_OK;
5472 	ioc->nioc_ctrl_sc = NVME_CQE_SC_GEN_SUCCESS;
5473 	ioc->nioc_ctrl_sct = NVME_CQE_SCT_GENERIC;
5474 }
5475 
5476 boolean_t
5477 nvme_ioctl_error(nvme_ioctl_common_t *ioc, nvme_ioctl_errno_t err, uint32_t sct,
5478     uint32_t sc)
5479 {
5480 	ioc->nioc_drv_err = err;
5481 	ioc->nioc_ctrl_sct = sct;
5482 	ioc->nioc_ctrl_sc = sc;
5483 
5484 	return (B_FALSE);
5485 }
5486 
5487 static int
5488 nvme_ioctl_copyout_error(nvme_ioctl_errno_t err, intptr_t uaddr, int mode)
5489 {
5490 	nvme_ioctl_common_t ioc;
5491 
5492 	ASSERT3U(err, !=, NVME_IOCTL_E_CTRL_ERROR);
5493 	bzero(&ioc, sizeof (ioc));
5494 	if (ddi_copyout(&ioc, (void *)uaddr, sizeof (nvme_ioctl_common_t),
5495 	    mode & FKIOCTL) != 0) {
5496 		return (EFAULT);
5497 	}
5498 	return (0);
5499 }
5500 
5501 
5502 /*
5503  * The companion to the namespace checking. This occurs after any rewriting
5504  * occurs. This is the primary point that we attempt to enforce any operation's
5505  * exclusivity. Note, it is theoretically possible for an operation to be
5506  * ongoing and to have someone with an exclusive lock ask to unlock it for some
5507  * reason. This does not maintain the number of such events that are going on.
5508  * While perhaps this is leaving too much up to the user, by the same token we
5509  * don't try to stop them from issuing two different format NVM commands
5510  * targeting the whole device at the same time either, even though the
5511  * controller would really rather that didn't happen.
5512  */
5513 static boolean_t
5514 nvme_ioctl_excl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5515     const nvme_ioctl_check_t *check)
5516 {
5517 	nvme_t *const nvme = minor->nm_ctrl;
5518 	nvme_namespace_t *ns;
5519 	boolean_t have_ctrl, have_ns, ctrl_is_excl, ns_is_excl;
5520 
5521 	/*
5522 	 * If the command doesn't require anything, then we're done.
5523 	 */
5524 	if (check->nck_excl == NVME_IOCTL_EXCL_SKIP) {
5525 		return (B_TRUE);
5526 	}
5527 
5528 	if (ioc->nioc_nsid == 0 || ioc->nioc_nsid == NVME_NSID_BCAST) {
5529 		ns = NULL;
5530 	} else {
5531 		ns = nvme_nsid2ns(nvme, ioc->nioc_nsid);
5532 	}
5533 
5534 	mutex_enter(&nvme->n_minor_mutex);
5535 	ctrl_is_excl = nvme->n_lock.nl_writer != NULL;
5536 	have_ctrl = nvme->n_lock.nl_writer == &minor->nm_ctrl_lock;
5537 	if (ns != NULL) {
5538 		/*
5539 		 * We explicitly test the namespace lock's writer versus asking
5540 		 * the minor because the minor's namespace lock may apply to a
5541 		 * different namespace.
5542 		 */
5543 		ns_is_excl = ns->ns_lock.nl_writer != NULL;
5544 		have_ns = ns->ns_lock.nl_writer == &minor->nm_ns_lock;
5545 		ASSERT0(have_ctrl && have_ns);
5546 #ifdef	DEBUG
5547 		if (have_ns) {
5548 			ASSERT3P(minor->nm_ns_lock.nli_ns, ==, ns);
5549 		}
5550 #endif
5551 	} else {
5552 		ns_is_excl = B_FALSE;
5553 		have_ns = B_FALSE;
5554 	}
5555 	ASSERT0(ctrl_is_excl && ns_is_excl);
5556 	mutex_exit(&nvme->n_minor_mutex);
5557 
5558 	if (check->nck_excl == NVME_IOCTL_EXCL_WRITE) {
5559 		if (ns == NULL) {
5560 			if (have_ctrl) {
5561 				return (B_TRUE);
5562 			}
5563 			return (nvme_ioctl_error(ioc,
5564 			    NVME_IOCTL_E_NEED_CTRL_WRLOCK, 0, 0));
5565 		} else {
5566 			if (have_ctrl || have_ns) {
5567 				return (B_TRUE);
5568 			}
5569 			return (nvme_ioctl_error(ioc,
5570 			    NVME_IOCTL_E_NEED_NS_WRLOCK, 0, 0));
5571 		}
5572 	}
5573 
5574 	/*
5575 	 * Now we have an operation that does not require exclusive access. We
5576 	 * can proceed as long as no one else has it or if someone does it is
5577 	 * us. Regardless of what we target, a controller lock will stop us.
5578 	 */
5579 	if (ctrl_is_excl && !have_ctrl) {
5580 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_CTRL_LOCKED, 0, 0));
5581 	}
5582 
5583 	/*
5584 	 * Only check namespace exclusivity if we are targeting one.
5585 	 */
5586 	if (ns != NULL && ns_is_excl && !have_ns) {
5587 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_LOCKED, 0, 0));
5588 	}
5589 
5590 	return (B_TRUE);
5591 }
5592 
5593 /*
5594  * Perform common checking as to whether or not an ioctl operation may proceed.
5595  * We check in this function various aspects of the namespace attributes that
5596  * it's calling on. Once the namespace attributes and any possible rewriting
5597  * have been performed, then we proceed to check whether or not the requisite
5598  * exclusive access is present in nvme_ioctl_excl_check().
5599  */
5600 static boolean_t
5601 nvme_ioctl_check(nvme_minor_t *minor, nvme_ioctl_common_t *ioc,
5602     const nvme_ioctl_check_t *check)
5603 {
5604 	/*
5605 	 * If the minor has a namespace pointer, then it is constrained to that
5606 	 * namespace. If a namespace is allowed, then there are only two valid
5607 	 * values that we can find. The first is matching the minor. The second
5608 	 * is our value zero, which will be transformed to the current
5609 	 * namespace.
5610 	 */
5611 	if (minor->nm_ns != NULL) {
5612 		if (!check->nck_ns_ok || !check->nck_ns_minor_ok) {
5613 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NOT_CTRL, 0,
5614 			    0));
5615 		}
5616 
5617 		if (ioc->nioc_nsid == 0) {
5618 			ioc->nioc_nsid = minor->nm_ns->ns_id;
5619 		} else if (ioc->nioc_nsid != minor->nm_ns->ns_id) {
5620 			return (nvme_ioctl_error(ioc,
5621 			    NVME_IOCTL_E_MINOR_WRONG_NS, 0, 0));
5622 		}
5623 
5624 		return (nvme_ioctl_excl_check(minor, ioc, check));
5625 	}
5626 
5627 	/*
5628 	 * If we've been told to skip checking the controller, here's where we
5629 	 * do that. This should really only be for commands which use the
5630 	 * namespace ID for listing purposes and therefore can have
5631 	 * traditionally illegal values here.
5632 	 */
5633 	if (check->nck_skip_ctrl) {
5634 		return (nvme_ioctl_excl_check(minor, ioc, check));
5635 	}
5636 
5637 	/*
5638 	 * At this point, we know that we're on the controller's node. We first
5639 	 * deal with the simple case, is a namespace allowed at all or not. If
5640 	 * it is not allowed, then the only acceptable value is zero.
5641 	 */
5642 	if (!check->nck_ns_ok) {
5643 		if (ioc->nioc_nsid != 0) {
5644 			return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_UNUSE, 0,
5645 			    0));
5646 		}
5647 
5648 		return (nvme_ioctl_excl_check(minor, ioc, check));
5649 	}
5650 
5651 	/*
5652 	 * At this point, we know that a controller is allowed to use a
5653 	 * namespace. If we haven't been given zero or the broadcast namespace,
5654 	 * check to see if it's actually a valid namespace ID. If is outside of
5655 	 * range, then it is an error. Next, if we have been requested to
5656 	 * rewrite 0 (the this controller indicator) as the broadcast namespace,
5657 	 * do so.
5658 	 *
5659 	 * While we validate that this namespace is within the valid range, we
5660 	 * do not check if it is active or inactive. That is left to our callers
5661 	 * to determine.
5662 	 */
5663 	if (ioc->nioc_nsid > minor->nm_ctrl->n_namespace_count &&
5664 	    ioc->nioc_nsid != NVME_NSID_BCAST) {
5665 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NS_RANGE, 0, 0));
5666 	}
5667 
5668 	if (ioc->nioc_nsid == 0 && check->nck_ctrl_rewrite) {
5669 		ioc->nioc_nsid = NVME_NSID_BCAST;
5670 	}
5671 
5672 	/*
5673 	 * Finally, see if we have ended up with a broadcast namespace ID
5674 	 * whether through specification or rewriting. If that is not allowed,
5675 	 * then that is an error.
5676 	 */
5677 	if (!check->nck_bcast_ok && ioc->nioc_nsid == NVME_NSID_BCAST) {
5678 		return (nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_BCAST_NS, 0, 0));
5679 	}
5680 
5681 	return (nvme_ioctl_excl_check(minor, ioc, check));
5682 }
5683 
5684 static int
5685 nvme_ioctl_ctrl_info(nvme_minor_t *minor, intptr_t arg, int mode,
5686     cred_t *cred_p)
5687 {
5688 	nvme_t *const nvme = minor->nm_ctrl;
5689 	nvme_ioctl_ctrl_info_t *info;
5690 	nvme_reg_cap_t cap = { 0 };
5691 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_CTRL };
5692 	void *idbuf;
5693 
5694 	if ((mode & FREAD) == 0)
5695 		return (EBADF);
5696 
5697 	info = kmem_alloc(sizeof (nvme_ioctl_ctrl_info_t), KM_NOSLEEP_LAZY);
5698 	if (info == NULL) {
5699 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5700 		    mode));
5701 	}
5702 
5703 	if (ddi_copyin((void *)arg, info, sizeof (nvme_ioctl_ctrl_info_t),
5704 	    mode & FKIOCTL) != 0) {
5705 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5706 		return (EFAULT);
5707 	}
5708 
5709 	if (!nvme_ioctl_check(minor, &info->nci_common,
5710 	    &nvme_check_ctrl_info)) {
5711 		goto copyout;
5712 	}
5713 
5714 	/*
5715 	 * We explicitly do not use the identify controller copy in the kernel
5716 	 * right now so that way we can get a snapshot of the controller's
5717 	 * current capacity and values. While it's tempting to try to use this
5718 	 * to refresh the kernel's version we don't just to simplify the rest of
5719 	 * the driver right now.
5720 	 */
5721 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5722 		info->nci_common = id.nid_common;
5723 		goto copyout;
5724 	}
5725 	bcopy(idbuf, &info->nci_ctrl_id, sizeof (nvme_identify_ctrl_t));
5726 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5727 
5728 	/*
5729 	 * Use the kernel's cached common namespace information for this.
5730 	 */
5731 	bcopy(nvme->n_idcomns, &info->nci_common_ns,
5732 	    sizeof (nvme_identify_nsid_t));
5733 
5734 	info->nci_vers = nvme->n_version;
5735 
5736 	/*
5737 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
5738 	 * specify the base page size of 4k (1<<12), so add 12 here to
5739 	 * get the real page size value.
5740 	 */
5741 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
5742 	info->nci_caps.cap_mpsmax = 1 << (12 + cap.b.cap_mpsmax);
5743 	info->nci_caps.cap_mpsmin = 1 << (12 + cap.b.cap_mpsmin);
5744 
5745 	info->nci_nintrs = (uint32_t)nvme->n_intr_cnt;
5746 
5747 copyout:
5748 	if (ddi_copyout(info, (void *)arg, sizeof (nvme_ioctl_ctrl_info_t),
5749 	    mode & FKIOCTL) != 0) {
5750 		kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5751 		return (EFAULT);
5752 	}
5753 
5754 	kmem_free(info, sizeof (nvme_ioctl_ctrl_info_t));
5755 	return (0);
5756 }
5757 
5758 static int
5759 nvme_ioctl_ns_info(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5760 {
5761 	nvme_t *const nvme = minor->nm_ctrl;
5762 	nvme_ioctl_ns_info_t *ns_info;
5763 	nvme_namespace_t *ns;
5764 	nvme_ioctl_identify_t id = { .nid_cns = NVME_IDENTIFY_NSID };
5765 	void *idbuf;
5766 
5767 	if ((mode & FREAD) == 0)
5768 		return (EBADF);
5769 
5770 	ns_info = kmem_zalloc(sizeof (nvme_ioctl_ns_info_t), KM_NOSLEEP_LAZY);
5771 	if (ns_info == NULL) {
5772 		return (nvme_ioctl_copyout_error(NVME_IOCTL_E_NO_KERN_MEM, arg,
5773 		    mode));
5774 	}
5775 
5776 	if (ddi_copyin((void *)arg, ns_info, sizeof (nvme_ioctl_ns_info_t),
5777 	    mode & FKIOCTL) != 0) {
5778 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5779 		return (EFAULT);
5780 	}
5781 
5782 	if (!nvme_ioctl_check(minor, &ns_info->nni_common,
5783 	    &nvme_check_ns_info)) {
5784 		goto copyout;
5785 	}
5786 
5787 	ASSERT3U(ns_info->nni_common.nioc_nsid, >, 0);
5788 	ns = nvme_nsid2ns(nvme, ns_info->nni_common.nioc_nsid);
5789 
5790 	/*
5791 	 * First fetch a fresh copy of the namespace information. Most callers
5792 	 * are using this because they will want a mostly accurate snapshot of
5793 	 * capacity and utilization.
5794 	 */
5795 	id.nid_common.nioc_nsid = ns_info->nni_common.nioc_nsid;
5796 	if (!nvme_identify(nvme, B_TRUE, &id, &idbuf)) {
5797 		ns_info->nni_common = id.nid_common;
5798 		goto copyout;
5799 	}
5800 	bcopy(idbuf, &ns_info->nni_id, sizeof (nvme_identify_nsid_t));
5801 	kmem_free(idbuf, NVME_IDENTIFY_BUFSIZE);
5802 
5803 	mutex_enter(&nvme->n_mgmt_mutex);
5804 	if (ns->ns_allocated)
5805 		ns_info->nni_state |= NVME_NS_STATE_ALLOCATED;
5806 
5807 	if (ns->ns_active)
5808 		ns_info->nni_state |= NVME_NS_STATE_ACTIVE;
5809 
5810 	if (ns->ns_ignore)
5811 		ns_info->nni_state |= NVME_NS_STATE_IGNORED;
5812 
5813 	if (ns->ns_attached) {
5814 		const char *addr;
5815 
5816 		ns_info->nni_state |= NVME_NS_STATE_ATTACHED;
5817 		addr = bd_address(ns->ns_bd_hdl);
5818 		if (strlcpy(ns_info->nni_addr, addr,
5819 		    sizeof (ns_info->nni_addr)) >= sizeof (ns_info->nni_addr)) {
5820 			mutex_exit(&nvme->n_mgmt_mutex);
5821 			(void) nvme_ioctl_error(&ns_info->nni_common,
5822 			    NVME_IOCTL_E_BD_ADDR_OVER, 0, 0);
5823 			goto copyout;
5824 		}
5825 	}
5826 	mutex_exit(&nvme->n_mgmt_mutex);
5827 
5828 copyout:
5829 	if (ddi_copyout(ns_info, (void *)arg, sizeof (nvme_ioctl_ns_info_t),
5830 	    mode & FKIOCTL) != 0) {
5831 		kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5832 		return (EFAULT);
5833 	}
5834 
5835 	kmem_free(ns_info, sizeof (nvme_ioctl_ns_info_t));
5836 	return (0);
5837 }
5838 
5839 static int
5840 nvme_ioctl_identify(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
5841 {
5842 	_NOTE(ARGUNUSED(cred_p));
5843 	nvme_t *const nvme = minor->nm_ctrl;
5844 	void *idctl;
5845 	uint_t model;
5846 	nvme_ioctl_identify_t id;
5847 #ifdef	_MULTI_DATAMODEL
5848 	nvme_ioctl_identify32_t id32;
5849 #endif
5850 	boolean_t ns_minor;
5851 
5852 	if ((mode & FREAD) == 0)
5853 		return (EBADF);
5854 
5855 	model = ddi_model_convert_from(mode);
5856 	switch (model) {
5857 #ifdef	_MULTI_DATAMODEL
5858 	case DDI_MODEL_ILP32:
5859 		bzero(&id, sizeof (id));
5860 		if (ddi_copyin((void *)arg, &id32, sizeof (id32),
5861 		    mode & FKIOCTL) != 0) {
5862 			return (EFAULT);
5863 		}
5864 		id.nid_common.nioc_nsid = id32.nid_common.nioc_nsid;
5865 		id.nid_cns = id32.nid_cns;
5866 		id.nid_ctrlid = id32.nid_ctrlid;
5867 		id.nid_data = id32.nid_data;
5868 		break;
5869 #endif	/* _MULTI_DATAMODEL */
5870 	case DDI_MODEL_NONE:
5871 		if (ddi_copyin((void *)arg, &id, sizeof (id),
5872 		    mode & FKIOCTL) != 0) {
5873 			return (EFAULT);
5874 		}
5875 		break;
5876 	default:
5877 		return (ENOTSUP);
5878 	}
5879 
5880 	if (!nvme_ioctl_check(minor, &id.nid_common, &nvme_check_identify)) {
5881 		goto copyout;
5882 	}
5883 
5884 	ns_minor = minor->nm_ns != NULL;
5885 	if (!nvme_validate_identify(nvme, &id, ns_minor)) {
5886 		goto copyout;
5887 	}
5888 
5889 	if (nvme_identify(nvme, B_TRUE, &id, &idctl)) {
5890 		int ret = ddi_copyout(idctl, (void *)id.nid_data,
5891 		    NVME_IDENTIFY_BUFSIZE, mode & FKIOCTL);
5892 		kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
5893 		if (ret != 0) {
5894 			(void) nvme_ioctl_error(&id.nid_common,
5895 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
5896 			goto copyout;
5897 		}
5898 
5899 		nvme_ioctl_success(&id.nid_common);
5900 	}
5901 
5902 copyout:
5903 	switch (model) {
5904 #ifdef	_MULTI_DATAMODEL
5905 	case DDI_MODEL_ILP32:
5906 		id32.nid_common = id.nid_common;
5907 
5908 		if (ddi_copyout(&id32, (void *)arg, sizeof (id32),
5909 		    mode & FKIOCTL) != 0) {
5910 			return (EFAULT);
5911 		}
5912 		break;
5913 #endif	/* _MULTI_DATAMODEL */
5914 	case DDI_MODEL_NONE:
5915 		if (ddi_copyout(&id, (void *)arg, sizeof (id),
5916 		    mode & FKIOCTL) != 0) {
5917 			return (EFAULT);
5918 		}
5919 		break;
5920 	default:
5921 		return (ENOTSUP);
5922 	}
5923 
5924 	return (0);
5925 }
5926 
5927 /*
5928  * Execute commands on behalf of the various ioctls.
5929  *
5930  * If this returns true then the command completed successfully. Otherwise error
5931  * information is returned in the nvme_ioctl_common_t arguments.
5932  */
5933 typedef struct {
5934 	nvme_sqe_t *ica_sqe;
5935 	void *ica_data;
5936 	uint32_t ica_data_len;
5937 	uint_t ica_dma_flags;
5938 	int ica_copy_flags;
5939 	uint32_t ica_timeout;
5940 	uint32_t ica_cdw0;
5941 } nvme_ioc_cmd_args_t;
5942 
5943 static boolean_t
5944 nvme_ioc_cmd(nvme_t *nvme, nvme_ioctl_common_t *ioc, nvme_ioc_cmd_args_t *args)
5945 {
5946 	nvme_cmd_t *cmd;
5947 	boolean_t ret = B_FALSE;
5948 
5949 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
5950 	cmd->nc_sqid = 0;
5951 
5952 	/*
5953 	 * This function is used to facilitate requests from
5954 	 * userspace, so don't panic if the command fails. This
5955 	 * is especially true for admin passthru commands, where
5956 	 * the actual command data structure is entirely defined
5957 	 * by userspace.
5958 	 */
5959 	cmd->nc_dontpanic = B_TRUE;
5960 
5961 	cmd->nc_callback = nvme_wakeup_cmd;
5962 	cmd->nc_sqe = *args->ica_sqe;
5963 
5964 	if ((args->ica_dma_flags & DDI_DMA_RDWR) != 0) {
5965 		if (args->ica_data == NULL) {
5966 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_NO_DMA_MEM,
5967 			    0, 0);
5968 			goto free_cmd;
5969 		}
5970 
5971 		if (nvme_zalloc_dma(nvme, args->ica_data_len,
5972 		    args->ica_dma_flags, &nvme->n_prp_dma_attr, &cmd->nc_dma) !=
5973 		    DDI_SUCCESS) {
5974 			dev_err(nvme->n_dip, CE_WARN,
5975 			    "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
5976 			ret = nvme_ioctl_error(ioc,
5977 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5978 			goto free_cmd;
5979 		}
5980 
5981 		if (nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah) != 0) {
5982 			ret = nvme_ioctl_error(ioc,
5983 			    NVME_IOCTL_E_NO_DMA_MEM, 0, 0);
5984 			goto free_cmd;
5985 		}
5986 
5987 		if ((args->ica_dma_flags & DDI_DMA_WRITE) != 0 &&
5988 		    ddi_copyin(args->ica_data, cmd->nc_dma->nd_memp,
5989 		    args->ica_data_len, args->ica_copy_flags) != 0) {
5990 			ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA,
5991 			    0, 0);
5992 			goto free_cmd;
5993 		}
5994 	}
5995 
5996 	nvme_admin_cmd(cmd, args->ica_timeout);
5997 
5998 	if (!nvme_check_cmd_status_ioctl(cmd, ioc)) {
5999 		ret = B_FALSE;
6000 		goto free_cmd;
6001 	}
6002 
6003 	args->ica_cdw0 = cmd->nc_cqe.cqe_dw0;
6004 
6005 	if ((args->ica_dma_flags & DDI_DMA_READ) != 0 &&
6006 	    ddi_copyout(cmd->nc_dma->nd_memp, args->ica_data,
6007 	    args->ica_data_len, args->ica_copy_flags) != 0) {
6008 		ret = nvme_ioctl_error(ioc, NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6009 		goto free_cmd;
6010 	}
6011 
6012 	ret = B_TRUE;
6013 	nvme_ioctl_success(ioc);
6014 
6015 free_cmd:
6016 	nvme_free_cmd(cmd);
6017 
6018 	return (ret);
6019 }
6020 
6021 static int
6022 nvme_ioctl_get_logpage(nvme_minor_t *minor, intptr_t arg, int mode,
6023     cred_t *cred_p)
6024 {
6025 	nvme_t *const nvme = minor->nm_ctrl;
6026 	void *buf;
6027 	nvme_ioctl_get_logpage_t log;
6028 	uint_t model;
6029 #ifdef	_MULTI_DATAMODEL
6030 	nvme_ioctl_get_logpage32_t log32;
6031 #endif
6032 
6033 	if ((mode & FREAD) == 0) {
6034 		return (EBADF);
6035 	}
6036 
6037 	model = ddi_model_convert_from(mode);
6038 	switch (model) {
6039 #ifdef	_MULTI_DATAMODEL
6040 	case DDI_MODEL_ILP32:
6041 		bzero(&log, sizeof (log));
6042 		if (ddi_copyin((void *)arg, &log32, sizeof (log32),
6043 		    mode & FKIOCTL) != 0) {
6044 			return (EFAULT);
6045 		}
6046 
6047 		log.nigl_common.nioc_nsid = log32.nigl_common.nioc_nsid;
6048 		log.nigl_csi = log32.nigl_csi;
6049 		log.nigl_lid = log32.nigl_lid;
6050 		log.nigl_lsp = log32.nigl_lsp;
6051 		log.nigl_len = log32.nigl_len;
6052 		log.nigl_offset = log32.nigl_offset;
6053 		log.nigl_data = log32.nigl_data;
6054 		break;
6055 #endif	/* _MULTI_DATAMODEL */
6056 	case DDI_MODEL_NONE:
6057 		if (ddi_copyin((void *)arg, &log, sizeof (log),
6058 		    mode & FKIOCTL) != 0) {
6059 			return (EFAULT);
6060 		}
6061 		break;
6062 	default:
6063 		return (ENOTSUP);
6064 	}
6065 
6066 	/*
6067 	 * Eventually we'd like to do a soft lock on the namespaces from
6068 	 * changing out from us during this operation in the future. But we
6069 	 * haven't implemented that yet.
6070 	 */
6071 	if (!nvme_ioctl_check(minor, &log.nigl_common,
6072 	    &nvme_check_get_logpage)) {
6073 		goto copyout;
6074 	}
6075 
6076 	if (!nvme_validate_logpage(nvme, &log)) {
6077 		goto copyout;
6078 	}
6079 
6080 	if (nvme_get_logpage(nvme, B_TRUE, &log, &buf)) {
6081 		int copy;
6082 
6083 		copy = ddi_copyout(buf, (void *)log.nigl_data, log.nigl_len,
6084 		    mode & FKIOCTL);
6085 		kmem_free(buf, log.nigl_len);
6086 		if (copy != 0) {
6087 			(void) nvme_ioctl_error(&log.nigl_common,
6088 			    NVME_IOCTL_E_BAD_USER_DATA, 0, 0);
6089 			goto copyout;
6090 		}
6091 
6092 		nvme_ioctl_success(&log.nigl_common);
6093 	}
6094 
6095 copyout:
6096 	switch (model) {
6097 #ifdef	_MULTI_DATAMODEL
6098 	case DDI_MODEL_ILP32:
6099 		bzero(&log32, sizeof (log32));
6100 
6101 		log32.nigl_common = log.nigl_common;
6102 		log32.nigl_csi = log.nigl_csi;
6103 		log32.nigl_lid = log.nigl_lid;
6104 		log32.nigl_lsp = log.nigl_lsp;
6105 		log32.nigl_len = log.nigl_len;
6106 		log32.nigl_offset = log.nigl_offset;
6107 		log32.nigl_data = log.nigl_data;
6108 		if (ddi_copyout(&log32, (void *)arg, sizeof (log32),
6109 		    mode & FKIOCTL) != 0) {
6110 			return (EFAULT);
6111 		}
6112 		break;
6113 #endif	/* _MULTI_DATAMODEL */
6114 	case DDI_MODEL_NONE:
6115 		if (ddi_copyout(&log, (void *)arg, sizeof (log),
6116 		    mode & FKIOCTL) != 0) {
6117 			return (EFAULT);
6118 		}
6119 		break;
6120 	default:
6121 		return (ENOTSUP);
6122 	}
6123 
6124 	return (0);
6125 }
6126 
6127 static int
6128 nvme_ioctl_get_feature(nvme_minor_t *minor, intptr_t arg, int mode,
6129     cred_t *cred_p)
6130 {
6131 	nvme_t *const nvme = minor->nm_ctrl;
6132 	nvme_ioctl_get_feature_t feat;
6133 	uint_t model;
6134 #ifdef	_MULTI_DATAMODEL
6135 	nvme_ioctl_get_feature32_t feat32;
6136 #endif
6137 	nvme_get_features_dw10_t gf_dw10 = { 0 };
6138 	nvme_ioc_cmd_args_t args = { NULL };
6139 	nvme_sqe_t sqe = {
6140 	    .sqe_opc	= NVME_OPC_GET_FEATURES
6141 	};
6142 
6143 	if ((mode & FREAD) == 0) {
6144 		return (EBADF);
6145 	}
6146 
6147 	model = ddi_model_convert_from(mode);
6148 	switch (model) {
6149 #ifdef	_MULTI_DATAMODEL
6150 	case DDI_MODEL_ILP32:
6151 		bzero(&feat, sizeof (feat));
6152 		if (ddi_copyin((void *)arg, &feat32, sizeof (feat32),
6153 		    mode & FKIOCTL) != 0) {
6154 			return (EFAULT);
6155 		}
6156 
6157 		feat.nigf_common.nioc_nsid = feat32.nigf_common.nioc_nsid;
6158 		feat.nigf_fid = feat32.nigf_fid;
6159 		feat.nigf_sel = feat32.nigf_sel;
6160 		feat.nigf_cdw11 = feat32.nigf_cdw11;
6161 		feat.nigf_data = feat32.nigf_data;
6162 		feat.nigf_len = feat32.nigf_len;
6163 		break;
6164 #endif	/* _MULTI_DATAMODEL */
6165 	case DDI_MODEL_NONE:
6166 		if (ddi_copyin((void *)arg, &feat, sizeof (feat),
6167 		    mode & FKIOCTL) != 0) {
6168 			return (EFAULT);
6169 		}
6170 		break;
6171 	default:
6172 		return (ENOTSUP);
6173 	}
6174 
6175 	if (!nvme_ioctl_check(minor, &feat.nigf_common,
6176 	    &nvme_check_get_feature)) {
6177 		goto copyout;
6178 	}
6179 
6180 	if (!nvme_validate_get_feature(nvme, &feat)) {
6181 		goto copyout;
6182 	}
6183 
6184 	gf_dw10.b.gt_fid = bitx32(feat.nigf_fid, 7, 0);
6185 	gf_dw10.b.gt_sel = bitx32(feat.nigf_sel, 2, 0);
6186 	sqe.sqe_cdw10 = gf_dw10.r;
6187 	sqe.sqe_cdw11 = feat.nigf_cdw11;
6188 	sqe.sqe_nsid = feat.nigf_common.nioc_nsid;
6189 
6190 	args.ica_sqe = &sqe;
6191 	if (feat.nigf_len != 0) {
6192 		args.ica_data = (void *)feat.nigf_data;
6193 		args.ica_data_len = feat.nigf_len;
6194 		args.ica_dma_flags = DDI_DMA_READ;
6195 	}
6196 	args.ica_copy_flags = mode;
6197 	args.ica_timeout = nvme_admin_cmd_timeout;
6198 
6199 	if (!nvme_ioc_cmd(nvme, &feat.nigf_common, &args)) {
6200 		goto copyout;
6201 	}
6202 
6203 	feat.nigf_cdw0 = args.ica_cdw0;
6204 
6205 copyout:
6206 	switch (model) {
6207 #ifdef	_MULTI_DATAMODEL
6208 	case DDI_MODEL_ILP32:
6209 		bzero(&feat32, sizeof (feat32));
6210 
6211 		feat32.nigf_common = feat.nigf_common;
6212 		feat32.nigf_fid = feat.nigf_fid;
6213 		feat32.nigf_sel = feat.nigf_sel;
6214 		feat32.nigf_cdw11 = feat.nigf_cdw11;
6215 		feat32.nigf_data = feat.nigf_data;
6216 		feat32.nigf_len = feat.nigf_len;
6217 		feat32.nigf_cdw0 = feat.nigf_cdw0;
6218 		if (ddi_copyout(&feat32, (void *)arg, sizeof (feat32),
6219 		    mode & FKIOCTL) != 0) {
6220 			return (EFAULT);
6221 		}
6222 		break;
6223 #endif	/* _MULTI_DATAMODEL */
6224 	case DDI_MODEL_NONE:
6225 		if (ddi_copyout(&feat, (void *)arg, sizeof (feat),
6226 		    mode & FKIOCTL) != 0) {
6227 			return (EFAULT);
6228 		}
6229 		break;
6230 	default:
6231 		return (ENOTSUP);
6232 	}
6233 
6234 	return (0);
6235 }
6236 
6237 static int
6238 nvme_ioctl_format(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6239 {
6240 	nvme_t *const nvme = minor->nm_ctrl;
6241 	nvme_ioctl_format_t ioc;
6242 
6243 	if ((mode & FWRITE) == 0)
6244 		return (EBADF);
6245 
6246 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6247 		return (EPERM);
6248 
6249 	if (ddi_copyin((void *)(uintptr_t)arg, &ioc,
6250 	    sizeof (nvme_ioctl_format_t), mode & FKIOCTL) != 0)
6251 		return (EFAULT);
6252 
6253 	if (!nvme_ioctl_check(minor, &ioc.nif_common, &nvme_check_format)) {
6254 		goto copyout;
6255 	}
6256 
6257 	if (!nvme_validate_format(nvme, &ioc)) {
6258 		goto copyout;
6259 	}
6260 
6261 	mutex_enter(&nvme->n_mgmt_mutex);
6262 	if (!nvme_no_blkdev_attached(nvme, ioc.nif_common.nioc_nsid)) {
6263 		mutex_exit(&nvme->n_mgmt_mutex);
6264 		(void) nvme_ioctl_error(&ioc.nif_common,
6265 		    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6266 		goto copyout;
6267 	}
6268 
6269 	if (nvme_format_nvm(nvme, &ioc)) {
6270 		nvme_ioctl_success(&ioc.nif_common);
6271 		nvme_rescan_ns(nvme, ioc.nif_common.nioc_nsid);
6272 	}
6273 	mutex_exit(&nvme->n_mgmt_mutex);
6274 
6275 copyout:
6276 	if (ddi_copyout(&ioc, (void *)(uintptr_t)arg, sizeof (ioc),
6277 	    mode & FKIOCTL) != 0) {
6278 		return (EFAULT);
6279 	}
6280 
6281 	return (0);
6282 }
6283 
6284 static int
6285 nvme_ioctl_detach(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6286 {
6287 	nvme_t *const nvme = minor->nm_ctrl;
6288 	nvme_ioctl_common_t com;
6289 
6290 	if ((mode & FWRITE) == 0)
6291 		return (EBADF);
6292 
6293 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6294 		return (EPERM);
6295 
6296 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6297 	    mode & FKIOCTL) != 0) {
6298 		return (EFAULT);
6299 	}
6300 
6301 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6302 		goto copyout;
6303 	}
6304 
6305 	mutex_enter(&nvme->n_mgmt_mutex);
6306 	if (nvme_detach_ns(nvme, &com)) {
6307 		nvme_ioctl_success(&com);
6308 	}
6309 	mutex_exit(&nvme->n_mgmt_mutex);
6310 
6311 copyout:
6312 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6313 	    mode & FKIOCTL) != 0) {
6314 		return (EFAULT);
6315 	}
6316 
6317 	return (0);
6318 }
6319 
6320 static int
6321 nvme_ioctl_attach(nvme_minor_t *minor, intptr_t arg, int mode,
6322     cred_t *cred_p)
6323 {
6324 	nvme_t *const nvme = minor->nm_ctrl;
6325 	nvme_ioctl_common_t com;
6326 	nvme_namespace_t *ns;
6327 
6328 	if ((mode & FWRITE) == 0)
6329 		return (EBADF);
6330 
6331 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6332 		return (EPERM);
6333 
6334 	if (ddi_copyin((void *)(uintptr_t)arg, &com, sizeof (com),
6335 	    mode & FKIOCTL) != 0) {
6336 		return (EFAULT);
6337 	}
6338 
6339 	if (!nvme_ioctl_check(minor, &com, &nvme_check_attach_detach)) {
6340 		goto copyout;
6341 	}
6342 
6343 	mutex_enter(&nvme->n_mgmt_mutex);
6344 	ns = nvme_nsid2ns(nvme, com.nioc_nsid);
6345 
6346 	/*
6347 	 * Strictly speaking we shouldn't need to call nvme_init_ns() here as
6348 	 * we should be properly refreshing the internal state when we are
6349 	 * issuing commands that change things. However, we opt to still do so
6350 	 * as a bit of a safety check lest we give the kernel something bad or a
6351 	 * vendor unique command somehow did something behind our backs.
6352 	 */
6353 	if (!ns->ns_attached) {
6354 		(void) nvme_rescan_ns(nvme, com.nioc_nsid);
6355 		if (nvme_attach_ns(nvme, &com)) {
6356 			nvme_ioctl_success(&com);
6357 		}
6358 	} else {
6359 		nvme_ioctl_success(&com);
6360 	}
6361 	mutex_exit(&nvme->n_mgmt_mutex);
6362 
6363 copyout:
6364 	if (ddi_copyout(&com, (void *)(uintptr_t)arg, sizeof (com),
6365 	    mode & FKIOCTL) != 0) {
6366 		return (EFAULT);
6367 	}
6368 
6369 	return (0);
6370 }
6371 
6372 static void
6373 nvme_ufm_update(nvme_t *nvme)
6374 {
6375 	mutex_enter(&nvme->n_fwslot_mutex);
6376 	ddi_ufm_update(nvme->n_ufmh);
6377 	if (nvme->n_fwslot != NULL) {
6378 		kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
6379 		nvme->n_fwslot = NULL;
6380 	}
6381 	mutex_exit(&nvme->n_fwslot_mutex);
6382 }
6383 
6384 /*
6385  * Download new firmware to the device's internal staging area. We do not call
6386  * nvme_ufm_update() here because after a firmware download, there has been no
6387  * change to any of the actual persistent firmware data. That requires a
6388  * subsequent ioctl (NVME_IOC_FIRMWARE_COMMIT) to commit the firmware to a slot
6389  * or to activate a slot.
6390  */
6391 static int
6392 nvme_ioctl_firmware_download(nvme_minor_t *minor, intptr_t arg, int mode,
6393     cred_t *cred_p)
6394 {
6395 	nvme_t *const nvme = minor->nm_ctrl;
6396 	nvme_ioctl_fw_load_t fw;
6397 	uint64_t len, maxcopy;
6398 	offset_t offset;
6399 	uint32_t gran;
6400 	nvme_valid_ctrl_data_t data;
6401 	uintptr_t buf;
6402 	nvme_sqe_t sqe = {
6403 	    .sqe_opc	= NVME_OPC_FW_IMAGE_LOAD
6404 	};
6405 
6406 	if ((mode & FWRITE) == 0)
6407 		return (EBADF);
6408 
6409 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6410 		return (EPERM);
6411 
6412 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6413 	    mode & FKIOCTL) != 0) {
6414 		return (EFAULT);
6415 	}
6416 
6417 	if (!nvme_ioctl_check(minor, &fw.fwl_common, &nvme_check_firmware)) {
6418 		goto copyout;
6419 	}
6420 
6421 	if (!nvme_validate_fw_load(nvme, &fw)) {
6422 		goto copyout;
6423 	}
6424 
6425 	len = fw.fwl_len;
6426 	offset = fw.fwl_off;
6427 	buf = fw.fwl_buf;
6428 
6429 	/*
6430 	 * We need to determine the minimum and maximum amount of data that we
6431 	 * will send to the device in a given go. Starting in NMVe 1.3 this must
6432 	 * be a multiple of the firmware update granularity (FWUG), but must not
6433 	 * exceed the maximum data transfer that we've set. Many devices don't
6434 	 * report something here, which means we'll end up getting our default
6435 	 * value. Our policy is a little simple, but it's basically if the
6436 	 * maximum data transfer is evenly divided by the granularity, then use
6437 	 * it. Otherwise we use the granularity itself. The granularity is
6438 	 * always in page sized units, so trying to find another optimum point
6439 	 * isn't worth it. If we encounter a contradiction, then we will have to
6440 	 * error out.
6441 	 */
6442 	data.vcd_vers = &nvme->n_version;
6443 	data.vcd_id = nvme->n_idctl;
6444 	gran = nvme_fw_load_granularity(&data);
6445 
6446 	if ((nvme->n_max_data_transfer_size % gran) == 0) {
6447 		maxcopy = nvme->n_max_data_transfer_size;
6448 	} else if (gran <= nvme->n_max_data_transfer_size) {
6449 		maxcopy = gran;
6450 	} else {
6451 		(void) nvme_ioctl_error(&fw.fwl_common,
6452 		    NVME_IOCTL_E_FW_LOAD_IMPOS_GRAN, 0, 0);
6453 		goto copyout;
6454 	}
6455 
6456 	while (len > 0) {
6457 		nvme_ioc_cmd_args_t args = { NULL };
6458 		uint64_t copylen = MIN(maxcopy, len);
6459 
6460 		sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
6461 		sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
6462 
6463 		args.ica_sqe = &sqe;
6464 		args.ica_data = (void *)buf;
6465 		args.ica_data_len = copylen;
6466 		args.ica_dma_flags = DDI_DMA_WRITE;
6467 		args.ica_copy_flags = mode;
6468 		args.ica_timeout = nvme_admin_cmd_timeout;
6469 
6470 		if (!nvme_ioc_cmd(nvme, &fw.fwl_common, &args)) {
6471 			break;
6472 		}
6473 
6474 		buf += copylen;
6475 		offset += copylen;
6476 		len -= copylen;
6477 	}
6478 
6479 copyout:
6480 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6481 	    mode & FKIOCTL) != 0) {
6482 		return (EFAULT);
6483 	}
6484 
6485 	return (0);
6486 }
6487 
6488 static int
6489 nvme_ioctl_firmware_commit(nvme_minor_t *minor, intptr_t arg, int mode,
6490     cred_t *cred_p)
6491 {
6492 	nvme_t *const nvme = minor->nm_ctrl;
6493 	nvme_ioctl_fw_commit_t fw;
6494 	nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
6495 	nvme_ioc_cmd_args_t args = { NULL };
6496 	nvme_sqe_t sqe = {
6497 	    .sqe_opc	= NVME_OPC_FW_ACTIVATE
6498 	};
6499 
6500 	if ((mode & FWRITE) == 0)
6501 		return (EBADF);
6502 
6503 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6504 		return (EPERM);
6505 
6506 	if (ddi_copyin((void *)(uintptr_t)arg, &fw, sizeof (fw),
6507 	    mode & FKIOCTL) != 0) {
6508 		return (EFAULT);
6509 	}
6510 
6511 	if (!nvme_ioctl_check(minor, &fw.fwc_common, &nvme_check_firmware)) {
6512 		goto copyout;
6513 	}
6514 
6515 	if (!nvme_validate_fw_commit(nvme, &fw)) {
6516 		goto copyout;
6517 	}
6518 
6519 	fc_dw10.b.fc_slot = fw.fwc_slot;
6520 	fc_dw10.b.fc_action = fw.fwc_action;
6521 	sqe.sqe_cdw10 = fc_dw10.r;
6522 
6523 	args.ica_sqe = &sqe;
6524 	args.ica_timeout = nvme_commit_save_cmd_timeout;
6525 
6526 	/*
6527 	 * There are no conditional actions to take based on this succeeding or
6528 	 * failing. A failure is recorded in the ioctl structure returned to the
6529 	 * user.
6530 	 */
6531 	(void) nvme_ioc_cmd(nvme, &fw.fwc_common, &args);
6532 
6533 	/*
6534 	 * Let the DDI UFM subsystem know that the firmware information for
6535 	 * this device has changed. We perform this unconditionally as an
6536 	 * invalidation doesn't particularly hurt us.
6537 	 */
6538 	nvme_ufm_update(nvme);
6539 
6540 copyout:
6541 	if (ddi_copyout(&fw, (void *)(uintptr_t)arg, sizeof (fw),
6542 	    mode & FKIOCTL) != 0) {
6543 		return (EFAULT);
6544 	}
6545 
6546 	return (0);
6547 }
6548 
6549 /*
6550  * Helper to copy in a passthru command from userspace, handling
6551  * different data models.
6552  */
6553 static int
6554 nvme_passthru_copyin_cmd(const void *buf, nvme_ioctl_passthru_t *cmd, int mode)
6555 {
6556 	switch (ddi_model_convert_from(mode & FMODELS)) {
6557 #ifdef _MULTI_DATAMODEL
6558 	case DDI_MODEL_ILP32: {
6559 		nvme_ioctl_passthru32_t cmd32;
6560 
6561 		if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0)
6562 			return (EFAULT);
6563 
6564 		bzero(cmd, sizeof (nvme_ioctl_passthru_t));
6565 
6566 		cmd->npc_common.nioc_nsid = cmd32.npc_common.nioc_nsid;
6567 		cmd->npc_opcode = cmd32.npc_opcode;
6568 		cmd->npc_timeout = cmd32.npc_timeout;
6569 		cmd->npc_flags = cmd32.npc_flags;
6570 		cmd->npc_impact = cmd32.npc_impact;
6571 		cmd->npc_cdw12 = cmd32.npc_cdw12;
6572 		cmd->npc_cdw13 = cmd32.npc_cdw13;
6573 		cmd->npc_cdw14 = cmd32.npc_cdw14;
6574 		cmd->npc_cdw15 = cmd32.npc_cdw15;
6575 		cmd->npc_buflen = cmd32.npc_buflen;
6576 		cmd->npc_buf = cmd32.npc_buf;
6577 		break;
6578 	}
6579 #endif	/* _MULTI_DATAMODEL */
6580 	case DDI_MODEL_NONE:
6581 		if (ddi_copyin(buf, (void *)cmd, sizeof (nvme_ioctl_passthru_t),
6582 		    mode) != 0) {
6583 			return (EFAULT);
6584 		}
6585 		break;
6586 	default:
6587 		return (ENOTSUP);
6588 	}
6589 
6590 	return (0);
6591 }
6592 
6593 /*
6594  * Helper to copy out a passthru command result to userspace, handling
6595  * different data models.
6596  */
6597 static int
6598 nvme_passthru_copyout_cmd(const nvme_ioctl_passthru_t *cmd, void *buf, int mode)
6599 {
6600 	switch (ddi_model_convert_from(mode & FMODELS)) {
6601 #ifdef _MULTI_DATAMODEL
6602 	case DDI_MODEL_ILP32: {
6603 		nvme_ioctl_passthru32_t cmd32;
6604 
6605 		bzero(&cmd32, sizeof (nvme_ioctl_passthru32_t));
6606 
6607 		cmd32.npc_common = cmd->npc_common;
6608 		cmd32.npc_opcode = cmd->npc_opcode;
6609 		cmd32.npc_timeout = cmd->npc_timeout;
6610 		cmd32.npc_flags = cmd->npc_flags;
6611 		cmd32.npc_impact = cmd->npc_impact;
6612 		cmd32.npc_cdw0 = cmd->npc_cdw0;
6613 		cmd32.npc_cdw12 = cmd->npc_cdw12;
6614 		cmd32.npc_cdw13 = cmd->npc_cdw13;
6615 		cmd32.npc_cdw14 = cmd->npc_cdw14;
6616 		cmd32.npc_cdw15 = cmd->npc_cdw15;
6617 		cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
6618 		cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;
6619 		if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
6620 			return (EFAULT);
6621 		break;
6622 	}
6623 #endif	/* _MULTI_DATAMODEL */
6624 	case DDI_MODEL_NONE:
6625 		if (ddi_copyout(cmd, buf, sizeof (nvme_ioctl_passthru_t),
6626 		    mode) != 0) {
6627 			return (EFAULT);
6628 		}
6629 		break;
6630 	default:
6631 		return (ENOTSUP);
6632 	}
6633 	return (0);
6634 }
6635 
6636 /*
6637  * Run an arbitrary vendor-specific admin command on the device.
6638  */
6639 static int
6640 nvme_ioctl_passthru(nvme_minor_t *minor, intptr_t arg, int mode, cred_t *cred_p)
6641 {
6642 	nvme_t *const nvme = minor->nm_ctrl;
6643 	int rv;
6644 	nvme_ioctl_passthru_t pass;
6645 	nvme_sqe_t sqe;
6646 	nvme_ioc_cmd_args_t args = { NULL };
6647 
6648 	/*
6649 	 * Basic checks: permissions, data model, argument size.
6650 	 */
6651 	if ((mode & FWRITE) == 0)
6652 		return (EBADF);
6653 
6654 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6655 		return (EPERM);
6656 
6657 	if ((rv = nvme_passthru_copyin_cmd((void *)(uintptr_t)arg, &pass,
6658 	    mode)) != 0) {
6659 		return (rv);
6660 	}
6661 
6662 	if (!nvme_ioctl_check(minor, &pass.npc_common, &nvme_check_passthru)) {
6663 		goto copyout;
6664 	}
6665 
6666 	if (!nvme_validate_vuc(nvme, &pass)) {
6667 		goto copyout;
6668 	}
6669 
6670 	mutex_enter(&nvme->n_mgmt_mutex);
6671 	if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6672 		/*
6673 		 * We've been told this has ns impact. Right now force that to
6674 		 * be every ns until we have more use cases and reason to trust
6675 		 * the nsid field.
6676 		 */
6677 		if (!nvme_no_blkdev_attached(nvme, NVME_NSID_BCAST)) {
6678 			mutex_exit(&nvme->n_mgmt_mutex);
6679 			(void) nvme_ioctl_error(&pass.npc_common,
6680 			    NVME_IOCTL_E_NS_BLKDEV_ATTACH, 0, 0);
6681 			goto copyout;
6682 		}
6683 	}
6684 
6685 	bzero(&sqe, sizeof (sqe));
6686 
6687 	sqe.sqe_opc = pass.npc_opcode;
6688 	sqe.sqe_nsid = pass.npc_common.nioc_nsid;
6689 	sqe.sqe_cdw10 = (uint32_t)(pass.npc_buflen >> NVME_DWORD_SHIFT);
6690 	sqe.sqe_cdw12 = pass.npc_cdw12;
6691 	sqe.sqe_cdw13 = pass.npc_cdw13;
6692 	sqe.sqe_cdw14 = pass.npc_cdw14;
6693 	sqe.sqe_cdw15 = pass.npc_cdw15;
6694 
6695 	args.ica_sqe = &sqe;
6696 	args.ica_data = (void *)pass.npc_buf;
6697 	args.ica_data_len = pass.npc_buflen;
6698 	args.ica_copy_flags = mode;
6699 	args.ica_timeout = pass.npc_timeout;
6700 
6701 	if ((pass.npc_flags & NVME_PASSTHRU_READ) != 0)
6702 		args.ica_dma_flags |= DDI_DMA_READ;
6703 	else if ((pass.npc_flags & NVME_PASSTHRU_WRITE) != 0)
6704 		args.ica_dma_flags |= DDI_DMA_WRITE;
6705 
6706 	if (nvme_ioc_cmd(nvme, &pass.npc_common, &args)) {
6707 		pass.npc_cdw0 = args.ica_cdw0;
6708 		if ((pass.npc_impact & NVME_IMPACT_NS) != 0) {
6709 			nvme_rescan_ns(nvme, NVME_NSID_BCAST);
6710 		}
6711 	}
6712 	mutex_exit(&nvme->n_mgmt_mutex);
6713 
6714 copyout:
6715 	rv = nvme_passthru_copyout_cmd(&pass, (void *)(uintptr_t)arg,
6716 	    mode);
6717 
6718 	return (rv);
6719 }
6720 
6721 static int
6722 nvme_ioctl_lock(nvme_minor_t *minor, intptr_t arg, int mode,
6723     cred_t *cred_p)
6724 {
6725 	nvme_ioctl_lock_t lock;
6726 	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;
6727 	nvme_t *nvme = minor->nm_ctrl;
6728 
6729 	if ((mode & FWRITE) == 0)
6730 		return (EBADF);
6731 
6732 	if (secpolicy_sys_config(cred_p, B_FALSE) != 0)
6733 		return (EPERM);
6734 
6735 	if (ddi_copyin((void *)(uintptr_t)arg, &lock, sizeof (lock),
6736 	    mode & FKIOCTL) != 0) {
6737 		return (EFAULT);
6738 	}
6739 
6740 	if (lock.nil_ent != NVME_LOCK_E_CTRL &&
6741 	    lock.nil_ent != NVME_LOCK_E_NS) {
6742 		(void) nvme_ioctl_error(&lock.nil_common,
6743 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
6744 		goto copyout;
6745 	}
6746 
6747 	if (lock.nil_level != NVME_LOCK_L_READ &&
6748 	    lock.nil_level != NVME_LOCK_L_WRITE) {
6749 		(void) nvme_ioctl_error(&lock.nil_common,
6750 		    NVME_IOCTL_E_BAD_LOCK_LEVEL, 0, 0);
6751 		goto copyout;
6752 	}
6753 
6754 	if ((lock.nil_flags & ~all_flags) != 0) {
6755 		(void) nvme_ioctl_error(&lock.nil_common,
6756 		    NVME_IOCTL_E_BAD_LOCK_FLAGS, 0, 0);
6757 		goto copyout;
6758 	}
6759 
6760 	if (!nvme_ioctl_check(minor, &lock.nil_common, &nvme_check_locking)) {
6761 		goto copyout;
6762 	}
6763 
6764 	/*
6765 	 * If we're on a namespace, confirm that we're not asking for the
6766 	 * controller.
6767 	 */
6768 	if (lock.nil_common.nioc_nsid != 0 &&
6769 	    lock.nil_ent == NVME_LOCK_E_CTRL) {
6770 		(void) nvme_ioctl_error(&lock.nil_common,
6771 		    NVME_IOCTL_E_NS_CANNOT_LOCK_CTRL, 0, 0);
6772 		goto copyout;
6773 	}
6774 
6775 	/*
6776 	 * We've reached the point where we can no longer actually check things
6777 	 * without serializing state. First, we need to check to make sure that
6778 	 * none of our invariants are being broken for locking:
6779 	 *
6780 	 * 1) The caller isn't already blocking for a lock operation to
6781 	 * complete.
6782 	 *
6783 	 * 2) The caller is attempting to grab a lock that they already have.
6784 	 * While there are other rule violations that this might create, we opt
6785 	 * to check this ahead of it so we can have slightly better error
6786 	 * messages for our callers.
6787 	 *
6788 	 * 3) The caller is trying to grab a controller lock, while holding a
6789 	 * namespace lock.
6790 	 *
6791 	 * 4) The caller has a controller write lock and is trying to get a
6792 	 * namespace lock. For now, we disallow this case. Holding a controller
6793 	 * read lock is allowed, but the write lock allows you to operate on all
6794 	 * namespaces anyways. In addition, this simplifies the locking logic;
6795 	 * however, this constraint may be loosened in the future.
6796 	 *
6797 	 * 5) The caller is trying to acquire a second namespace lock when they
6798 	 * already have one.
6799 	 */
6800 	mutex_enter(&nvme->n_minor_mutex);
6801 	if (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_BLOCKED ||
6802 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_BLOCKED) {
6803 		(void) nvme_ioctl_error(&lock.nil_common,
6804 		    NVME_IOCTL_E_LOCK_PENDING, 0, 0);
6805 		mutex_exit(&nvme->n_minor_mutex);
6806 		goto copyout;
6807 	}
6808 
6809 	if ((lock.nil_ent == NVME_LOCK_E_CTRL &&
6810 	    minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED) ||
6811 	    (lock.nil_ent == NVME_LOCK_E_NS &&
6812 	    minor->nm_ns_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6813 	    minor->nm_ns_lock.nli_ns->ns_id == lock.nil_common.nioc_nsid)) {
6814 		(void) nvme_ioctl_error(&lock.nil_common,
6815 		    NVME_IOCTL_E_LOCK_ALREADY_HELD, 0, 0);
6816 		mutex_exit(&nvme->n_minor_mutex);
6817 		goto copyout;
6818 	}
6819 
6820 	if (lock.nil_ent == NVME_LOCK_E_CTRL &&
6821 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6822 		(void) nvme_ioctl_error(&lock.nil_common,
6823 		    NVME_IOCTL_E_LOCK_NO_CTRL_WITH_NS, 0, 0);
6824 		mutex_exit(&nvme->n_minor_mutex);
6825 		goto copyout;
6826 	}
6827 
6828 	if (lock.nil_ent == NVME_LOCK_E_NS &&
6829 	    (minor->nm_ctrl_lock.nli_state == NVME_LOCK_STATE_ACQUIRED &&
6830 	    minor->nm_ctrl_lock.nli_curlevel == NVME_LOCK_L_WRITE)) {
6831 		(void) nvme_ioctl_error(&lock.nil_common,
6832 		    NVME_IOCTL_LOCK_NO_NS_WITH_CTRL_WRLOCK, 0, 0);
6833 		mutex_exit(&nvme->n_minor_mutex);
6834 		goto copyout;
6835 	}
6836 
6837 	if (lock.nil_ent == NVME_LOCK_E_NS &&
6838 	    minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_UNLOCKED) {
6839 		(void) nvme_ioctl_error(&lock.nil_common,
6840 		    NVME_IOCTL_E_LOCK_NO_2ND_NS, 0, 0);
6841 		mutex_exit(&nvme->n_minor_mutex);
6842 		goto copyout;
6843 	}
6844 
6845 
6846 #ifdef	DEBUG
6847 	/*
6848 	 * This is a big block of sanity checks to make sure that we haven't
6849 	 * allowed anything bad to happen.
6850 	 */
6851 	if (lock.nil_ent == NVME_LOCK_E_NS) {
6852 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6853 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6854 		    NVME_LOCK_STATE_UNLOCKED);
6855 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6856 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6857 
6858 		if (minor->nm_ns != NULL) {
6859 			ASSERT3U(minor->nm_ns->ns_id, ==,
6860 			    lock.nil_common.nioc_nsid);
6861 		}
6862 
6863 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6864 	} else {
6865 		ASSERT3P(minor->nm_ctrl_lock.nli_lock, ==, NULL);
6866 		ASSERT3U(minor->nm_ctrl_lock.nli_state, ==,
6867 		    NVME_LOCK_STATE_UNLOCKED);
6868 		ASSERT3U(minor->nm_ctrl_lock.nli_curlevel, ==, 0);
6869 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6870 		ASSERT0(list_link_active(&minor->nm_ctrl_lock.nli_node));
6871 
6872 		ASSERT3P(minor->nm_ns_lock.nli_lock, ==, NULL);
6873 		ASSERT3U(minor->nm_ns_lock.nli_state, ==,
6874 		    NVME_LOCK_STATE_UNLOCKED);
6875 		ASSERT3U(minor->nm_ns_lock.nli_curlevel, ==, 0);
6876 		ASSERT3P(minor->nm_ns_lock.nli_ns, ==, NULL);
6877 		ASSERT0(list_link_active(&minor->nm_ns_lock.nli_node));
6878 	}
6879 #endif	/* DEBUG */
6880 
6881 	/*
6882 	 * At this point we should actually attempt a locking operation.
6883 	 */
6884 	nvme_rwlock(minor, &lock);
6885 	mutex_exit(&nvme->n_minor_mutex);
6886 
6887 copyout:
6888 	if (ddi_copyout(&lock, (void *)(uintptr_t)arg, sizeof (lock),
6889 	    mode & FKIOCTL) != 0) {
6890 		return (EFAULT);
6891 	}
6892 
6893 	return (0);
6894 }
6895 
6896 static int
6897 nvme_ioctl_unlock(nvme_minor_t *minor, intptr_t arg, int mode,
6898     cred_t *cred_p)
6899 {
6900 	nvme_ioctl_unlock_t unlock;
6901 	nvme_t *const nvme = minor->nm_ctrl;
6902 	boolean_t is_ctrl;
6903 	nvme_lock_t *lock;
6904 	nvme_minor_lock_info_t *info;
6905 
6906 	/*
6907 	 * Note, we explicitly don't check for privileges for unlock. The idea
6908 	 * being that if you have the lock, that's what matters. If you don't
6909 	 * have the lock, it doesn't matter what privileges that you have at
6910 	 * all.
6911 	 */
6912 	if ((mode & FWRITE) == 0)
6913 		return (EBADF);
6914 
6915 	if (ddi_copyin((void *)(uintptr_t)arg, &unlock, sizeof (unlock),
6916 	    mode & FKIOCTL) != 0) {
6917 		return (EFAULT);
6918 	}
6919 
6920 	if (unlock.niu_ent != NVME_LOCK_E_CTRL &&
6921 	    unlock.niu_ent != NVME_LOCK_E_NS) {
6922 		(void) nvme_ioctl_error(&unlock.niu_common,
6923 		    NVME_IOCTL_E_BAD_LOCK_ENTITY, 0, 0);
6924 		goto copyout;
6925 	}
6926 
6927 	if (!nvme_ioctl_check(minor, &unlock.niu_common, &nvme_check_locking)) {
6928 		goto copyout;
6929 	}
6930 
6931 	/*
6932 	 * If we're on a namespace, confirm that we're not asking for the
6933 	 * controller.
6934 	 */
6935 	if (unlock.niu_common.nioc_nsid != 0 &&
6936 	    unlock.niu_ent == NVME_LOCK_E_CTRL) {
6937 		(void) nvme_ioctl_error(&unlock.niu_common,
6938 		    NVME_IOCTL_E_NS_CANNOT_UNLOCK_CTRL, 0, 0);
6939 		goto copyout;
6940 	}
6941 
6942 	mutex_enter(&nvme->n_minor_mutex);
6943 	if (unlock.niu_ent == NVME_LOCK_E_CTRL) {
6944 		if (minor->nm_ctrl_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
6945 			mutex_exit(&nvme->n_minor_mutex);
6946 			(void) nvme_ioctl_error(&unlock.niu_common,
6947 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6948 			goto copyout;
6949 		}
6950 	} else {
6951 		if (minor->nm_ns_lock.nli_ns == NULL) {
6952 			mutex_exit(&nvme->n_minor_mutex);
6953 			(void) nvme_ioctl_error(&unlock.niu_common,
6954 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6955 			goto copyout;
6956 		}
6957 
6958 		/*
6959 		 * Check that our unlock request corresponds to the namespace ID
6960 		 * that is currently locked. This could happen if we're using
6961 		 * the controller node and it specified a valid, but not locked,
6962 		 * namespace ID.
6963 		 */
6964 		if (minor->nm_ns_lock.nli_ns->ns_id !=
6965 		    unlock.niu_common.nioc_nsid) {
6966 			mutex_exit(&nvme->n_minor_mutex);
6967 			ASSERT3P(minor->nm_ns, ==, NULL);
6968 			(void) nvme_ioctl_error(&unlock.niu_common,
6969 			    NVME_IOCTL_E_LOCK_WRONG_NS, 0, 0);
6970 			goto copyout;
6971 		}
6972 
6973 		if (minor->nm_ns_lock.nli_state != NVME_LOCK_STATE_ACQUIRED) {
6974 			mutex_exit(&nvme->n_minor_mutex);
6975 			(void) nvme_ioctl_error(&unlock.niu_common,
6976 			    NVME_IOCTL_E_LOCK_NOT_HELD, 0, 0);
6977 			goto copyout;
6978 		}
6979 	}
6980 
6981 	/*
6982 	 * Finally, perform the unlock.
6983 	 */
6984 	is_ctrl = unlock.niu_ent == NVME_LOCK_E_CTRL;
6985 	if (is_ctrl) {
6986 		lock = &nvme->n_lock;
6987 		info = &minor->nm_ctrl_lock;
6988 	} else {
6989 		nvme_namespace_t *ns;
6990 		const uint32_t nsid = unlock.niu_common.nioc_nsid;
6991 
6992 		ns = nvme_nsid2ns(nvme, nsid);
6993 		lock = &ns->ns_lock;
6994 		info = &minor->nm_ns_lock;
6995 		VERIFY3P(ns, ==, info->nli_ns);
6996 	}
6997 	nvme_rwunlock(info, lock);
6998 	mutex_exit(&nvme->n_minor_mutex);
6999 	nvme_ioctl_success(&unlock.niu_common);
7000 
7001 copyout:
7002 	if (ddi_copyout(&unlock, (void *)(uintptr_t)arg, sizeof (unlock),
7003 	    mode & FKIOCTL) != 0) {
7004 		return (EFAULT);
7005 	}
7006 
7007 	return (0);
7008 }
7009 
7010 static int
7011 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
7012     int *rval_p)
7013 {
7014 #ifndef __lock_lint
7015 	_NOTE(ARGUNUSED(rval_p));
7016 #endif
7017 	nvme_minor_t *minor;
7018 	nvme_t *nvme;
7019 
7020 	minor = nvme_minor_find_by_dev(dev);
7021 	if (minor == NULL) {
7022 		return (ENXIO);
7023 	}
7024 
7025 	nvme = minor->nm_ctrl;
7026 	if (nvme == NULL)
7027 		return (ENXIO);
7028 
7029 	if (IS_DEVCTL(cmd))
7030 		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));
7031 
7032 	if (nvme->n_dead && (cmd != NVME_IOC_DETACH && cmd !=
7033 	    NVME_IOC_UNLOCK)) {
7034 		if (IS_NVME_IOC(cmd) == 0) {
7035 			return (EIO);
7036 		}
7037 
7038 		return (nvme_ioctl_copyout_error(nvme->n_dead_status, arg,
7039 		    mode));
7040 	}
7041 
7042 	/*
7043 	 * ioctls that are no longer using the original ioctl structure.
7044 	 */
7045 	switch (cmd) {
7046 	case NVME_IOC_CTRL_INFO:
7047 		return (nvme_ioctl_ctrl_info(minor, arg, mode, cred_p));
7048 	case NVME_IOC_IDENTIFY:
7049 		return (nvme_ioctl_identify(minor, arg, mode, cred_p));
7050 	case NVME_IOC_GET_LOGPAGE:
7051 		return (nvme_ioctl_get_logpage(minor, arg, mode, cred_p));
7052 	case NVME_IOC_GET_FEATURE:
7053 		return (nvme_ioctl_get_feature(minor, arg, mode, cred_p));
7054 	case NVME_IOC_DETACH:
7055 		return (nvme_ioctl_detach(minor, arg, mode, cred_p));
7056 	case NVME_IOC_ATTACH:
7057 		return (nvme_ioctl_attach(minor, arg, mode, cred_p));
7058 	case NVME_IOC_FORMAT:
7059 		return (nvme_ioctl_format(minor, arg, mode, cred_p));
7060 	case NVME_IOC_FIRMWARE_DOWNLOAD:
7061 		return (nvme_ioctl_firmware_download(minor, arg, mode,
7062 		    cred_p));
7063 	case NVME_IOC_FIRMWARE_COMMIT:
7064 		return (nvme_ioctl_firmware_commit(minor, arg, mode,
7065 		    cred_p));
7066 	case NVME_IOC_NS_INFO:
7067 		return (nvme_ioctl_ns_info(minor, arg, mode, cred_p));
7068 	case NVME_IOC_PASSTHRU:
7069 		return (nvme_ioctl_passthru(minor, arg, mode, cred_p));
7070 	case NVME_IOC_LOCK:
7071 		return (nvme_ioctl_lock(minor, arg, mode, cred_p));
7072 	case NVME_IOC_UNLOCK:
7073 		return (nvme_ioctl_unlock(minor, arg, mode, cred_p));
7074 	default:
7075 		return (ENOTTY);
7076 	}
7077 }
7078 
7079 /*
7080  * DDI UFM Callbacks
7081  */
7082 static int
7083 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
7084     ddi_ufm_image_t *img)
7085 {
7086 	nvme_t *nvme = arg;
7087 
7088 	if (imgno != 0)
7089 		return (EINVAL);
7090 
7091 	ddi_ufm_image_set_desc(img, "Firmware");
7092 	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);
7093 
7094 	return (0);
7095 }
7096 
7097 /*
7098  * Fill out firmware slot information for the requested slot.  The firmware
7099  * slot information is gathered by requesting the Firmware Slot Information log
7100  * page.  The format of the page is described in section 5.10.1.3.
7101  *
7102  * We lazily cache the log page on the first call and then invalidate the cache
7103  * data after a successful firmware download or firmware commit command.
7104  * The cached data is protected by a mutex as the state can change
7105  * asynchronous to this callback.
7106  */
7107 static int
7108 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
7109     uint_t slotno, ddi_ufm_slot_t *slot)
7110 {
7111 	nvme_t *nvme = arg;
7112 	void *log = NULL;
7113 	size_t bufsize;
7114 	ddi_ufm_attr_t attr = 0;
7115 	char fw_ver[NVME_FWVER_SZ + 1];
7116 
7117 	if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
7118 		return (EINVAL);
7119 
7120 	mutex_enter(&nvme->n_fwslot_mutex);
7121 	if (nvme->n_fwslot == NULL) {
7122 		if (!nvme_get_logpage_int(nvme, B_TRUE, &log, &bufsize,
7123 		    NVME_LOGPAGE_FWSLOT) ||
7124 		    bufsize != sizeof (nvme_fwslot_log_t)) {
7125 			if (log != NULL)
7126 				kmem_free(log, bufsize);
7127 			mutex_exit(&nvme->n_fwslot_mutex);
7128 			return (EIO);
7129 		}
7130 		nvme->n_fwslot = (nvme_fwslot_log_t *)log;
7131 	}
7132 
7133 	/*
7134 	 * NVMe numbers firmware slots starting at 1
7135 	 */
7136 	if (slotno == (nvme->n_fwslot->fw_afi - 1))
7137 		attr |= DDI_UFM_ATTR_ACTIVE;
7138 
7139 	if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
7140 		attr |= DDI_UFM_ATTR_WRITEABLE;
7141 
7142 	if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
7143 		attr |= DDI_UFM_ATTR_EMPTY;
7144 	} else {
7145 		(void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
7146 		    NVME_FWVER_SZ);
7147 		fw_ver[NVME_FWVER_SZ] = '\0';
7148 		ddi_ufm_slot_set_version(slot, fw_ver);
7149 	}
7150 	mutex_exit(&nvme->n_fwslot_mutex);
7151 
7152 	ddi_ufm_slot_set_attrs(slot, attr);
7153 
7154 	return (0);
7155 }
7156 
7157 static int
7158 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
7159 {
7160 	*caps = DDI_UFM_CAP_REPORT;
7161 	return (0);
7162 }
7163 
7164 boolean_t
7165 nvme_ctrl_atleast(nvme_t *nvme, const nvme_version_t *min)
7166 {
7167 	return (nvme_vers_atleast(&nvme->n_version, min) ? B_TRUE : B_FALSE);
7168 }
7169