xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision 1e56f352c1c208679012bca47d552e127f5b1072)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  * Copyright 2023 MNX Cloud, Inc.
17  */
18 
19 /*
20  * Mellanox Connect-X 4/5/6 driver.
21  */
22 
23 /*
24  * The PRM for this family of parts was freely available at:
25  *
26  * https://www.mellanox.com/related-docs/user_manuals/ \
27  *   Ethernet_Adapters_Programming_Manual.pdf
28  *
29  * but has since disappeared.
30  */
31 /*
32  * ConnectX glossary
33  * -----------------
34  *
35  * WR		Work Request: something we've asked the hardware to do by
36  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
37  *
38  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
39  *
40  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
41  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
42  *		types have different WQE structures, different commands for
43  *		creating and destroying them, etc, but share a common context
44  *		structure, counter setup and state graph.
45  * SQ		Send Queue, a specific type of WQ that sends packets
46  * RQ		Receive Queue, a specific type of WQ that receives packets
47  *
48  * CQ		Completion Queue: completion of WRs from a WQ is reported to
49  *		one of these, as a CQE on its entry ring.
50  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
51  *		info, as well as packet size, the ID of the WQ, and the index
52  *		of the WQE which completed. Does not contain any packet data.
53  *
54  * EQ		Event Queue: a ring of event structs from the hardware informing
55  *		us when particular events happen. Many events can point at a
56  *		particular CQ which we should then go look at.
57  * EQE		Event Queue Entry: an entry on the EQ ring
58  *
59  * UAR		User Access Region, a page of the device's PCI BAR which is
60  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
61  *		ring to arm them for interrupts or wake them up for new work
62  *
63  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
64  *		as a single unit (for e.g. hashing/RSS).
65  *
66  * TIR		Transport Interface Receive, a bucket of resources for the
67  *		reception of packets. TIRs have to point at either a single RQ
68  *		or a table of RQs (RQT). They then serve as a target for flow
69  *		table entries (FEs). TIRs that point at an RQT also contain the
70  *		settings for hashing for RSS.
71  *
72  * TIS		Transport Interface Send, a bucket of resources associated with
73  *		the transmission of packets. In particular, the temporary
74  *		resources used for LSO internally in the card are accounted to
75  *		a TIS.
76  *
77  * FT		Flow Table, a collection of FEs and FGs that can be referred to
78  *		as a single entity (e.g. used as a target from another flow
79  *		entry or set as the "root" table to handle incoming or outgoing
80  *		packets). Packets arriving at a FT are matched against the
81  *		FEs in the table until either one matches with a terminating
82  *		action or all FEs are exhausted (it's first-match-wins but with
83  *		some actions that are non-terminal, like counting actions).
84  *
85  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
86  *		they match on the same attributes of packets coming into the
87  *		flow).
88  *
89  * FE		Flow Entry, an individual set of values to match against
90  *		packets entering the flow table, combined with an action to
91  *		take upon a successful match. The action we use most is
92  *		"forward", which sends the packets to a TIR or another flow
93  *		table and then stops further processing within the FE's FT.
94  *
95  * lkey/mkey	A reference to something similar to a page table but in the
96  *		device's internal onboard MMU. Since Connect-X parts double as
97  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
98  *		features which we try very hard not to use. For our WQEs we use
99  *		the "reserved" lkey, which is a special value which indicates
100  *		that addresses we give are linear addresses and should not be
101  *		translated.
102  *
103  * PD		Protection Domain, an IB concept. We have to allocate one to
104  *		provide as a parameter for new WQs, but we don't do anything
105  *		with it.
106  *
107  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
108  *		provide it as a parameter to TIR/TIS creation, but we don't do
109  *		anything with it.
110  */
111 /*
112  *
113  * Data flow overview
114  * ------------------
115  *
116  * This driver is a MAC ring-enabled driver which maps rings to send and recv
117  * queues in hardware on the device.
118  *
119  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
120  * sufficient space, and simplify the logic needed to work out which buffer
121  * was completed.
122  *
123  * The CQs are then round-robin allocated onto EQs, of which we set up one per
124  * interrupt that the system gives us for the device. Normally this means we
125  * have 8 EQs.
126  *
127  * When we have >= 8 EQs available, we try to allocate only RX or only TX
128  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
129  *
130  * EQ #0 is reserved for all event types other than completion events, and has
131  * no CQs associated with it at any time. EQs #1 and upwards are only used for
132  * handling CQ completion events.
133  *
134  * +------+     +------+           +------+        +---------+
135  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
136  * +------+     +------+     |     +------+        +---------+
137  *                           |
138  * +------+     +------+     |
139  * | SQ 1 |---->| CQ 1 |---+ |     +------+
140  * +------+     +------+   | +---> |      |
141  *                         |       |      |
142  * +------+     +------+   |       | EQ 1 |        +---------+
143  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
144  * +------+     +------+   | +---> |      |        +---------+
145  *                         | |     +------+
146  *                         | |
147  *   ...                   | |
148  *                         | |     +------+
149  * +------+     +------+   +-----> |      |
150  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
151  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
152  *                           |     |      |        +---------+
153  * +------+     +------+     | +-> |      |
154  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
155  * +------+     +------+       |
156  *                             |     ....
157  * +------+     +------+       |
158  * | RQ 2 |---->| CQ 5 |-------+
159  * +------+     +------+
160  *
161  *   ... (note this diagram does not show RX-only or TX-only EQs)
162  *
163  * For TX, we advertise all of the SQs we create as plain rings to MAC with
164  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
165  * and use the rings as it sees fit.
166  *
167  * For RX, we advertise actual groups in order to make use of hardware
168  * classification.
169  *
170  * The hardware classification we use is based around Flow Tables, and we
171  * currently ignore all of the eswitch features of the card. The NIC VPORT
172  * is always set to promisc mode so that the eswitch sends us all of the
173  * traffic that arrives on the NIC, and we use flow entries to manage
174  * everything.
175  *
176  * We use 2 layers of flow tables for classification: traffic arrives at the
177  * root RX flow table which contains MAC address filters. Those then send
178  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
179  * presence and VID filters.
180  *
181  * Since these parts only support doing RSS hashing on a single protocol at a
182  * time, we have to use a third layer of flow tables as well to break traffic
183  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
184  * so that it can be sent to the appropriate TIR for hashing.
185  *
186  * Incoming packets
187  *        +           +---------+      +---------+
188  *        |        +->| group 0 |      | group 0 |
189  *        |        |  | vlan ft |  +-->| hash ft |
190  *        v        |  |   L1    |  |   |   L2    |
191  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
192  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
193  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
194  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
195  *        |        |  |         |  |   +---------+    +-----+    |     +------+
196  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
197  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
198  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
199  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
200  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
201  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
202  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
203  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
204  *   +---------+   |               ^   |  other  |-+
205  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
206  *   +---------+                   |               +->| TIR |--->| RQ0 |
207  *   |  MAC 1  |-+                 |                  +-----+    +-----+
208  *   +---------+ | +---------------+
209  *   |  MAC 2  |-+ |               ^
210  *   +---------+ | |               |
211  *   |  MAC 3  |-+ |  +---------+  |   +---------+
212  *   +---------+ | |  | group 1 |  |   | group 1 |
213  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
214  *   |         |   |  |   L1    |  | | |   L2    |
215  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
216  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
217  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
218  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
219  *                    |         |  |   +---------+    +-----+    |     +------+
220  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
221  *                    |         |  |   +---------+    +-----+    | RQT +------+
222  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
223  *                    |         |  |   +---------+    +-----+    |     |      |
224  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
225  *                    | promisc |--+   +---------+    +-----+    |     |      |
226  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
227  *                                     +---------+    +-----+    +-----+------+
228  *                                     |  other  |-+
229  *                                     +---------+ |
230  *                      .......                    |  +-----+    +-----+
231  *                                                 +->| TIR |--->| RQ3 |
232  *                                                    +-----+    +-----+
233  *
234  * Note that the "promisc" flow entries are only set/enabled when promisc
235  * mode is enabled for the NIC. All promisc flow entries point directly at
236  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
237  * the "default group" in MAC).
238  *
239  * The "default" entry in the L1 VLAN filter flow tables is used when there
240  * are no VLANs set for the group, to accept any traffic regardless of tag. It
241  * is deleted as soon as a VLAN filter is added (and re-instated if the
242  * last VLAN filter is removed).
243  *
244  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
245  * space for packet data (they're a collection of scatter pointers only). TX
246  * descriptors contain some space for "inline headers" (and the card requires
247  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
248  * but all the rest of the data comes from the gather pointers.
249  *
250  * When we get completions back they simply contain the ring index number of
251  * the WR (work request) which completed. So, we manage the buffers for actual
252  * packet data completely independently of the descriptors in this driver. When
253  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
254  * with the WQE index that we put it at, and therefore don't have to look at
255  * the original descriptor at all when handling completions.
256  *
257  * For RX, we create sufficient packet data buffers to fill 150% of the
258  * available descriptors for each ring. These all are pre-set-up for DMA and
259  * have an mblk_t associated with them (with desballoc()).
260  *
261  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
262  * large enough), or we copy it into a pre-allocated buffer set up in the same
263  * way as for RX.
264  */
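/*
 * As a rough sketch of the copy-vs-bind decision for TX described above
 * (mldp_tx_bind_threshold is the real tunable read in mlxcx_load_props();
 * the surrounding logic here is illustrative only, not the driver's actual
 * TX path):
 *
 *	if (msgsize(mp) >= mlxp->mlx_props.mldp_tx_bind_threshold) {
 *		// borrow the mblk's memory and DMA bind it ("foreign" buffer)
 *	} else {
 *		// copy the packet into a pre-allocated, pre-bound buffer
 *	}
 */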
265 
266 /*
267  * Buffer lifecycle: RX
268  * --------------------
269  *
270  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
271  * straightforward.
272  *
273  * It is created (and has all its memory allocated) at the time of starting up
274  * the RX ring it belongs to. Then it is placed on the "free" list in the
275  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
276  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
277  * before making a WQE for it.
278  *
279  * After a completion event occurs, the packet is either discarded (and the
280  * buffer_t returned to the free list), or it is readied for loaning to MAC
281  * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
282  *
283  * Once MAC and the rest of the system have finished with the packet, they call
284  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
285  * the fate of the buffer_t is determined by the state of the
286  * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
287  * will be returned to the free list, potentially to be recycled and used
288  * again. But if the shard is draining (e.g. after a ring stop) there will be
289  * no recycling and the buffer_t is immediately destroyed.
290  *
291  * At detach/teardown time, buffers are only ever destroyed from the free list.
292  *
293  *
294  *                         +
295  *                         |
296  *                         | mlxcx_buf_create
297  *                         |
298  *                         v
299  *                    +----+----+
300  *                    | created |
301  *                    +----+----+                        +------+
302  *                         |                             | dead |
303  *                         |                             +------+
304  *                         | mlxcx_buf_return                ^
305  *                         |                                 |
306  *                         v                                 | mlxcx_buf_destroy
307  * mlxcx_buf_destroy  +----+----+          +-----------+     |
308  *          +---------|  free   |<------no-| draining? |-yes-+
309  *          |         +----+----+          +-----------+
310  *          |              |                     ^
311  *          |              |                     |
312  *          v              | mlxcx_buf_take      | mlxcx_buf_return
313  *      +---+--+           v                     |
314  *      | dead |       +---+---+                 |
315  *      +------+       | on WQ |- - - - - - - - >O
316  *                     +---+---+                 ^
317  *                         |                     |
318  *                         |                     |
319  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
320  *                         v                     |
321  *                 +-------+--------+            |
322  *                 | on loan to MAC |----------->O
323  *                 +----------------+  freemsg()
324  *
325  */
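/*
 * A minimal sketch of the refill step described above, under the assumption
 * of a simple loop (mlbs_mtx, mlbs_free and mlbs_busy are real shard fields;
 * everything else here is illustrative, not the body of mlxcx_rq_refill()):
 *
 *	mutex_enter(&s->mlbs_mtx);
 *	while (nwanted > 0 &&
 *	    (b = list_remove_head(&s->mlbs_free)) != NULL) {
 *		list_insert_tail(&s->mlbs_busy, b);
 *		// mark b as "on WQ", then build a WQE pointing at its
 *		// DMA address
 *		nwanted--;
 *	}
 *	mutex_exit(&s->mlbs_mtx);
 */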
326 
327 /*
328  * Buffer lifecycle: TX
329  * --------------------
330  *
331  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
332  * "foreign" buffers.
333  *
334  * The former have their memory allocated and DMA bound by this driver, while
335  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
336  * not owned by us, though we do DMA bind it (and take responsibility for
337  * un-binding it when we're done with them).
338  *
339  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
340  * SQ. Thus, there is a separate free list and mutex for each kind.
341  *
342  * Since a TX packet might consist of multiple mblks, we translate each mblk
343  * into exactly one buffer_t. The buffer_ts are chained together in the same
344  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
345  *
346  * Each chain of TX buffers may consist of foreign or driver buffers, in any
347  * mixture.
348  *
349  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
350  * it from the rest of the chain buffers.
351  *
352  * TX buffer chains are always returned to the free list by
353  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
354  * freeing all of the members.
355  *
356  * We only call freemsg() once, on the head of the TX buffer chain's original
357  * mblk. This is true whether we copied it or bound it in a foreign buffer.
358  */
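/*
 * As a hedged sketch of how a TX chain is walked when it is given back
 * (mlb_tx_head and mlb_tx_chain are the real fields; this is not the
 * literal body of mlxcx_buf_return_chain()):
 *
 *	head = b->mlb_tx_head;
 *	while ((m = list_remove_head(&head->mlb_tx_chain)) != NULL)
 *		mlxcx_buf_return(mlxp, m);	// unbind/reset, back to free
 *	mlxcx_buf_return(mlxp, head);		// the head is returned last
 */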
359 
360 /*
361  * Startup and command interface
362  * -----------------------------
363  *
364  * The command interface is the primary way in which we give control orders to
365  * the hardware (e.g. actions like "create this queue" or "delete this flow
366  * entry"). The command interface is never used to transmit or receive packets
367  * -- that takes place only on the queues that are set up through it.
368  *
369  * In mlxcx_cmd.c we implement our use of the command interface on top of a
370  * simple taskq. As commands are submitted from the taskq they choose a
371  * "slot"; if there are no free slots, execution of the command will
372  * be paused until one is free. The hardware permits up to 32 independent
373  * slots for concurrent command execution.
374  *
375  * Before interrupts are enabled, command completion is polled; once
376  * interrupts are up, command completions become asynchronous and are
377  * wired to EQ 0. A caveat to this is that commands cannot be submitted
378  * directly from EQ 0's completion handler, and any processing resulting from
379  * an asynchronous event which requires further use of the command interface
380  * is posted through a taskq.
381  *
382  * The startup/attach process for this card involves a bunch of different steps
383  * which are summarised pretty well in the PRM. We have to send a number of
384  * commands which do different things to start the card up, give it some pages
385  * of our own memory for it to use, then start creating all the entities that
386  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
387  * and TDoms.
388  */
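/*
 * Conceptually, command submission from the taskq waits for a free slot in
 * a manner something like the sketch below. All of the names here are
 * hypothetical; the real implementation lives in mlxcx_cmd.c.
 *
 *	mutex_enter(&cmdq->mcq_lock);
 *	while (cmdq->mcq_free_slots == 0)
 *		cv_wait(&cmdq->mcq_slot_cv, &cmdq->mcq_lock);
 *	slot = pick_free_slot(cmdq);	// one of up to 32 concurrent slots
 *	cmdq->mcq_free_slots--;
 *	mutex_exit(&cmdq->mcq_lock);
 */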
389 
390 /*
391  * UARs
392  * ----
393  *
394  * The pages of the PCI BAR other than the first few are reserved for use as
395  * "UAR" sections in this device. Each UAR section can be used as a set of
396  * doorbells for our queues.
397  *
398  * Currently we just make a single UAR for all of our queues. It doesn't
399  * seem to be a major limitation yet.
400  *
401  * When we're sending packets through an SQ, the PRM is not awfully clear about
402  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
403  * (it's clear on the pattern of alternation you're expected to use between
404  * even and odd for Blueflame sends, but not for regular doorbells).
405  *
406  * Currently we don't do the even-odd alternating pattern for ordinary
407  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
408  * least on Connect-X4 Lx.
409  */
410 
411 /*
412  * Lock ordering
413  * -------------
414  *
415  * Interrupt side:
416  *
417  *  - mleq_mtx
418  *    - mlcq_arm_mtx
419  *      - mlcq_mtx
420  *        - mlcq_bufbmtx
421  *        - mlwq_mtx
422  *          - mlbs_mtx
423  *    - mlp_mtx
424  *
425  * GLD side:
426  *
427  *  - mlp_mtx
428  *    - mlg_mtx
429  *      - mlg_*.mlft_mtx
430  *    - mlp_*.mlft_mtx
431  *    - mlwq_mtx
432  *      - mlbs_mtx
433  *      - mlcq_bufbmtx
434  *  - mleq_mtx
435  *    - mlcq_arm_mtx
436  *      - mlcq_mtx
437  *
438  */
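/*
 * For example, code on the GLD side that needs both a port and one of its
 * ring groups must take the locks in the order given above. A sketch, not
 * a quote of any particular function ("g" here stands for a ring group):
 *
 *	mutex_enter(&port->mlp_mtx);	// mlp_mtx first
 *	mutex_enter(&g->mlg_mtx);	// then mlg_mtx
 *	// ... work on the group ...
 *	mutex_exit(&g->mlg_mtx);
 *	mutex_exit(&port->mlp_mtx);
 */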
439 
440 #include <sys/modctl.h>
441 #include <sys/conf.h>
442 #include <sys/devops.h>
443 #include <sys/sysmacros.h>
444 #include <sys/time.h>
445 
446 #include <sys/mac_provider.h>
447 
448 #include <mlxcx.h>
449 
450 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
451 
452 #define	MLXCX_MODULE_NAME	"mlxcx"
453 /*
454  * We give this to the firmware, so it has to be in a fixed format that it
455  * understands.
456  */
457 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
458 
459 /*
460  * Firmware may take a while to reclaim pages. Try a set number of times.
461  */
462 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
463 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
464 
465 static void *mlxcx_softstate;
466 
467 /*
468  * Fault detection thresholds.
469  */
470 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
471 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
472 
473 static void
474 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
475 {
476 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
477 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
478 
479 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
480 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
481 
482 	/*
483 	 * Currently we have different queue size defaults for two
484 	 * categories of queues. One set for devices which support a
485 	 * maximum speed of 10Gb/s, and another for those above that.
486 	 */
487 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
488 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 ||
489 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G |
490 	    MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G |
491 	    MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) {
492 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
493 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
494 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
495 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
496 	    MLXCX_PROTO_10G)) != 0 ||
497 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M |
498 	    MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) {
499 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
500 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
501 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
502 	} else {
503 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
504 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
505 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
506 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
507 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
508 	}
509 }
510 
511 /*
512  * Properties which may have different defaults based on hardware
513  * characteristics.
514  */
515 static void
516 mlxcx_load_model_props(mlxcx_t *mlxp)
517 {
518 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
519 
520 	mlxcx_load_prop_defaults(mlxp);
521 
522 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
523 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
524 	    p->mldp_cq_size_shift_default);
525 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
526 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
527 	    p->mldp_sq_size_shift_default);
528 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
529 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
530 	    p->mldp_rq_size_shift_default);
531 }
532 
533 static void
534 mlxcx_load_props(mlxcx_t *mlxp)
535 {
536 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
537 
538 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
540 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
541 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
542 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
543 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
544 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
545 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
546 	    MLXCX_CQEMOD_COUNT_DFLT);
547 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
548 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
549 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
550 
551 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
552 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
553 	    MLXCX_TX_NGROUPS_DFLT);
554 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
555 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
556 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
557 
558 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
559 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
560 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
561 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
562 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
563 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
564 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
565 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
566 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
567 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
568 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
569 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
570 
571 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
572 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
573 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
574 
575 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
576 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
577 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
578 
579 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
580 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
581 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
582 
583 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
584 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
585 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
586 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
587 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
588 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
589 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
590 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
591 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
592 
593 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
594 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
595 	    MLXCX_RX_PER_CQ_DEFAULT);
596 
597 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
598 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
599 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
600 		    "out of range. Defaulting to: %d. Valid values are from "
601 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
602 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
603 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
604 	}
605 }
606 
607 void
608 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
609 {
610 	va_list ap;
611 
612 	va_start(ap, fmt);
613 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
614 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
615 	} else {
616 		vcmn_err(CE_NOTE, fmt, ap);
617 	}
618 	va_end(ap);
619 }
620 
621 void
622 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
623 {
624 	va_list ap;
625 
626 	va_start(ap, fmt);
627 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
628 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
629 	} else {
630 		vcmn_err(CE_WARN, fmt, ap);
631 	}
632 	va_end(ap);
633 }
634 
635 void
636 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
637 {
638 	va_list ap;
639 
640 	va_start(ap, fmt);
641 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
642 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
643 	} else {
644 		vcmn_err(CE_PANIC, fmt, ap);
645 	}
646 	va_end(ap);
647 }
648 
649 uint16_t
650 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
651 {
652 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
653 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
654 }
655 
656 uint32_t
657 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
658 {
659 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
660 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
661 }
662 
663 uint64_t
664 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
665 {
666 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
667 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
668 }
669 
670 void
671 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
672 {
673 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
674 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
675 }
676 
677 void
678 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
679 {
680 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
681 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
682 }
683 
684 void
685 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
686 {
687 	/*
688 	 * The UAR is always inside the first BAR, which we mapped as
689 	 * mlx_regs
690 	 */
691 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
692 	    (uintptr_t)mlxp->mlx_regs_base;
693 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
694 }
695 
696 void
697 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
698 {
699 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
700 	    (uintptr_t)mlxp->mlx_regs_base;
701 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
702 }
703 
704 static void
705 mlxcx_fm_fini(mlxcx_t *mlxp)
706 {
707 	if (mlxp->mlx_fm_caps == 0)
708 		return;
709 
710 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
711 		ddi_fm_handler_unregister(mlxp->mlx_dip);
712 
713 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
714 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
715 		pci_ereport_teardown(mlxp->mlx_dip);
716 
717 	ddi_fm_fini(mlxp->mlx_dip);
718 
719 	mlxp->mlx_fm_caps = 0;
720 }
721 
722 void
723 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
724 {
725 	uint64_t ena;
726 	char buf[FM_MAX_CLASS];
727 
728 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
729 		return;
730 
731 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
732 	ena = fm_ena_generate(0, FM_ENA_FMT1);
733 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
734 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
735 	    NULL);
736 }
737 
738 static int
739 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
740 {
741 	/*
742 	 * as the driver can always deal with an error in any dma or
743 	 * access handle, we can just return the fme_status value.
744 	 */
745 	pci_ereport_post(dip, err, NULL);
746 	return (err->fme_status);
747 }
748 
749 static void
750 mlxcx_fm_init(mlxcx_t *mlxp)
751 {
752 	ddi_iblock_cookie_t iblk;
753 	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
754 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
755 
756 	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
757 	    DDI_PROP_DONTPASS, "fm_capable", def);
758 
759 	if (mlxp->mlx_fm_caps < 0) {
760 		mlxp->mlx_fm_caps = 0;
761 	}
762 	mlxp->mlx_fm_caps &= def;
763 
764 	if (mlxp->mlx_fm_caps == 0)
765 		return;
766 
767 	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
768 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
769 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
770 		pci_ereport_setup(mlxp->mlx_dip);
771 	}
772 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
773 		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
774 		    (void *)mlxp);
775 	}
776 }
777 
778 static void
779 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
780 {
781 	mlxcx_buffer_t *buf;
782 
783 	mutex_enter(&s->mlbs_mtx);
784 
785 	while (!list_is_empty(&s->mlbs_busy))
786 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
787 
788 	while (!list_is_empty(&s->mlbs_loaned))
789 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
790 
791 	while ((buf = list_head(&s->mlbs_free)) != NULL)
792 		mlxcx_buf_destroy(mlxp, buf);
793 
794 	list_destroy(&s->mlbs_free);
795 	list_destroy(&s->mlbs_busy);
796 	list_destroy(&s->mlbs_loaned);
797 	mutex_exit(&s->mlbs_mtx);
798 
799 	cv_destroy(&s->mlbs_free_nonempty);
800 	mutex_destroy(&s->mlbs_mtx);
801 }
802 
803 static void
804 mlxcx_teardown_bufs(mlxcx_t *mlxp)
805 {
806 	mlxcx_buf_shard_t *s;
807 
808 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
809 		mlxcx_mlbs_teardown(mlxp, s);
810 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
811 	}
812 	list_destroy(&mlxp->mlx_buf_shards);
813 
814 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
815 }
816 
817 static void
818 mlxcx_teardown_pages(mlxcx_t *mlxp)
819 {
820 	uint_t nzeros = 0;
821 	uint64_t *pas;
822 
823 	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
824 	    KM_SLEEP);
825 
826 	mutex_enter(&mlxp->mlx_pagemtx);
827 
828 	while (mlxp->mlx_npages > 0) {
829 		int32_t req, ret;
830 
831 		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
832 		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
833 
834 		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
835 			mlxcx_warn(mlxp, "hardware refused to return pages, "
836 			    "leaking %u remaining pages", mlxp->mlx_npages);
837 			goto out;
838 		}
839 
840 		for (int32_t i = 0; i < ret; i++) {
841 			mlxcx_dev_page_t *mdp, probe;
842 			bzero(&probe, sizeof (probe));
843 			probe.mxdp_pa = pas[i];
844 
845 			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
846 
847 			if (mdp != NULL) {
848 				avl_remove(&mlxp->mlx_pages, mdp);
849 				mlxp->mlx_npages--;
850 				mlxcx_dma_free(&mdp->mxdp_dma);
851 				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
852 			} else {
853 				mlxcx_panic(mlxp, "hardware returned a page "
854 				    "with PA 0x%" PRIx64 " but we have no "
855 				    "record of giving out such a page", pas[i]);
856 			}
857 		}
858 
859 		/*
860 		 * If no pages were returned, note that fact.
861 		 */
862 		if (ret == 0) {
863 			nzeros++;
864 			if (nzeros > mlxcx_reclaim_tries) {
865 				mlxcx_warn(mlxp, "hardware refused to return "
866 				    "pages, leaking %u remaining pages",
867 				    mlxp->mlx_npages);
868 				goto out;
869 			}
870 			delay(drv_usectohz(mlxcx_reclaim_delay));
871 		}
872 	}
873 
874 	avl_destroy(&mlxp->mlx_pages);
875 
876 out:
877 	mutex_exit(&mlxp->mlx_pagemtx);
878 	mutex_destroy(&mlxp->mlx_pagemtx);
879 
880 	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
881 }
882 
883 static boolean_t
884 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
885 {
886 	ddi_device_acc_attr_t acc;
887 	ddi_dma_attr_t attr;
888 	boolean_t ret;
889 	size_t sz, i;
890 
891 	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
892 
893 	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
894 	mleq->mleq_nents = (1 << mleq->mleq_entshift);
895 	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
896 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
897 
898 	mlxcx_dma_acc_attr(mlxp, &acc);
899 	mlxcx_dma_queue_attr(mlxp, &attr);
900 
901 	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
902 	    B_TRUE, sz, B_TRUE);
903 	if (!ret) {
904 		mlxcx_warn(mlxp, "failed to allocate EQ memory");
905 		return (B_FALSE);
906 	}
907 
908 	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;
909 
910 	for (i = 0; i < mleq->mleq_nents; ++i)
911 		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;
912 
913 	mleq->mleq_state |= MLXCX_EQ_ALLOC;
914 
915 	return (B_TRUE);
916 }
917 
918 static void
919 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
920 {
921 	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
922 	if (mleq->mleq_state & MLXCX_EQ_CREATED)
923 		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
924 
925 	mlxcx_dma_free(&mleq->mleq_dma);
926 	mleq->mleq_ent = NULL;
927 
928 	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
929 }
930 
931 void
932 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
933 {
934 	mlxcx_flow_group_t *fg;
935 	mlxcx_flow_entry_t *fe;
936 	int i;
937 
938 	ASSERT(mutex_owned(&ft->mlft_mtx));
939 
940 	for (i = ft->mlft_nents - 1; i >= 0; --i) {
941 		fe = &ft->mlft_ent[i];
942 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
943 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
944 				mlxcx_panic(mlxp, "failed to delete flow "
945 				    "entry %u on table %u", i,
946 				    ft->mlft_num);
947 			}
948 		}
949 	}
950 
951 	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
952 		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
953 		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
954 			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
955 				mlxcx_panic(mlxp, "failed to destroy flow "
956 				    "group %u", fg->mlfg_num);
957 			}
958 		}
959 		kmem_free(fg, sizeof (mlxcx_flow_group_t));
960 	}
961 	list_destroy(&ft->mlft_groups);
962 	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
963 	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
964 		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
965 			mlxcx_panic(mlxp, "failed to destroy flow table %u",
966 			    ft->mlft_num);
967 		}
968 	}
969 	kmem_free(ft->mlft_ent, ft->mlft_entsize);
970 	ft->mlft_ent = NULL;
971 	mutex_exit(&ft->mlft_mtx);
972 	mutex_destroy(&ft->mlft_mtx);
973 	kmem_free(ft, sizeof (mlxcx_flow_table_t));
974 }
975 
976 static void
977 mlxcx_teardown_ports(mlxcx_t *mlxp)
978 {
979 	uint_t i;
980 	mlxcx_port_t *p;
981 	mlxcx_flow_table_t *ft;
982 
983 	for (i = 0; i < mlxp->mlx_nports; ++i) {
984 		p = &mlxp->mlx_ports[i];
985 		if (!(p->mlp_init & MLXCX_PORT_INIT))
986 			continue;
987 		mutex_enter(&p->mlp_mtx);
988 		if ((ft = p->mlp_rx_flow) != NULL) {
989 			mutex_enter(&ft->mlft_mtx);
990 			/*
991 			 * teardown_flow_table() will destroy the mutex, so
992 			 * we don't release it here.
993 			 */
994 			mlxcx_teardown_flow_table(mlxp, ft);
995 		}
996 		mutex_exit(&p->mlp_mtx);
997 		mutex_destroy(&p->mlp_mtx);
998 		mutex_destroy(&p->mlx_port_event.mla_mtx);
999 		p->mlx_port_event.mla_mlx = NULL;
1000 		p->mlx_port_event.mla_port = NULL;
1001 		p->mlp_init &= ~MLXCX_PORT_INIT;
1002 	}
1003 
1004 	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
1005 	mlxp->mlx_ports = NULL;
1006 }
1007 
1008 static void
1009 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1010 {
1011 	mlxcx_work_queue_t *mlwq;
1012 
1013 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1014 		mlxcx_wq_teardown(mlxp, mlwq);
1015 	}
1016 	list_destroy(&mlxp->mlx_wqs);
1017 }
1018 
1019 static void
1020 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1021 {
1022 	mlxcx_completion_queue_t *mlcq;
1023 
1024 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1025 		mlxcx_cq_teardown(mlxp, mlcq);
1026 	}
1027 	list_destroy(&mlxp->mlx_cqs);
1028 }
1029 
1030 static void
1031 mlxcx_teardown_eqs(mlxcx_t *mlxp)
1032 {
1033 	mlxcx_event_queue_t *mleq;
1034 	uint_t i;
1035 
1036 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1037 		mleq = &mlxp->mlx_eqs[i];
1038 		mutex_enter(&mleq->mleq_mtx);
1039 		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
1040 		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
1041 			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
1042 				mlxcx_warn(mlxp, "failed to destroy "
1043 				    "event queue idx %u eqn %u",
1044 				    i, mleq->mleq_num);
1045 			}
1046 		}
1047 		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
1048 			mlxcx_eq_rele_dma(mlxp, mleq);
1049 		}
1050 		mutex_exit(&mleq->mleq_mtx);
1051 	}
1052 }
1053 
1054 static void
1055 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1056 {
1057 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1058 		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1059 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1060 		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1061 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1062 		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1063 }
1064 
1065 static void
1066 mlxcx_teardown(mlxcx_t *mlxp)
1067 {
1068 	uint_t i;
1069 	dev_info_t *dip = mlxp->mlx_dip;
1070 
1071 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1072 		/*
1073 		 * Disable interrupts and let any active vectors quiesce.
1074 		 */
1075 		mlxcx_intr_disable(mlxp);
1076 	}
1077 
1078 	if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
1079 		mlxcx_teardown_sensors(mlxp);
1080 		mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
1081 	}
1082 
1083 	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
1084 		mlxcx_teardown_checktimers(mlxp);
1085 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
1086 	}
1087 
1088 	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
1089 		mlxcx_teardown_groups(mlxp);
1090 		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
1091 	}
1092 
1093 	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
1094 		mlxcx_teardown_wqs(mlxp);
1095 		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
1096 	}
1097 
1098 	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
1099 		mlxcx_teardown_cqs(mlxp);
1100 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
1101 	}
1102 
1103 	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
1104 		mlxcx_teardown_bufs(mlxp);
1105 		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
1106 	}
1107 
1108 	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
1109 		mlxcx_teardown_ports(mlxp);
1110 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
1111 	}
1112 
1113 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1114 		mlxcx_teardown_eqs(mlxp);
1115 		mlxcx_intr_teardown(mlxp);
1116 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
1117 	}
1118 
1119 	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
1120 		if (mlxp->mlx_uar.mlu_allocated) {
1121 			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
1122 				mlxcx_warn(mlxp, "failed to release UAR");
1123 			}
1124 			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
1125 				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
1126 		}
1127 		if (mlxp->mlx_pd.mlpd_allocated &&
1128 		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
1129 			mlxcx_warn(mlxp, "failed to release PD");
1130 		}
1131 		if (mlxp->mlx_tdom.mltd_allocated &&
1132 		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
1133 			mlxcx_warn(mlxp, "failed to release TDOM");
1134 		}
1135 		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
1136 	}
1137 
1138 	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
1139 		if (!mlxcx_cmd_teardown_hca(mlxp)) {
1140 			mlxcx_warn(mlxp, "failed to send teardown HCA "
1141 			    "command during device detach");
1142 		}
1143 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
1144 	}
1145 
1146 	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
1147 		mlxcx_teardown_pages(mlxp);
1148 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
1149 	}
1150 
1151 	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
1152 		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
1153 			mlxp->mlx_npages_req[i].mla_mlx = NULL;
1154 			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
1155 		}
1156 		taskq_destroy(mlxp->mlx_async_tq);
1157 		mlxp->mlx_async_tq = NULL;
1158 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
1159 	}
1160 
1161 	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
1162 		if (!mlxcx_cmd_disable_hca(mlxp)) {
1163 			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
1164 			    "during device detach");
1165 		}
1166 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
1167 	}
1168 
1169 	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
1170 		mlxcx_cmd_queue_fini(mlxp);
1171 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
1172 	}
1173 
1174 	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
1175 		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
1176 		mlxp->mlx_caps = NULL;
1177 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
1178 	}
1179 
1180 	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
1181 		ddi_regs_map_free(&mlxp->mlx_regs_handle);
1182 		mlxp->mlx_regs_handle = NULL;
1183 		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
1184 	}
1185 
1186 	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
1187 		pci_config_teardown(&mlxp->mlx_cfg_handle);
1188 		mlxp->mlx_cfg_handle = NULL;
1189 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
1190 	}
1191 
1192 	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
1193 		mlxcx_fm_fini(mlxp);
1194 		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
1195 	}
1196 
1197 	VERIFY3S(mlxp->mlx_attach, ==, 0);
1198 	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
1199 	ddi_set_driver_private(dip, NULL);
1200 }
1201 
1202 static boolean_t
1203 mlxcx_regs_map(mlxcx_t *mlxp)
1204 {
1205 	off_t memsize;
1206 	int ret;
1207 	ddi_device_acc_attr_t da;
1208 
1209 	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
1210 	    DDI_SUCCESS) {
1211 		mlxcx_warn(mlxp, "failed to get register set size");
1212 		return (B_FALSE);
1213 	}
1214 
1215 	/*
1216 	 * All data in the main BAR is kept in big-endian even though it's a PCI
1217 	 * device.
1218 	 */
1219 	bzero(&da, sizeof (ddi_device_acc_attr_t));
1220 	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1221 	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1222 	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1223 	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1224 		da.devacc_attr_access = DDI_FLAGERR_ACC;
1225 	} else {
1226 		da.devacc_attr_access = DDI_DEFAULT_ACC;
1227 	}
1228 
1229 	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1230 	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1231 
1232 	if (ret != DDI_SUCCESS) {
1233 		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1234 		return (B_FALSE);
1235 	}
1236 
1237 	return (B_TRUE);
1238 }
1239 
1240 static boolean_t
1241 mlxcx_check_issi(mlxcx_t *mlxp)
1242 {
1243 	uint32_t issi;
1244 
1245 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1246 		mlxcx_warn(mlxp, "failed to get ISSI");
1247 		return (B_FALSE);
1248 	}
1249 
1250 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1251 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1252 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1253 		return (B_FALSE);
1254 	}
1255 
1256 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1257 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1258 		    MLXCX_CURRENT_ISSI);
1259 		return (B_FALSE);
1260 	}
1261 
1262 	return (B_TRUE);
1263 }
1264 
1265 boolean_t
1266 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
1267 {
1268 	ddi_device_acc_attr_t acc;
1269 	ddi_dma_attr_t attr;
1270 	int32_t i;
1271 	list_t plist;
1272 	mlxcx_dev_page_t *mdp;
1273 	mlxcx_dev_page_t **pages;
1274 	const ddi_dma_cookie_t *ck;
1275 
1276 	/*
1277 	 * If there are no pages required, then we're done here.
1278 	 */
1279 	if (npages <= 0) {
1280 		*ngiven = 0;
1281 		return (B_TRUE);
1282 	}
1283 
1284 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
1285 
1286 	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);
1287 
1288 	list_create(&plist, sizeof (mlxcx_dev_page_t),
1289 	    offsetof(mlxcx_dev_page_t, mxdp_list));
1290 
1291 	for (i = 0; i < npages; i++) {
1292 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1293 		mlxcx_dma_acc_attr(mlxp, &acc);
1294 		mlxcx_dma_page_attr(mlxp, &attr);
1295 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1296 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1297 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1298 			    npages);
1299 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1300 			goto cleanup_npages;
1301 		}
1302 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1303 		mdp->mxdp_pa = ck->dmac_laddress;
1304 
1305 		list_insert_tail(&plist, mdp);
1306 	}
1307 
1308 	/*
1309 	 * Now that all of the pages have been allocated, give them to the hardware
1310 	 * in chunks.
1311 	 */
1312 	for (i = 0; i < npages; i++) {
1313 		pages[i] = list_remove_head(&plist);
1314 	}
1315 
1316 	if (!mlxcx_cmd_give_pages(mlxp,
1317 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
1318 		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1319 		    "pages!", npages);
1320 		for (i = 0; i < npages; i++) {
1321 			list_insert_tail(&plist, pages[i]);
1322 		}
1323 		goto cleanup_npages;
1324 	}
1325 
1326 	mutex_enter(&mlxp->mlx_pagemtx);
1327 	for (i = 0; i < npages; i++) {
1328 		avl_add(&mlxp->mlx_pages, pages[i]);
1329 	}
1330 	mlxp->mlx_npages += npages;
1331 	mutex_exit(&mlxp->mlx_pagemtx);
1332 
1333 	list_destroy(&plist);
1334 	kmem_free(pages, sizeof (*pages) * npages);
1335 
1336 	*ngiven = npages;
1337 
1338 	return (B_TRUE);
1339 
1340 cleanup_npages:
1341 	kmem_free(pages, sizeof (*pages) * npages);
1342 	while ((mdp = list_remove_head(&plist)) != NULL) {
1343 		mlxcx_dma_free(&mdp->mxdp_dma);
1344 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1345 	}
1346 	list_destroy(&plist);
1347 	return (B_FALSE);
1348 }
1349 
1350 static boolean_t
1351 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1352 {
1353 	int32_t npages, given;
1354 
1355 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1356 		mlxcx_warn(mlxp, "failed to determine boot pages");
1357 		return (B_FALSE);
1358 	}
1359 
1360 	while (npages > 0) {
1361 		if (!mlxcx_give_pages(mlxp, npages, &given))
1362 			return (B_FALSE);
1363 
1364 		npages -= given;
1365 	}
1366 
1367 	return (B_TRUE);
1368 }
1369 
1370 static int
1371 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1372 {
1373 	mlxcx_t *mlxp = cookie;
1374 	mlxcx_buffer_t *b = arg;
1375 
1376 	bzero(b, sizeof (mlxcx_buffer_t));
1377 	b->mlb_mlx = mlxp;
1378 	b->mlb_state = MLXCX_BUFFER_INIT;
1379 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1380 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1381 
1382 	return (0);
1383 }
1384 
1385 static void
1386 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1387 {
1388 	mlxcx_t *mlxp = cookie;
1389 	mlxcx_buffer_t *b = arg;
1390 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1391 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1392 	list_destroy(&b->mlb_tx_chain);
1393 }
1394 
1395 mlxcx_buf_shard_t *
1396 mlxcx_mlbs_create(mlxcx_t *mlxp)
1397 {
1398 	mlxcx_buf_shard_t *s;
1399 
1400 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1401 
1402 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1403 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1404 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1405 	    offsetof(mlxcx_buffer_t, mlb_entry));
1406 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1407 	    offsetof(mlxcx_buffer_t, mlb_entry));
1408 	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1409 	    offsetof(mlxcx_buffer_t, mlb_entry));
1410 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1411 
1412 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1413 
1414 	return (s);
1415 }
1416 
1417 static boolean_t
1418 mlxcx_setup_bufs(mlxcx_t *mlxp)
1419 {
1420 	char namebuf[KSTAT_STRLEN];
1421 
1422 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1423 	    ddi_get_instance(mlxp->mlx_dip));
1424 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1425 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1426 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1427 	    NULL, mlxp, NULL, 0);
1428 
1429 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1430 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1431 
1432 	return (B_TRUE);
1433 }
1434 
1435 static void
1436 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1437     const char *state, uint8_t statenum)
1438 {
1439 	uint64_t ena;
1440 	char buf[FM_MAX_CLASS];
1441 
1442 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1443 		return;
1444 
1445 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1446 	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1447 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1448 
1449 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1450 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1451 	    "state", DATA_TYPE_STRING, state,
1452 	    "state_num", DATA_TYPE_UINT8, statenum,
1453 	    "qtype", DATA_TYPE_STRING, qtype,
1454 	    "qnum", DATA_TYPE_UINT32, qnum,
1455 	    NULL);
1456 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1457 }
1458 
1459 /*
1460  * The following set of routines are for monitoring the health of
1461  * event, completion and work queues. They run infrequently peeking at
1462  * the structs to catch stalls and inconsistent state.
1463  *
1464  * They peek at the structs *without* acquiring locks - we don't want
1465  * to impede flow of data. Driver start up and shutdown semantics
1466  * guarantee the structs are present and won't disappear underneath
1467  * these routines.
1468  *
1469  * As previously noted, the routines peek at active data in the structs and
1470  * they will store some values for comparison on next invocation. To
1471  * maintain integrity of the saved values, these values are only modified
1472  * within these routines.
1473  */
1474 static void
1475 mlxcx_eq_check(void *arg)
1476 {
1477 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1478 	mlxcx_event_queue_t *eq;
1479 	mlxcx_eventq_ctx_t ctx;
1480 	const char *str;
1481 
1482 	uint_t i;
1483 
1484 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1485 		eq = &mlxp->mlx_eqs[i];
1486 
1487 		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
1488 			continue;
1489 
1490 		/*
1491 		 * If the event queue was successfully created in the HCA,
1492 		 * then initialization and shutdown sequences guarantee
1493 		 * the queue exists.
1494 		 */
1495 		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);
1496 
1497 		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
1498 			continue;
1499 
1500 		str = "???";
1501 		switch (ctx.mleqc_status) {
1502 		case MLXCX_EQ_STATUS_OK:
1503 			break;
1504 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
1505 			str = "WRITE_FAILURE";
1506 			break;
1507 		}
1508 
1509 		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1510 			mlxcx_fm_qstate_ereport(mlxp, "event",
1511 			    eq->mleq_num, str, ctx.mleqc_status);
1512 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1513 			    eq->mleq_intr_index, ctx.mleqc_status, str);
1514 		}
1515 
1516 		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1517 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
1518 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1519 			    ++eq->mleq_check_disarm_cnt >= 3) {
1520 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1521 				mlxcx_warn(mlxp, "EQ %u isn't armed",
1522 				    eq->mleq_intr_index);
1523 			}
1524 			eq->mleq_check_disarm_cc = eq->mleq_cc;
1525 		} else {
1526 			eq->mleq_check_disarm_cc = 0;
1527 			eq->mleq_check_disarm_cnt = 0;
1528 		}
1529 	}
1530 }
1531 
1532 static void
1533 mlxcx_cq_check(void *arg)
1534 {
1535 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1536 	mlxcx_completion_queue_t *cq;
1537 	mlxcx_completionq_ctx_t ctx;
1538 	const char *str, *type;
1539 	uint_t v;
1540 
1541 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1542 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
1543 
1544 		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
1545 			continue;
1546 
1547 		/*
1548 		 * If the completion queue was successfully created in the HCA,
1549 		 * then initialization and shutdown sequences guarantee
1550 		 * the queue exists.
1551 		 */
1552 		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
1553 		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);
1554 
1555 		if (cq->mlcq_fm_repd_qstate)
1556 			continue;
1557 
1558 		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
1559 			continue;
1560 
1561 		if (cq->mlcq_wq != NULL) {
1562 			mlxcx_work_queue_t *wq = cq->mlcq_wq;
1563 			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1564 				type = "rx ";
1565 			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1566 				type = "tx ";
1567 			else
1568 				type = "";
1569 		} else {
1570 			type = "";
1571 		}
1572 
1573 		str = "???";
1574 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1575 		switch (v) {
1576 		case MLXCX_CQC_STATUS_OK:
1577 			break;
1578 		case MLXCX_CQC_STATUS_OVERFLOW:
1579 			str = "OVERFLOW";
1580 			break;
1581 		case MLXCX_CQC_STATUS_WRITE_FAIL:
1582 			str = "WRITE_FAIL";
1583 			break;
1584 		case MLXCX_CQC_STATUS_INVALID:
1585 			str = "INVALID";
1586 			break;
1587 		}
1588 
1589 		if (v != MLXCX_CQC_STATUS_OK) {
1590 			mlxcx_fm_qstate_ereport(mlxp, "completion",
1591 			    cq->mlcq_num, str, v);
1592 			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1593 			    type, cq->mlcq_num, v, str);
1594 			cq->mlcq_fm_repd_qstate = B_TRUE;
1595 		}
1596 
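		/*
		 * As for EQs above: if we think the CQ is armed (and it is
		 * not being polled) but the hardware reports otherwise, and
		 * its consumer counter hasn't advanced across three
		 * consecutive checks, report a stall.
		 */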
1597 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1598 		if (v != MLXCX_CQC_STATE_ARMED &&
1599 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1600 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1601 			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1602 			    ++cq->mlcq_check_disarm_cnt >= 3) {
1603 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1604 				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1605 				    type, cq->mlcq_num, cq);
1606 			}
1607 			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1608 		} else {
1609 			cq->mlcq_check_disarm_cnt = 0;
1610 			cq->mlcq_check_disarm_cc = 0;
1611 		}
1612 	}
1613 }
1614 
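/*
 * Cross-check a send queue's state in the HCA against our software state,
 * raising an ereport if the two disagree or the queue is in the error state.
 */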
1615 void
1616 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1617 {
1618 	mlxcx_sq_ctx_t ctx;
1619 	mlxcx_sq_state_t state;
1620 
1621 	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1622 		return;
1623 
1624 	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1625 	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1626 	switch (state) {
1627 	case MLXCX_SQ_STATE_RST:
1628 		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1629 			mlxcx_fm_qstate_ereport(mlxp, "send",
1630 			    sq->mlwq_num, "RST", state);
1631 			sq->mlwq_fm_repd_qstate = B_TRUE;
1632 		}
1633 		break;
1634 	case MLXCX_SQ_STATE_RDY:
1635 		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1636 			mlxcx_fm_qstate_ereport(mlxp, "send",
1637 			    sq->mlwq_num, "RDY", state);
1638 			sq->mlwq_fm_repd_qstate = B_TRUE;
1639 		}
1640 		break;
1641 	case MLXCX_SQ_STATE_ERR:
1642 		mlxcx_fm_qstate_ereport(mlxp, "send",
1643 		    sq->mlwq_num, "ERR", state);
1644 		sq->mlwq_fm_repd_qstate = B_TRUE;
1645 		break;
1646 	default:
1647 		mlxcx_fm_qstate_ereport(mlxp, "send",
1648 		    sq->mlwq_num, "???", state);
1649 		sq->mlwq_fm_repd_qstate = B_TRUE;
1650 		break;
1651 	}
1652 }
1653 
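/*
 * As above, but for receive queues.
 */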
1654 void
1655 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1656 {
1657 	mlxcx_rq_ctx_t ctx;
1658 	mlxcx_rq_state_t state;
1659 
1660 
1661 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1662 		return;
1663 
1664 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1665 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1666 	switch (state) {
1667 	case MLXCX_RQ_STATE_RST:
1668 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1669 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1670 			    rq->mlwq_num, "RST", state);
1671 			rq->mlwq_fm_repd_qstate = B_TRUE;
1672 		}
1673 		break;
1674 	case MLXCX_RQ_STATE_RDY:
1675 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1676 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1677 			    rq->mlwq_num, "RDY", state);
1678 			rq->mlwq_fm_repd_qstate = B_TRUE;
1679 		}
1680 		break;
1681 	case MLXCX_RQ_STATE_ERR:
1682 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1683 		    rq->mlwq_num, "ERR", state);
1684 		rq->mlwq_fm_repd_qstate = B_TRUE;
1685 		break;
1686 	default:
1687 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1688 		    rq->mlwq_num, "???", state);
1689 		rq->mlwq_fm_repd_qstate = B_TRUE;
1690 		break;
1691 	}
1692 }
1693 
1694 static void
1695 mlxcx_wq_check(void *arg)
1696 {
1697 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1698 	mlxcx_work_queue_t *wq;
1699 
1700 	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1701 	    wq = list_next(&mlxp->mlx_wqs, wq)) {
1702 
1703 		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
1704 			continue;
1705 
1706 		/*
1707 		 * If the work queue was successfully created in the HCA,
1708 		 * then initialization and shutdown sequences guarantee
1709 		 * the queue exists.
1710 		 */
1711 		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
1712 		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);
1713 
1714 		if (wq->mlwq_fm_repd_qstate)
1715 			continue;
1716 
1717 		switch (wq->mlwq_type) {
1718 		case MLXCX_WQ_TYPE_SENDQ:
1719 			mlxcx_check_sq(mlxp, wq);
1720 			break;
1721 		case MLXCX_WQ_TYPE_RECVQ:
1722 			mlxcx_check_rq(mlxp, wq);
1723 			break;
1724 		}
1725 	}
1726 }
1727 
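/*
 * Start the periodic health check routines above. A check interval
 * property of zero leaves the corresponding check disabled.
 */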
1728 static boolean_t
1729 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1730 {
1731 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1732 		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1733 		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1734 		    DDI_IPL_0);
1735 	}
1736 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1737 		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1738 		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1739 		    DDI_IPL_0);
1740 	}
1741 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1742 		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1743 		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1744 		    DDI_IPL_0);
1745 	}
1746 	return (B_TRUE);
1747 }
1748 
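/*
 * AVL comparator for flow entries, keyed by destination MAC and then
 * VLAN ID.
 */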
1749 int
1750 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1751 {
1752 	const mlxcx_flow_entry_t *left = arg0;
1753 	const mlxcx_flow_entry_t *right = arg1;
1754 	int bcmpr;
1755 
1756 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1757 	    sizeof (left->mlfe_dmac));
1758 	if (bcmpr < 0)
1759 		return (-1);
1760 	if (bcmpr > 0)
1761 		return (1);
1762 	if (left->mlfe_vid < right->mlfe_vid)
1763 		return (-1);
1764 	if (left->mlfe_vid > right->mlfe_vid)
1765 		return (1);
1766 	return (0);
1767 }
1768 
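/*
 * AVL comparator for group MAC entries, keyed by MAC address alone.
 */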
1769 int
1770 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1771 {
1772 	const mlxcx_group_mac_t *left = arg0;
1773 	const mlxcx_group_mac_t *right = arg1;
1774 	int bcmpr;
1775 
1776 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1777 	    sizeof (left->mlgm_mac));
1778 	if (bcmpr < 0)
1779 		return (-1);
1780 	if (bcmpr > 0)
1781 		return (1);
1782 	return (0);
1783 }
1784 
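/*
 * AVL comparator for device pages, keyed by physical address.
 */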
1785 int
1786 mlxcx_page_compare(const void *arg0, const void *arg1)
1787 {
1788 	const mlxcx_dev_page_t *p0 = arg0;
1789 	const mlxcx_dev_page_t *p1 = arg1;
1790 
1791 	if (p0->mxdp_pa < p1->mxdp_pa)
1792 		return (-1);
1793 	if (p0->mxdp_pa > p1->mxdp_pa)
1794 		return (1);
1795 	return (0);
1796 }
1797 
1798 static boolean_t
1799 mlxcx_setup_ports(mlxcx_t *mlxp)
1800 {
1801 	uint_t i, j;
1802 	mlxcx_port_t *p;
1803 	mlxcx_flow_table_t *ft;
1804 	mlxcx_flow_group_t *fg;
1805 	mlxcx_flow_entry_t *fe;
1806 
1807 	VERIFY3U(mlxp->mlx_nports, >, 0);
1808 	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1809 	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1810 
1811 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1812 		p = &mlxp->mlx_ports[i];
1813 		p->mlp_num = i;
1814 		p->mlx_port_event.mla_mlx = mlxp;
1815 		p->mlx_port_event.mla_port = p;
1816 		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
1817 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
1818 		p->mlp_init |= MLXCX_PORT_INIT;
1819 		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1820 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1821 		mutex_enter(&p->mlp_mtx);
1822 		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1823 			mutex_exit(&p->mlp_mtx);
1824 			goto err;
1825 		}
1826 		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1827 			mutex_exit(&p->mlp_mtx);
1828 			goto err;
1829 		}
1830 		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1831 			mutex_exit(&p->mlp_mtx);
1832 			goto err;
1833 		}
1834 		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1835 			mutex_exit(&p->mlp_mtx);
1836 			goto err;
1837 		}
1838 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1839 		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1840 			mutex_exit(&p->mlp_mtx);
1841 			goto err;
1842 		}
1843 		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
1844 			mutex_exit(&p->mlp_mtx);
1845 			goto err;
1846 		}
1847 		p->mlp_fec_requested = LINK_FEC_AUTO;
1848 
1849 		mutex_exit(&p->mlp_mtx);
1850 	}
1851 
1852 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1853 		p = &mlxp->mlx_ports[i];
1854 		mutex_enter(&p->mlp_mtx);
1855 		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1856 		    KM_SLEEP));
1857 		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1858 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1859 
1860 		mutex_enter(&ft->mlft_mtx);
1861 
1862 		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1863 		ft->mlft_port = p;
1864 		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1865 		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1866 			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1867 		ft->mlft_nents = (1 << ft->mlft_entshift);
1868 		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1869 		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1870 		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1871 		    offsetof(mlxcx_flow_group_t, mlfg_entry));
1872 
1873 		for (j = 0; j < ft->mlft_nents; ++j) {
1874 			ft->mlft_ent[j].mlfe_table = ft;
1875 			ft->mlft_ent[j].mlfe_index = j;
1876 		}
1877 
1878 		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1879 			mutex_exit(&ft->mlft_mtx);
1880 			mutex_exit(&p->mlp_mtx);
1881 			goto err;
1882 		}
1883 
1884 		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1885 			mutex_exit(&ft->mlft_mtx);
1886 			mutex_exit(&p->mlp_mtx);
1887 			goto err;
1888 		}
1889 
1890 		/*
1891 		 * We match broadcast at the top of the root flow table, then
1892 		 * all multicast/unicast MACs, and finally the promisc entry at
1893 		 * the very bottom.
1894 		 *
1895 		 * This way when promisc is on, that entry simply catches any
1896 		 * remaining traffic that earlier flows haven't matched.
1897 		 */
1898 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1899 		list_insert_tail(&ft->mlft_groups, fg);
1900 		fg->mlfg_table = ft;
1901 		fg->mlfg_size = 1;
1902 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1903 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1904 			mutex_exit(&ft->mlft_mtx);
1905 			mutex_exit(&p->mlp_mtx);
1906 			goto err;
1907 		}
1908 		p->mlp_bcast = fg;
1909 		fe = list_head(&fg->mlfg_entries);
1910 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1911 		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1912 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1913 
1914 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1915 		list_insert_tail(&ft->mlft_groups, fg);
1916 		fg->mlfg_table = ft;
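		/* Every entry bar the broadcast slot above and promisc below. */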
1917 		fg->mlfg_size = ft->mlft_nents - 2;
1918 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1919 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1920 			mutex_exit(&ft->mlft_mtx);
1921 			mutex_exit(&p->mlp_mtx);
1922 			goto err;
1923 		}
1924 		p->mlp_umcast = fg;
1925 
1926 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1927 		list_insert_tail(&ft->mlft_groups, fg);
1928 		fg->mlfg_table = ft;
1929 		fg->mlfg_size = 1;
1930 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1931 			mutex_exit(&ft->mlft_mtx);
1932 			mutex_exit(&p->mlp_mtx);
1933 			goto err;
1934 		}
1935 		p->mlp_promisc = fg;
1936 		fe = list_head(&fg->mlfg_entries);
1937 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1938 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1939 
1940 		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1941 		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1942 		    mlfe_dmac_entry));
1943 
1944 		mutex_exit(&ft->mlft_mtx);
1945 		mutex_exit(&p->mlp_mtx);
1946 	}
1947 
1948 	return (B_TRUE);
1949 
1950 err:
1951 	mlxcx_teardown_ports(mlxp);
1952 	return (B_FALSE);
1953 }
1954 
1955 void
1956 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1957 {
1958 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1959 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1960 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1961 	mlxcx_flow_entry_t *fe;
1962 	mlxcx_group_vlan_t *v;
1963 
1964 	ASSERT(mutex_owned(&g->mlg_mtx));
1965 
1966 	mutex_enter(&ft->mlft_mtx);
1967 
1968 	if (!list_is_empty(&g->mlg_rx_vlans)) {
1969 		fe = list_head(&dfg->mlfg_entries);
1970 		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
1971 	}
1972 
1973 	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
1974 		fe = v->mlgv_fe;
1975 		ASSERT3P(fe->mlfe_table, ==, ft);
1976 		ASSERT3P(fe->mlfe_group, ==, fg);
1977 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
1978 
1979 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1980 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1981 	}
1982 
1983 	mutex_exit(&ft->mlft_mtx);
1984 }
1985 
1986 boolean_t
1987 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1988     boolean_t tagged, uint16_t vid)
1989 {
1990 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1991 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1992 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1993 	mlxcx_flow_entry_t *fe;
1994 	mlxcx_group_vlan_t *v;
1995 	boolean_t found = B_FALSE;
1996 
1997 	ASSERT(mutex_owned(&g->mlg_mtx));
1998 
1999 	mutex_enter(&ft->mlft_mtx);
2000 
2001 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2002 	    v = list_next(&g->mlg_rx_vlans, v)) {
2003 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2004 			found = B_TRUE;
2005 			break;
2006 		}
2007 	}
2008 	if (!found) {
2009 		mutex_exit(&ft->mlft_mtx);
2010 		return (B_FALSE);
2011 	}
2012 
2013 	list_remove(&g->mlg_rx_vlans, v);
2014 
2015 	/*
2016 	 * If this is the last VLAN entry, we have to go back to accepting
2017 	 * any VLAN (which means re-enabling the default entry).
2018 	 *
2019 	 * Do this before we remove the flow entry for the last specific
2020 	 * VLAN so that we don't lose any traffic in the transition.
2021 	 */
2022 	if (list_is_empty(&g->mlg_rx_vlans)) {
2023 		fe = list_head(&dfg->mlfg_entries);
2024 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2025 			list_insert_tail(&g->mlg_rx_vlans, v);
2026 			mutex_exit(&ft->mlft_mtx);
2027 			return (B_FALSE);
2028 		}
2029 	}
2030 
2031 	fe = v->mlgv_fe;
2032 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
2033 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
2034 	ASSERT3P(fe->mlfe_table, ==, ft);
2035 	ASSERT3P(fe->mlfe_group, ==, fg);
2036 
2037 	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
2038 		list_insert_tail(&g->mlg_rx_vlans, v);
2039 		fe = list_head(&dfg->mlfg_entries);
2040 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
2041 			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2042 		}
2043 		mutex_exit(&ft->mlft_mtx);
2044 		return (B_FALSE);
2045 	}
2046 
2047 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2048 
2049 	kmem_free(v, sizeof (mlxcx_group_vlan_t));
2050 
2051 	mutex_exit(&ft->mlft_mtx);
2052 	return (B_TRUE);
2053 }
2054 
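/*
 * Add a VLAN filter entry for a ring group: take a free slot in the
 * group's VLAN flow group, program it into the hardware and, if this is
 * the first VLAN for the group, remove the catch-all default entry.
 */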
2055 boolean_t
2056 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
2057     uint16_t vid)
2058 {
2059 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2060 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2061 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2062 	mlxcx_flow_entry_t *fe;
2063 	mlxcx_group_vlan_t *v;
2064 	boolean_t found = B_FALSE;
2065 	boolean_t first = B_FALSE;
2066 
2067 	ASSERT(mutex_owned(&g->mlg_mtx));
2068 
2069 	mutex_enter(&ft->mlft_mtx);
2070 
2071 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2072 	    v = list_next(&g->mlg_rx_vlans, v)) {
2073 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2074 			mutex_exit(&ft->mlft_mtx);
2075 			return (B_TRUE);
2076 		}
2077 	}
2078 	if (list_is_empty(&g->mlg_rx_vlans))
2079 		first = B_TRUE;
2080 
2081 	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2082 	    fe = list_next(&fg->mlfg_entries, fe)) {
2083 		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2084 			found = B_TRUE;
2085 			break;
2086 		}
2087 	}
2088 	if (!found) {
2089 		mutex_exit(&ft->mlft_mtx);
2090 		return (B_FALSE);
2091 	}
2092 
2093 	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
2094 	v->mlgv_fe = fe;
2095 	v->mlgv_tagged = tagged;
2096 	v->mlgv_vid = vid;
2097 
2098 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2099 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2100 	fe->mlfe_vid = vid;
2101 	if (tagged) {
2102 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
2103 	} else {
2104 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
2105 	}
2106 
2107 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2108 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2109 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2110 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
2111 		mutex_exit(&ft->mlft_mtx);
2112 		return (B_FALSE);
2113 	}
2114 
2115 	list_insert_tail(&g->mlg_rx_vlans, v);
2116 
2117 	/*
2118 	 * If the VLAN list was empty for this group before adding this one,
2119 	 * then we no longer want the "default" entry to allow all VLANs
2120 	 * through.
2121 	 */
2122 	if (first) {
2123 		fe = list_head(&dfg->mlfg_entries);
2124 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2125 	}
2126 
2127 	mutex_exit(&ft->mlft_mtx);
2128 	return (B_TRUE);
2129 }
2130 
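/*
 * Remove every unicast/multicast MAC this ring group has claimed from the
 * port's root flow table, updating the shared flow entries' destination
 * lists and deleting any entry left with no destinations.
 */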
2131 void
2132 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
2133     mlxcx_ring_group_t *group)
2134 {
2135 	mlxcx_flow_entry_t *fe;
2136 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2137 	mlxcx_group_mac_t *gm, *ngm;
2138 
2139 	ASSERT(mutex_owned(&port->mlp_mtx));
2140 	ASSERT(mutex_owned(&group->mlg_mtx));
2141 
2142 	mutex_enter(&ft->mlft_mtx);
2143 
2144 	gm = avl_first(&group->mlg_rx_macs);
2145 	for (; gm != NULL; gm = ngm) {
2146 		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
2147 
2148 		ASSERT3P(gm->mlgm_group, ==, group);
2149 		fe = gm->mlgm_fe;
2150 		ASSERT3P(fe->mlfe_table, ==, ft);
2151 
2152 		avl_remove(&group->mlg_rx_macs, gm);
2153 		list_remove(&fe->mlfe_ring_groups, gm);
2154 		kmem_free(gm, sizeof (mlxcx_group_mac_t));
2155 
2156 		fe->mlfe_ndest = 0;
2157 		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2158 		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2159 			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2160 			    gm->mlgm_group->mlg_rx_vlan_ft;
2161 		}
2162 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2163 
2164 		if (fe->mlfe_ndest > 0) {
2165 			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2166 			continue;
2167 		}
2168 
2169 		/*
2170 		 * There are no more ring groups left for this MAC (it wasn't
2171 		 * attached to any other groups since ndest == 0), so clean up
2172 		 * its flow entry.
2173 		 */
2174 		avl_remove(&port->mlp_dmac_fe, fe);
2175 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2176 		list_destroy(&fe->mlfe_ring_groups);
2177 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2178 	}
2179 
2180 	mutex_exit(&ft->mlft_mtx);
2181 }
2182 
2183 boolean_t
2184 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2185     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2186 {
2187 	mlxcx_flow_entry_t *fe;
2188 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2189 	mlxcx_group_mac_t *gm, probe;
2190 
2191 	ASSERT(mutex_owned(&port->mlp_mtx));
2192 	ASSERT(mutex_owned(&group->mlg_mtx));
2193 
2194 	bzero(&probe, sizeof (probe));
2195 	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2196 
2197 	mutex_enter(&ft->mlft_mtx);
2198 
2199 	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2200 	if (gm == NULL) {
2201 		mutex_exit(&ft->mlft_mtx);
2202 		return (B_FALSE);
2203 	}
2204 	ASSERT3P(gm->mlgm_group, ==, group);
2205 	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2206 
2207 	fe = gm->mlgm_fe;
2208 	ASSERT3P(fe->mlfe_table, ==, ft);
2209 	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2210 
2211 	list_remove(&fe->mlfe_ring_groups, gm);
2212 	avl_remove(&group->mlg_rx_macs, gm);
2213 	kmem_free(gm, sizeof (mlxcx_group_mac_t));
2214 
2215 	fe->mlfe_ndest = 0;
2216 	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2217 	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2218 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2219 		    gm->mlgm_group->mlg_rx_vlan_ft;
2220 	}
2221 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2222 
2223 	if (fe->mlfe_ndest > 0) {
2224 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2225 			mutex_exit(&ft->mlft_mtx);
2226 			return (B_FALSE);
2227 		}
2228 		mutex_exit(&ft->mlft_mtx);
2229 		return (B_TRUE);
2230 	}
2231 
2232 	/*
2233 	 * There are no more ring groups left for this MAC (it wasn't attached
2234 	 * to any other groups since ndest == 0), so clean up its flow entry.
2235 	 */
2236 	avl_remove(&port->mlp_dmac_fe, fe);
2237 	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2238 	list_destroy(&fe->mlfe_ring_groups);
2239 
2240 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2241 
2242 	mutex_exit(&ft->mlft_mtx);
2243 
2244 	return (B_TRUE);
2245 }
2246 
2247 boolean_t
2248 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2249     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2250 {
2251 	mlxcx_flow_group_t *fg;
2252 	mlxcx_flow_entry_t *fe, probe;
2253 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2254 	mlxcx_group_mac_t *gm;
2255 	boolean_t found = B_FALSE;
2256 
2257 	ASSERT(mutex_owned(&port->mlp_mtx));
2258 	ASSERT(mutex_owned(&group->mlg_mtx));
2259 
2260 	bzero(&probe, sizeof (probe));
2261 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2262 
2263 	mutex_enter(&ft->mlft_mtx);
2264 
2265 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2266 
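	/*
	 * If no ring group on this port has claimed this MAC yet, reserve a
	 * fresh entry from the port's unicast/multicast flow group.
	 * Otherwise we simply add ourselves as another destination on the
	 * existing entry.
	 */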
2267 	if (fe == NULL) {
2268 		fg = port->mlp_umcast;
2269 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2270 		    fe = list_next(&fg->mlfg_entries, fe)) {
2271 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2272 				found = B_TRUE;
2273 				break;
2274 			}
2275 		}
2276 		if (!found) {
2277 			mutex_exit(&ft->mlft_mtx);
2278 			return (B_FALSE);
2279 		}
2280 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2281 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2282 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2283 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2284 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2285 
2286 		avl_add(&port->mlp_dmac_fe, fe);
2287 	}
2288 
2289 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2290 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2291 
2292 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2293 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2294 		if (--fe->mlfe_ndest == 0) {
2295 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2296 		}
2297 		mutex_exit(&ft->mlft_mtx);
2298 		return (B_FALSE);
2299 	}
2300 
2301 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2302 	gm->mlgm_group = group;
2303 	gm->mlgm_fe = fe;
2304 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2305 	avl_add(&group->mlg_rx_macs, gm);
2306 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2307 
2308 	mutex_exit(&ft->mlft_mtx);
2309 
2310 	return (B_TRUE);
2311 }
2312 
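/*
 * Carve a contiguous run of entries out of the flow table for this flow
 * group, create the group in the HCA, and thread the entries onto the
 * group's list.
 */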
2313 boolean_t
2314 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2315     mlxcx_flow_group_t *fg)
2316 {
2317 	mlxcx_flow_entry_t *fe;
2318 	uint_t i, idx;
2319 
2320 	ASSERT(mutex_owned(&ft->mlft_mtx));
2321 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2322 	ASSERT3P(fg->mlfg_table, ==, ft);
2323 
2324 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2325 		return (B_FALSE);
2326 	fg->mlfg_start_idx = ft->mlft_next_ent;
2327 
2328 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2329 		return (B_FALSE);
2330 	}
2331 
2332 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2333 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2334 	for (i = 0; i < fg->mlfg_size; ++i) {
2335 		idx = fg->mlfg_start_idx + i;
2336 		fe = &ft->mlft_ent[idx];
2337 		fe->mlfe_group = fg;
2338 		list_insert_tail(&fg->mlfg_entries, fe);
2339 	}
2340 	fg->mlfg_avail = fg->mlfg_size;
2341 	ft->mlft_next_ent += fg->mlfg_size;
2342 
2343 	return (B_TRUE);
2344 }
2345 
2346 static boolean_t
2347 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
2348 {
2349 	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];
2350 
2351 	mutex_enter(&mleq->mleq_mtx);
2352 	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2353 		/* mlxcx_teardown_eqs() will clean this up */
2354 		mutex_exit(&mleq->mleq_mtx);
2355 		return (B_FALSE);
2356 	}
2357 	mleq->mleq_mlx = mlxp;
2358 	mleq->mleq_uar = &mlxp->mlx_uar;
2359 	mleq->mleq_events = events;
2360 	mleq->mleq_intr_index = vec;
2361 
2362 	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2363 		/* mlxcx_teardown_eqs() will clean this up */
2364 		mutex_exit(&mleq->mleq_mtx);
2365 		return (B_FALSE);
2366 	}
2367 
2368 	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
2369 		/*
2370 		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2371 		 * eq_rele_dma
2372 		 */
2373 		mutex_exit(&mleq->mleq_mtx);
2374 		return (B_FALSE);
2375 	}
2376 	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2377 	mleq->mleq_state |= MLXCX_EQ_ATTACHING;
2378 	mlxcx_arm_eq(mlxp, mleq);
2379 	mutex_exit(&mleq->mleq_mtx);
2380 
2381 	return (B_TRUE);
2382 }
2383 
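/*
 * Clear the ATTACHING flag on every EQ once attach has progressed far
 * enough that the interrupt handlers can process all event types.
 */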
2384 static void
2385 mlxcx_eq_set_attached(mlxcx_t *mlxp)
2386 {
2387 	uint_t vec;
2388 	mlxcx_event_queue_t *mleq;
2389 
2390 	for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
2391 		mleq = &mlxp->mlx_eqs[vec];
2392 
2393 		mutex_enter(&mleq->mleq_mtx);
2394 		mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
2395 		mutex_exit(&mleq->mleq_mtx);
2396 	}
2397 }
2398 
2399 static boolean_t
2400 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2401 {
2402 	boolean_t ret;
2403 
2404 	ret = mlxcx_setup_eq(mlxp, 0,
2405 	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2406 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2407 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2408 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2409 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2410 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2411 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2412 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2413 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2414 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2415 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2416 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2417 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2418 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2419 
2420 	if (ret)
2421 		mlxcx_cmd_eq_enable(mlxp);
2422 
2423 	return (ret);
2424 }
2425 
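/*
 * Comparator ordering completion queues by their CQ number.
 */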
2426 int
2427 mlxcx_cq_compare(const void *arg0, const void *arg1)
2428 {
2429 	const mlxcx_completion_queue_t *left = arg0;
2430 	const mlxcx_completion_queue_t *right = arg1;
2431 
2432 	if (left->mlcq_num < right->mlcq_num) {
2433 		return (-1);
2434 	}
2435 	if (left->mlcq_num > right->mlcq_num) {
2436 		return (1);
2437 	}
2438 	return (0);
2439 }
2440 
2441 static boolean_t
2442 mlxcx_setup_eqs(mlxcx_t *mlxp)
2443 {
2444 	uint_t i;
2445 	mlxcx_event_queue_t *mleq;
2446 
2447 	ASSERT3S(mlxp->mlx_intr_count, >, 0);
2448 
2449 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
2450 		mleq = &mlxp->mlx_eqs[i];
2451 		mutex_enter(&mleq->mleq_mtx);
2452 		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2453 			mutex_exit(&mleq->mleq_mtx);
2454 			return (B_FALSE);
2455 		}
2456 		mleq->mleq_uar = &mlxp->mlx_uar;
2457 		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2458 			/* mlxcx_teardown() will handle calling eq_rele_dma */
2459 			mutex_exit(&mleq->mleq_mtx);
2460 			return (B_FALSE);
2461 		}
2462 		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2463 		    !mlxcx_cmd_set_int_mod(mlxp, i,
2464 		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
2465 			mutex_exit(&mleq->mleq_mtx);
2466 			return (B_FALSE);
2467 		}
2468 		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2469 			mutex_exit(&mleq->mleq_mtx);
2470 			return (B_FALSE);
2471 		}
2472 		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2473 		mlxcx_arm_eq(mlxp, mleq);
2474 		mutex_exit(&mleq->mleq_mtx);
2475 	}
2476 
2477 	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
2478 
2479 	return (B_TRUE);
2480 }
2481 
2482 /*
2483  * More recent ConnectX parts have the Port Capability Mask (PCAM) register.
2484  * Query it and note the capabilities we care about here.
2485  */
2486 static void
2487 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
2488 {
2489 	mlxcx_register_data_t data;
2490 	mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;
2491 
2492 	ASSERT(c->mlc_pcam);
2493 	bzero(&data, sizeof (data));
2494 
2495 	/*
2496 	 * Okay, so we have access to the Port Capability Mask (PCAM).
2497 	 * There are various things we need to check about it.
2498 	 */
2499 
2500 	VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
2501 	    MLXCX_REG_PCAM, &data));
2502 
2503 	/*
2504 	 * NOTE: These ASSERT()s may need to change for future ConnectX parts.
2505 	 * As of now, only 0 is valid, and 1-255 are reserved.  A future part
2506 	 * may return non-zero in these fields.
2507 	 */
2508 	ASSERT0(pcam->mlrd_pcam_feature_group);
2509 	ASSERT0(pcam->mlrd_pcam_access_reg_group);
2510 
2511 	c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
2512 	    MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
2513 }
2514 
2515 /*
2516  * Snapshot all of the hardware capabilities that we care about and then modify
2517  * the HCA capabilities to get things moving.
2518  */
2519 static boolean_t
2520 mlxcx_init_caps(mlxcx_t *mlxp)
2521 {
2522 	mlxcx_caps_t *c;
2523 
2524 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2525 
2526 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2527 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2528 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2529 	}
2530 
2531 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2532 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2533 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2534 	}
2535 
2536 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2537 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2538 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2539 	}
2540 
2541 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2542 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2543 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2544 	}
2545 
2546 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2547 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2548 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2549 	}
2550 
2551 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2552 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2553 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2554 	}
2555 
2556 	/*
2557 	 * Check that the caps meet our requirements.
2558 	 */
2559 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2560 
2561 	if (gen->mlcap_general_log_pg_sz != 12) {
2562 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2563 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2564 		goto err;
2565 	}
2566 	if (gen->mlcap_general_cqe_version != 1) {
2567 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2568 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2569 		goto err;
2570 	}
2571 	if (gen->mlcap_general_port_type !=
2572 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2573 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2574 		goto err;
2575 	}
2576 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2577 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2578 
2579 	if (get_bit16(gen->mlcap_general_flags_c,
2580 	    MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) {
2581 		c->mlc_pcam = B_TRUE;
2582 		mlxcx_explore_pcam(mlxp, c);
2583 	}
2584 
2585 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2586 
2587 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2588 	    MLXCX_ETH_CAP_CSUM_CAP);
2589 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2590 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2591 
2592 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2593 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2594 	if (c->mlc_max_lso_size == 1) {
2595 		c->mlc_max_lso_size = 0;
2596 		c->mlc_lso = B_FALSE;
2597 	} else {
2598 		c->mlc_lso = B_TRUE;
2599 	}
2600 
2601 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2602 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2603 
2604 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2605 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2606 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2607 		goto err;
2608 	}
2609 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2610 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2611 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2612 		    "flow table entries");
2613 		goto err;
2614 	}
2615 
2616 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2617 	    mlcap_flow_prop_log_max_ft_size;
2618 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2619 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2620 	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2621 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2622 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2623 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2624 
2625 	return (B_TRUE);
2626 
2627 err:
2628 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2629 	return (B_FALSE);
2630 }
2631 
2632 static int
2633 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2634 {
2635 	mlxcx_t *mlxp;
2636 
2637 	if (cmd != DDI_DETACH)
2638 		return (DDI_FAILURE);
2639 
2640 	mlxp = ddi_get_driver_private(dip);
2641 	if (mlxp == NULL) {
2642 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2643 		    "private data");
2644 		return (DDI_FAILURE);
2645 	}
2646 
2647 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2648 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2649 			return (DDI_FAILURE);
2650 		}
2651 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2652 	}
2653 
2654 	mlxcx_teardown(mlxp);
2655 	return (DDI_SUCCESS);
2656 }
2657 
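/*
 * Work out how many rx ring groups we can support: start from the
 * configured counts and clamp against the TIR, flow table size, flow
 * table count and total flow limits advertised by the hardware.
 */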
2658 static size_t
2659 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2660 {
2661 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2662 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2663 	size_t tirlim, flowlim, gflowlim;
2664 
2665 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2666 	if (tirlim < ngroups) {
2667 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2668 		    "on number of TIRs available", tirlim);
2669 		ngroups = tirlim;
2670 	}
2671 
2672 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2673 	if (flowlim < ngroups) {
2674 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2675 		    "on max size of RX flow tables", flowlim);
2676 		ngroups = flowlim;
2677 	}
2678 
2679 	/*
2680 	 * Restrict the number of groups so as not to exceed the maximum
2681 	 * number of flow tables from the device's capabilities.
2682 	 * There is one root flow table per port and two flow tables per
2683 	 * group.
2684 	 */
2685 	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2686 	if (flowlim < ngroups) {
2687 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2688 		    "on max number of RX flow tables",
2689 		    flowlim);
2690 		ngroups = flowlim;
2691 	}
2692 
2693 	do {
2694 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2695 		if (gflowlim < ngroups) {
2696 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2697 			    "based on max total RX flows", gflowlim);
2698 			--ngroups;
2699 		}
2700 	} while (gflowlim < ngroups);
2701 
2702 	return (ngroups);
2703 }
2704 
2705 static int
2706 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2707 {
2708 	mlxcx_t *mlxp;
2709 	char tq_name[TASKQ_NAMELEN];
2710 	uint_t i;
2711 	int inst, ret;
2712 
2713 	if (cmd != DDI_ATTACH)
2714 		return (DDI_FAILURE);
2715 
2716 	inst = ddi_get_instance(dip);
2717 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2718 	if (ret != 0)
2719 		return (ret);
2720 
2721 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2722 	if (mlxp == NULL)
2723 		return (DDI_FAILURE);
2724 	mlxp->mlx_dip = dip;
2725 	mlxp->mlx_inst = inst;
2726 	ddi_set_driver_private(dip, mlxp);
2727 
2728 	mlxcx_load_props(mlxp);
2729 
2730 	mlxcx_fm_init(mlxp);
2731 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2732 
2733 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2734 	    DDI_SUCCESS) {
2735 		mlxcx_warn(mlxp, "failed to initialize PCI config space");
2736 		goto err;
2737 	}
2738 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2739 
2740 	if (!mlxcx_regs_map(mlxp)) {
2741 		goto err;
2742 	}
2743 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2744 
2745 	if (!mlxcx_cmd_queue_init(mlxp)) {
2746 		goto err;
2747 	}
2748 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2749 
2750 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2751 		goto err;
2752 	}
2753 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2754 
2755 	if (!mlxcx_check_issi(mlxp)) {
2756 		goto err;
2757 	}
2758 
2759 	/*
2760 	 * We have to get our interrupts now so we know what priority to
2761 	 * create pagemtx with.
2762 	 */
2763 	if (!mlxcx_intr_setup(mlxp)) {
2764 		goto err;
2765 	}
2766 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2767 
2768 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2769 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2770 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2771 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2772 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2773 
2774 	/*
2775 	 * Taskq for asynchronous events which may interact with the HCA
2776 	 * via the command interface. Single threaded FIFO.
2777 	 */
2778 	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2779 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2780 	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2781 	    TASKQ_PREPOPULATE);
2782 	/*
2783 	 * Initialize any pre-allocated taskq param structs.
2784 	 */
2785 	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2786 		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2787 		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2788 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2789 	}
2790 	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2791 
2792 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2793 		goto err;
2794 	}
2795 
2796 	if (!mlxcx_init_caps(mlxp)) {
2797 		goto err;
2798 	}
2799 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2800 
2801 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2802 		goto err;
2803 	}
2804 
2805 	if (!mlxcx_cmd_init_hca(mlxp)) {
2806 		goto err;
2807 	}
2808 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2809 
2810 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2811 		goto err;
2812 	}
2813 
2814 	/*
2815 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2816 	 * doorbells.
2817 	 */
2818 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2819 		goto err;
2820 	}
2821 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2822 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2823 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2824 	}
2825 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2826 
2827 	/*
2828 	 * Set up asynchronous event queue which handles control type events
2829 	 * like PAGE_REQUEST and CMD completion events.
2830 	 *
2831 	 * This will enable and arm the interrupt on EQ 0. Note that only page
2832 	 * reqs and cmd completions will be handled until we call
2833 	 * mlxcx_eq_set_attached() further down (this way we don't need an
2834 	 * extra set of locks over the mlxcx_t sub-structs not allocated yet).
2835 	 */
2836 	if (!mlxcx_setup_async_eqs(mlxp)) {
2837 		goto err;
2838 	}
2839 
2840 	/*
2841 	 * Allocate a protection and transport domain. These don't really do
2842 	 * anything for us (they're IB concepts), but we need to give their
2843 	 * ID numbers in other commands.
2844 	 */
2845 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2846 		goto err;
2847 	}
2848 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2849 		goto err;
2850 	}
2851 	/*
2852 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2853 	 * work queue entries, rather than having to mess with the NIC's
2854 	 * internal MMU.
2855 	 */
2856 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2857 		goto err;
2858 	}
2859 
2860 	/*
2861 	 * Query our port information and current state, populate the
2862 	 * mlxcx_port_t structs.
2863 	 *
2864 	 * This also sets up the root flow tables and flow groups.
2865 	 */
2866 	if (!mlxcx_setup_ports(mlxp)) {
2867 		goto err;
2868 	}
2869 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2870 
2871 	mlxcx_load_model_props(mlxp);
2872 
2873 	/*
2874 	 * Set up, enable and arm the rest of the interrupt EQs which will
2875 	 * service events from CQs.
2876 	 *
2877 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2878 	 * cleaned up.
2879 	 */
2880 	if (!mlxcx_setup_eqs(mlxp)) {
2881 		goto err;
2882 	}
2883 
2884 	/* Completion queues */
2885 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2886 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2887 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2888 
2889 	/* Work queues (send queues, receive queues) */
2890 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2891 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2892 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2893 
2894 	/*
2895 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2896 	 * "groups" we advertise to MAC.
2897 	 */
2898 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2899 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2900 	    sizeof (mlxcx_ring_group_t);
2901 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2902 
2903 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2904 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2905 	    sizeof (mlxcx_ring_group_t);
2906 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2907 
2908 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2909 
2910 	/*
2911 	 * Sets up the free/busy buffers list for keeping track of packet
2912 	 * buffers.
2913 	 */
2914 	if (!mlxcx_setup_bufs(mlxp))
2915 		goto err;
2916 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2917 
2918 	/*
2919 	 * Before we tell MAC about our rings/groups, we need to do enough
2920 	 * setup on them to be sure about the numbers and configuration that
2921 	 * we have. This will do basically everything short of allocating
2922 	 * packet buffers and starting the rings up.
2923 	 */
2924 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2925 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2926 			goto err;
2927 	}
2928 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2929 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2930 			goto err;
2931 	}
2932 
2933 	/*
2934 	 * Set up the periodic fault check timers which check the queue
2935 	 * states. This must happen after all the queues have been
2936 	 * initialized; consequently, the timers must be torn down before
2937 	 * queue teardown.
2938 	 */
2939 	if (!mlxcx_setup_checktimers(mlxp)) {
2940 		goto err;
2941 	}
2942 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2943 
2944 	/*
2945 	 * Some devices may not have a working temperature sensor; however,
2946 	 * there isn't a great way for us to know. We shouldn't fail attach if
2947 	 * this doesn't work.
2948 	 */
2949 	if (mlxcx_setup_sensors(mlxp)) {
2950 		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
2951 	}
2952 
2953 	/*
2954 	 * Finally, tell MAC that we exist!
2955 	 */
2956 	if (!mlxcx_register_mac(mlxp)) {
2957 		goto err;
2958 	}
2959 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2960 
2961 	/*
2962 	 * This tells the interrupt handlers they can start processing events
2963 	 * other than cmd completions and page requests.
2964 	 */
2965 	mlxcx_eq_set_attached(mlxp);
2966 
2967 	return (DDI_SUCCESS);
2968 
2969 err:
2970 	mlxcx_teardown(mlxp);
2971 	return (DDI_FAILURE);
2972 }
2973 
2974 static struct cb_ops mlxcx_cb_ops = {
2975 	.cb_open = nulldev,
2976 	.cb_close = nulldev,
2977 	.cb_strategy = nodev,
2978 	.cb_print = nodev,
2979 	.cb_dump = nodev,
2980 	.cb_read = nodev,
2981 	.cb_write = nodev,
2982 	.cb_ioctl = nodev,
2983 	.cb_devmap = nodev,
2984 	.cb_mmap = nodev,
2985 	.cb_segmap = nodev,
2986 	.cb_chpoll = nochpoll,
2987 	.cb_prop_op = ddi_prop_op,
2988 	.cb_flag = D_MP,
2989 	.cb_rev = CB_REV,
2990 	.cb_aread = nodev,
2991 	.cb_awrite = nodev
2992 };
2993 
2994 static struct dev_ops mlxcx_dev_ops = {
2995 	.devo_rev = DEVO_REV,
2996 	.devo_refcnt = 0,
2997 	.devo_getinfo = NULL,
2998 	.devo_identify = nulldev,
2999 	.devo_probe = nulldev,
3000 	.devo_attach = mlxcx_attach,
3001 	.devo_detach = mlxcx_detach,
3002 	.devo_reset = nodev,
3003 	.devo_quiesce = ddi_quiesce_not_supported,
3004 	.devo_cb_ops = &mlxcx_cb_ops
3005 };
3006 
3007 static struct modldrv mlxcx_modldrv = {
3008 	.drv_modops = &mod_driverops,
3009 	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
3010 	.drv_dev_ops = &mlxcx_dev_ops
3011 };
3012 
3013 static struct modlinkage mlxcx_modlinkage = {
3014 	.ml_rev = MODREV_1,
3015 	.ml_linkage = { &mlxcx_modldrv, NULL }
3016 };
3017 
3018 int
3019 _init(void)
3020 {
3021 	int ret;
3022 
3023 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
3024 	if (ret != 0) {
3025 		return (ret);
3026 	}
3027 
3028 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
3029 
3030 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3031 		mac_fini_ops(&mlxcx_dev_ops);
3032 		ddi_soft_state_fini(&mlxcx_softstate);
3033 		return (ret);
3034 	}
3035 
3036 	return (DDI_SUCCESS);
3037 }
3038 
3039 int
3040 _info(struct modinfo *modinfop)
3041 {
3042 	return (mod_info(&mlxcx_modlinkage, modinfop));
3043 }
3044 
3045 int
3046 _fini(void)
3047 {
3048 	int ret;
3049 
3050 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3051 		return (ret);
3052 	}
3053 
3054 	mac_fini_ops(&mlxcx_dev_ops);
3055 
3056 	ddi_soft_state_fini(&mlxcx_softstate);
3057 
3058 	return (DDI_SUCCESS);
3059 }
3060