xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 /*
23  * The PRM for this family of parts is freely available, and can be found at:
24  * https://www.mellanox.com/related-docs/user_manuals/ \
25  *   Ethernet_Adapters_Programming_Manual.pdf
26  */
27 /*
28  * ConnectX glossary
29  * -----------------
30  *
31  * WR		Work Request: something we've asked the hardware to do by
32  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
33  *
34  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
35  *
36  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
37  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
38  *		types have different WQE structures, different commands for
39  *		creating and destroying them, etc, but share a common context
40  *		structure, counter setup and state graph.
41  * SQ		Send Queue, a specific type of WQ that sends packets
42  * RQ		Receive Queue, a specific type of WQ that receives packets
43  *
44  * CQ		Completion Queue: completion of WRs from a WQ are reported to
45  *		one of these, as a CQE on its entry ring.
46  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
47  *		info, as well as packet size, the ID of the WQ, and the index
48  *		of the WQE which completed. Does not contain any packet data.
49  *
50  * EQ		Event Queue: a ring of event structs from the hardware informing
51  *		us when particular events happen. Many events can point at a
52  *		a particular CQ which we should then go look at.
53  * EQE		Event Queue Entry: an entry on the EQ ring
54  *
55  * UAR		User Access Region, a page of the device's PCI BAR which is
56  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
57  *		ring to arm them for interrupts or wake them up for new work
58  *
59  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
60  *		as a single unit (for e.g. hashing/RSS).
61  *
62  * TIR		Transport Interface Receive, a bucket of resources for the
63  *		reception of packets. TIRs have to point at either a single RQ
64  *		or a table of RQs (RQT). They then serve as a target for flow
65  *		table entries (FEs). TIRs that point at an RQT also contain the
66  *		settings for hashing for RSS.
67  *
68  * TIS		Transport Interface Send, a bucket of resources associated with
69  *		the transmission of packets. In particular, the temporary
70  *		resources used for LSO internally in the card are accounted to
71  *		a TIS.
72  *
73  * FT		Flow Table, a collection of FEs and FGs that can be referred to
74  *		as a single entity (e.g. used as a target from another flow
75  *		entry or set as the "root" table to handle incoming or outgoing
76  *		packets). Packets arriving at a FT are matched against the
77  *		FEs in the table until either one matches with a terminating
78  *		action or all FEs are exhausted (it's first-match-wins but with
79  *		some actions that are non-terminal, like counting actions).
80  *
81  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
82  *		they match on the same attributes of packets coming into the
83  *		flow).
84  *
85  * FE		Flow Entry, an individual set of values to match against
86  *		packets entering the flow table, combined with an action to
87  *		take upon a successful match. The action we use most is
88  *		"forward", which sends the packets to a TIR or another flow
89  *		table and then stops further processing within the FE's FT.
90  *
91  * lkey/mkey	A reference to something similar to a page table but in the
92  *		device's internal onboard MMU. Since Connect-X parts double as
93  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
94  *		features which we try very hard not to use. For our WQEs we use
95  *		the "reserved" lkey, which is a special value which indicates
96  *		that addresses we give are linear addresses and should not be
97  *		translated.
98  *
99  * PD		Protection Domain, an IB concept. We have to allocate one to
100  *		provide as a parameter for new WQs, but we don't do anything
101  *		with it.
102  *
103  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
104  *		provide it as a parameter to TIR/TIS creation, but we don't do
105  *		anything with it.
106  */
107 /*
108  *
109  * Data flow overview
110  * ------------------
111  *
112  * This driver is a MAC ring-enabled driver which maps rings to send and recv
113  * queues in hardware on the device.
114  *
115  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
116  * sufficient space, and simplify the logic needed to work out which buffer
117  * was completed.
118  *
119  * The CQs are then round-robin allocated onto EQs, of which we set up one per
120  * interrupt that the system gives us for the device. Normally this means we
121  * have 8 EQs.
122  *
123  * When we have >= 8 EQs available, we try to allocate only RX or only TX
124  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
125  *
126  * EQ #0 is reserved for all event types other than completion events, and has
127  * no CQs associated with it at any time. EQs #1 and upwards are only used for
128  * handling CQ completion events.
129  *
130  * +------+     +------+           +------+        +---------+
131  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
132  * +------+     +------+     |     +------+        +---------+
133  *                           |
134  * +------+     +------+     |
135  * | SQ 1 |---->| CQ 1 |---+ |     +------+
136  * +------+     +------+   | +---> |      |
137  *                         |       |      |
138  * +------+     +------+   |       | EQ 1 |        +---------+
139  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
140  * +------+     +------+   | +---> |      |        +---------+
141  *                         | |     +------+
142  *                         | |
143  *   ...                   | |
144  *                         | |     +------+
145  * +------+     +------+   +-----> |      |
146  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
147  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
148  *                           |     |      |        +---------+
149  * +------+     +------+     | +-> |      |
150  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
151  * +------+     +------+       |
152  *                             |     ....
153  * +------+     +------+       |
154  * | RQ 2 |---->| CQ 5 |-------+
155  * +------+     +------+
156  *
157  *   ... (note this diagram does not show RX-only or TX-only EQs)
158  *
159  * For TX, we advertise all of the SQs we create as plain rings to MAC with
160  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
161  * and use the rings as it sees fit.
162  *
163  * For RX, we advertise actual groups in order to make use of hardware
164  * classification.
165  *
166  * The hardware classification we use is based around Flow Tables, and we
167  * currently ignore all of the eswitch features of the card. The NIC VPORT
168  * is always set to promisc mode so that the eswitch sends us all of the
169  * traffic that arrives on the NIC, and we use flow entries to manage
170  * everything.
171  *
172  * We use 2 layers of flow tables for classification: traffic arrives at the
173  * root RX flow table which contains MAC address filters. Those then send
174  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
175  * presence and VID filters.
176  *
177  * Since these parts only support doing RSS hashing on a single protocol at a
178  * time, we have to use a third layer of flow tables as well to break traffic
179  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
180  * so that it can be sent to the appropriate TIR for hashing.
181  *
182  * Incoming packets
183  *        +           +---------+      +---------+
184  *        |        +->| group 0 |      | group 0 |
185  *        |        |  | vlan ft |  +-->| hash ft |
186  *        v        |  |   L1    |  |   |   L2    |
187  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
188  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
189  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
190  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
191  *        |        |  |         |  |   +---------+    +-----+    |     +------+
192  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
193  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
194  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
195  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
196  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
197  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
198  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
199  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
200  *   +---------+   |               ^   |  other  |-+
201  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
202  *   +---------+                   |               +->| TIR |--->| RQ0 |
203  *   |  MAC 1  |-+                 |                  +-----+    +-----+
204  *   +---------+ | +---------------+
205  *   |  MAC 2  |-+ |               ^
206  *   +---------+ | |               |
207  *   |  MAC 3  |-+ |  +---------+  |   +---------+
208  *   +---------+ | |  | group 1 |  |   | group 1 |
209  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
210  *   |         |   |  |   L1    |  | | |   L2    |
211  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
212  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
213  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
214  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
215  *                    |         |  |   +---------+    +-----+    |     +------+
216  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
217  *                    |         |  |   +---------+    +-----+    | RQT +------+
218  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
219  *                    |         |  |   +---------+    +-----+    |     |      |
220  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
221  *                    | promisc |--+   +---------+    +-----+    |     |      |
222  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
223  *                                     +---------+    +-----+    +-----+------+
224  *                                     |  other  |-+
225  *                                     +---------+ |
226  *                      .......                    |  +-----+    +-----+
227  *                                                 +->| TIR |--->| RQ3 |
228  *                                                    +-----+    +-----+
229  *
230  * Note that the "promisc" flow entries are only set/enabled when promisc
231  * mode is enabled for the NIC. All promisc flow entries point directly at
232  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
233  * the "default group" in MAC).
234  *
235  * The "default" entry in the L1 VLAN filter flow tables is used when there
236  * are no VLANs set for the group, to accept any traffic regardless of tag. It
237  * is deleted as soon as a VLAN filter is added (and re-instated if the
238  * last VLAN filter is removed).
239  *
240  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
241  * space for packet data (they're a collection of scatter pointers only). TX
242  * descriptors contain some space for "inline headers" (and the card requires
243  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
244  * but all the rest of the data comes from the gather pointers.
245  *
246  * When we get completions back they simply contain the ring index number of
247  * the WR (work request) which completed. So, we manage the buffers for actual
248  * packet data completely independently of the descriptors in this driver. When
249  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
250  * with the WQE index that we put it at, and therefore don't have to look at
251  * the original descriptor at all when handling completions.
252  *
253  * For RX, we create sufficient packet data buffers to fill 150% of the
254  * available descriptors for each ring. These all are pre-set-up for DMA and
255  * have an mblk_t associated with them (with desballoc()).
256  *
257  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
258  * large enough), or we copy it into a pre-allocated buffer set up in the same
259  * as as for RX.
260  */
261 
262 /*
263  * Buffer lifecycle: RX
264  * --------------------
265  *
266  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
267  * straightforward.
268  *
269  * It is created (and has all its memory allocated) at the time of starting up
270  * the RX ring it belongs to. Then it is placed on the "free" list in the
271  * mlxcx_buf_shard_t associated with its RQ. When mlxcx_rq_refill() wants
272  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
273  * before making a WQE for it.
274  *
275  * After a completion event occurs, the packet is either discarded (and the
276  * buffer_t returned to the free list), or it is readied for loaning to MAC
277  * and placed on the "loaned" list in the mlxcx_buf_shard_t.
278  *
279  * Once MAC and the rest of the system have finished with the packet, they call
280  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
281  * the fate of the buffer_t is determined by the state of the
282  * mlxcx_buf_shard_t. When the shard is in its normal state the buffer_t
283  * will be returned to the free list, potentially to be recycled and used
284  * again. But if the shard is draining (e.g. after a ring stop) there will be
285  * no recycling and the buffer_t is immediately destroyed.
286  *
287  * At detach/teardown time, buffers are only ever destroyed from the free list.
288  *
289  *
290  *                         +
291  *                         |
292  *                         | mlxcx_buf_create
293  *                         |
294  *                         v
295  *                    +----+----+
296  *                    | created |
297  *                    +----+----+                        +------+
298  *                         |                             | dead |
299  *                         |                             +------+
300  *                         | mlxcx_buf_return                ^
301  *                         |                                 |
302  *                         v                                 | mlxcx_buf_destroy
303  * mlxcx_buf_destroy  +----+----+          +-----------+     |
304  *          +---------|  free   |<------no-| draining? |-yes-+
305  *          |         +----+----+          +-----------+
306  *          |              |                     ^
307  *          |              |                     |
308  *          v              | mlxcx_buf_take      | mlxcx_buf_return
309  *      +---+--+           v                     |
310  *      | dead |       +---+---+                 |
311  *      +------+       | on WQ |- - - - - - - - >O
312  *                     +---+---+                 ^
313  *                         |                     |
314  *                         |                     |
315  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
316  *                         v                     |
317  *                 +-------+--------+            |
318  *                 | on loan to MAC |----------->O
319  *                 +----------------+  freemsg()
320  *
321  */
322 
323 /*
324  * Buffer lifecycle: TX
325  * --------------------
326  *
327  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
328  * "foreign" buffers.
329  *
330  * The former have their memory allocated and DMA bound by this driver, while
331  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
332  * not owned by us, though we do DMA bind it (and take responsibility for
333  * un-binding it when we're done with them).
334  *
335  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
336  * SQ. Thus, there is a separate free list and mutex for each kind.
337  *
338  * Since a TX packet might consist of multiple mblks, we translate each mblk
339  * into exactly one buffer_t. The buffer_ts are chained together in the same
340  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
341  *
342  * Each chain of TX buffers may consist of foreign or driver buffers, in any
343  * mixture.
344  *
345  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
346  * it from the rest of the chain buffers.
347  *
348  * TX buffer chains are always returned to the free list by
349  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
350  * freeing all of the members.
351  *
352  * We only call freemsg() once, on the head of the TX buffer chain's original
353  * mblk. This is true whether we copied it or bound it in a foreign buffer.
354  */
355 
356 /*
357  * Startup and command interface
358  * -----------------------------
359  *
360  * The command interface is the primary way in which we give control orders to
361  * the hardware (e.g. actions like "create this queue" or "delete this flow
362  * entry"). The command interface is never used to transmit or receive packets
363  * -- that takes place only on the queues that are set up through it.
364  *
365  * In mlxcx_cmd.c we implement our use of the command interface on top of a
366  * simple taskq. As commands are submitted from the taskq they choose a
367  * "slot"; if there are no free slots then execution of the command will
368  * be paused until one is free. The hardware permits up to 32 independent
369  * slots for concurrent command execution.
370  *
371  * Before interrupts are enabled, command completion is polled; once
372  * interrupts are up, command completions become asynchronous and are
373  * wired to EQ 0. A caveat to this is commands can not be submitted
374  * directly from EQ 0's completion handler, and any processing resulting from
375  * an asynchronous event which requires further use of the command interface
376  * is posted through a taskq.
377  *
378  * The startup/attach process for this card involves a bunch of different steps
379  * which are summarised pretty well in the PRM. We have to send a number of
380  * commands which do different things to start the card up, give it some pages
381  * of our own memory for it to use, then start creating all the entities that
382  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
383  * and TDoms.
384  */
385 
386 /*
387  * UARs
388  * ----
389  *
390  * The pages of the PCI BAR other than the first few are reserved for use as
391  * "UAR" sections in this device. Each UAR section can be used as a set of
392  * doorbells for our queues.
393  *
394  * Currently we just make one single UAR for all of our queues. It doesn't
395  * seem to be a major limitation yet.
396  *
397  * When we're sending packets through an SQ, the PRM is not awfully clear about
398  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
399  * (it's clear on the pattern of alternation you're expected to use between
400  * even and odd for Blueflame sends, but not for regular doorbells).
401  *
402  * Currently we don't do the even-odd alternating pattern for ordinary
403  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
404  * least on Connect-X4 Lx.
405  */
406 
407 /*
408  * Lock ordering
409  * -------------
410  *
411  * Interrupt side:
412  *
413  *  - mleq_mtx
414  *    - mlcq_arm_mtx
415  *      - mlcq_mtx
416  *        - mlcq_bufbmtx
417  *        - mlwq_mtx
418  *          - mlbs_mtx
419  *    - mlp_mtx
420  *
421  * GLD side:
422  *
423  *  - mlp_mtx
424  *    - mlg_mtx
425  *      - mlg_*.mlft_mtx
426  *    - mlp_*.mlft_mtx
427  *    - mlwq_mtx
428  *      - mlbs_mtx
429  *      - mlcq_bufbmtx
430  *  - mleq_mtx
431  *    - mlcq_arm_mtx
432  *      - mlcq_mtx
433  *
434  */
435 
436 #include <sys/modctl.h>
437 #include <sys/conf.h>
438 #include <sys/devops.h>
439 #include <sys/sysmacros.h>
440 #include <sys/time.h>
441 
442 #include <sys/mac_provider.h>
443 
444 #include <mlxcx.h>
445 
/*
 * Compile-time check: a table of (1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) entries
 * must be no smaller than MLXCX_TIRS_PER_GROUP.
 * NOTE(review): presumably this guarantees each RX group's hash flow table
 * has room for one entry per TIR -- confirm against the flow table setup.
 */
446 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
447 
/* Name of this module. NOTE(review): its users are not visible in this chunk. */
448 #define	MLXCX_MODULE_NAME	"mlxcx"
449 /*
450  * We give this to the firmware, so it has to be in a fixed format that it
451  * understands.
452  */
453 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
454 
455 /*
456  * Firmware may take a while to reclaim pages. Try a set number of times.
457  *
 * mlxcx_reclaim_delay is expressed in microseconds; together with
 * mlxcx_reclaim_tries it bounds the total wait at 100 * 50 ms = 5000 ms.
 * Both are non-static globals. NOTE(review): presumably so that they can be
 * tuned and read from other files of the driver -- confirm.
 */
458 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
459 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
460 
/*
 * Opaque handle for this driver's soft state.
 * NOTE(review): presumably set up with ddi_soft_state_init() in _init();
 * that code is outside this chunk.
 */
461 static void *mlxcx_softstate;
462 
463 /*
464  * Fault detection thresholds.
465  *
 * Both start out at their compiled-in MLXCX_*_DFLT values.
 */
466 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
467 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
468 
469 static void
470 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
471 {
472 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
473 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
474 
475 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
476 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
477 
478 	/*
479 	 * Currently we have different queue size defaults for two
480 	 * categories of queues. One set for devices which support a
481 	 * maximum speed of 10Gb/s, and another for those above that.
482 	 */
483 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
484 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
485 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
486 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
487 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
488 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
489 	    MLXCX_PROTO_10G)) != 0) {
490 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
491 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
492 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
493 	} else {
494 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
495 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
496 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
497 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
498 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
499 	}
500 }
501 
502 /*
503  * Properties which may have different defaults based on hardware
504  * characteristics.
505  */
506 static void
507 mlxcx_load_model_props(mlxcx_t *mlxp)
508 {
509 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
510 
511 	mlxcx_load_prop_defaults(mlxp);
512 
513 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
514 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
515 	    p->mldp_cq_size_shift_default);
516 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
517 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
518 	    p->mldp_sq_size_shift_default);
519 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
520 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
521 	    p->mldp_rq_size_shift_default);
522 }
523 
524 static void
525 mlxcx_load_props(mlxcx_t *mlxp)
526 {
527 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
528 
529 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
530 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
531 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
532 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
533 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
534 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
535 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
536 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
537 	    MLXCX_CQEMOD_COUNT_DFLT);
538 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
540 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
541 
542 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
543 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
544 	    MLXCX_TX_NGROUPS_DFLT);
545 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
546 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
547 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
548 
549 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
550 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
551 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
552 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
553 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
554 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
555 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
556 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
557 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
558 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
559 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
560 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
561 
562 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
563 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
564 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
565 
566 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
567 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
568 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
569 
570 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
571 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
572 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
573 
574 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
575 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
576 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
577 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
578 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
579 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
580 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
581 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
582 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
583 
584 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
585 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
586 	    MLXCX_RX_PER_CQ_DEFAULT);
587 
588 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
589 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
590 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
591 		    "out of range. Defaulting to: %d. Valid values are from "
592 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
593 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
594 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
595 	}
596 }
597 
598 void
599 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
600 {
601 	va_list ap;
602 
603 	va_start(ap, fmt);
604 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
605 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
606 	} else {
607 		vcmn_err(CE_NOTE, fmt, ap);
608 	}
609 	va_end(ap);
610 }
611 
612 void
613 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
614 {
615 	va_list ap;
616 
617 	va_start(ap, fmt);
618 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
619 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
620 	} else {
621 		vcmn_err(CE_WARN, fmt, ap);
622 	}
623 	va_end(ap);
624 }
625 
626 void
627 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
628 {
629 	va_list ap;
630 
631 	va_start(ap, fmt);
632 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
633 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
634 	} else {
635 		vcmn_err(CE_PANIC, fmt, ap);
636 	}
637 	va_end(ap);
638 }
639 
640 uint16_t
641 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
642 {
643 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
644 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
645 }
646 
647 uint32_t
648 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
649 {
650 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
651 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
652 }
653 
654 uint64_t
655 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
656 {
657 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
658 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
659 }
660 
661 void
662 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
663 {
664 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
665 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
666 }
667 
668 void
669 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
670 {
671 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
672 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
673 }
674 
675 void
676 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
677 {
678 	/*
679 	 * The UAR is always inside the first BAR, which we mapped as
680 	 * mlx_regs
681 	 */
682 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
683 	    (uintptr_t)mlxp->mlx_regs_base;
684 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
685 }
686 
687 void
688 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
689 {
690 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
691 	    (uintptr_t)mlxp->mlx_regs_base;
692 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
693 }
694 
695 static void
696 mlxcx_fm_fini(mlxcx_t *mlxp)
697 {
698 	if (mlxp->mlx_fm_caps == 0)
699 		return;
700 
701 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
702 		ddi_fm_handler_unregister(mlxp->mlx_dip);
703 
704 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
705 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
706 		pci_ereport_teardown(mlxp->mlx_dip);
707 
708 	ddi_fm_fini(mlxp->mlx_dip);
709 
710 	mlxp->mlx_fm_caps = 0;
711 }
712 
713 void
714 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
715 {
716 	uint64_t ena;
717 	char buf[FM_MAX_CLASS];
718 
719 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
720 		return;
721 
722 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
723 	ena = fm_ena_generate(0, FM_ENA_FMT1);
724 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
725 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
726 	    NULL);
727 }
728 
729 static int
730 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
731 {
732 	/*
733 	 * as the driver can always deal with an error in any dma or
734 	 * access handle, we can just return the fme_status value.
735 	 */
736 	pci_ereport_post(dip, err, NULL);
737 	return (err->fme_status);
738 }
739 
740 static void
741 mlxcx_fm_init(mlxcx_t *mlxp)
742 {
743 	ddi_iblock_cookie_t iblk;
744 	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
745 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
746 
747 	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
748 	    DDI_PROP_DONTPASS, "fm_capable", def);
749 
750 	if (mlxp->mlx_fm_caps < 0) {
751 		mlxp->mlx_fm_caps = 0;
752 	}
753 	mlxp->mlx_fm_caps &= def;
754 
755 	if (mlxp->mlx_fm_caps == 0)
756 		return;
757 
758 	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
759 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
760 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
761 		pci_ereport_setup(mlxp->mlx_dip);
762 	}
763 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
764 		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
765 		    (void *)mlxp);
766 	}
767 }
768 
769 static void
770 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
771 {
772 	mlxcx_buffer_t *buf;
773 
774 	mutex_enter(&s->mlbs_mtx);
775 
776 	while (!list_is_empty(&s->mlbs_busy))
777 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
778 
779 	while (!list_is_empty(&s->mlbs_loaned))
780 		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
781 
782 	while ((buf = list_head(&s->mlbs_free)) != NULL)
783 		mlxcx_buf_destroy(mlxp, buf);
784 
785 	list_destroy(&s->mlbs_free);
786 	list_destroy(&s->mlbs_busy);
787 	list_destroy(&s->mlbs_loaned);
788 	mutex_exit(&s->mlbs_mtx);
789 
790 	cv_destroy(&s->mlbs_free_nonempty);
791 	mutex_destroy(&s->mlbs_mtx);
792 }
793 
794 static void
795 mlxcx_teardown_bufs(mlxcx_t *mlxp)
796 {
797 	mlxcx_buf_shard_t *s;
798 
799 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
800 		mlxcx_mlbs_teardown(mlxp, s);
801 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
802 	}
803 	list_destroy(&mlxp->mlx_buf_shards);
804 
805 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
806 }
807 
808 static void
809 mlxcx_teardown_pages(mlxcx_t *mlxp)
810 {
811 	uint_t nzeros = 0;
812 	uint64_t *pas;
813 
814 	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
815 	    KM_SLEEP);
816 
817 	mutex_enter(&mlxp->mlx_pagemtx);
818 
819 	while (mlxp->mlx_npages > 0) {
820 		int32_t req, ret;
821 
822 		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
823 		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
824 
825 		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
826 			mlxcx_warn(mlxp, "hardware refused to return pages, "
827 			    "leaking %u remaining pages", mlxp->mlx_npages);
828 			goto out;
829 		}
830 
831 		for (int32_t i = 0; i < ret; i++) {
832 			mlxcx_dev_page_t *mdp, probe;
833 			bzero(&probe, sizeof (probe));
834 			probe.mxdp_pa = pas[i];
835 
836 			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
837 
838 			if (mdp != NULL) {
839 				avl_remove(&mlxp->mlx_pages, mdp);
840 				mlxp->mlx_npages--;
841 				mlxcx_dma_free(&mdp->mxdp_dma);
842 				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
843 			} else {
844 				mlxcx_panic(mlxp, "hardware returned a page "
845 				    "with PA 0x%" PRIx64 " but we have no "
846 				    "record of giving out such a page", pas[i]);
847 			}
848 		}
849 
850 		/*
851 		 * If no pages were returned, note that fact.
852 		 */
853 		if (ret == 0) {
854 			nzeros++;
855 			if (nzeros > mlxcx_reclaim_tries) {
856 				mlxcx_warn(mlxp, "hardware refused to return "
857 				    "pages, leaking %u remaining pages",
858 				    mlxp->mlx_npages);
859 				goto out;
860 			}
861 			delay(drv_usectohz(mlxcx_reclaim_delay));
862 		}
863 	}
864 
865 	avl_destroy(&mlxp->mlx_pages);
866 
867 out:
868 	mutex_exit(&mlxp->mlx_pagemtx);
869 	mutex_destroy(&mlxp->mlx_pagemtx);
870 
871 	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
872 }
873 
874 static boolean_t
875 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
876 {
877 	ddi_device_acc_attr_t acc;
878 	ddi_dma_attr_t attr;
879 	boolean_t ret;
880 	size_t sz, i;
881 
882 	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
883 
884 	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
885 	mleq->mleq_nents = (1 << mleq->mleq_entshift);
886 	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
887 	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
888 
889 	mlxcx_dma_acc_attr(mlxp, &acc);
890 	mlxcx_dma_queue_attr(mlxp, &attr);
891 
892 	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
893 	    B_TRUE, sz, B_TRUE);
894 	if (!ret) {
895 		mlxcx_warn(mlxp, "failed to allocate EQ memory");
896 		return (B_FALSE);
897 	}
898 
899 	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;
900 
901 	for (i = 0; i < mleq->mleq_nents; ++i)
902 		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;
903 
904 	mleq->mleq_state |= MLXCX_EQ_ALLOC;
905 
906 	return (B_TRUE);
907 }
908 
909 static void
910 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
911 {
912 	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
913 	if (mleq->mleq_state & MLXCX_EQ_CREATED)
914 		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
915 
916 	mlxcx_dma_free(&mleq->mleq_dma);
917 	mleq->mleq_ent = NULL;
918 
919 	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
920 }
921 
922 void
923 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
924 {
925 	mlxcx_flow_group_t *fg;
926 	mlxcx_flow_entry_t *fe;
927 	int i;
928 
929 	ASSERT(mutex_owned(&ft->mlft_mtx));
930 
931 	for (i = ft->mlft_nents - 1; i >= 0; --i) {
932 		fe = &ft->mlft_ent[i];
933 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
934 			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
935 				mlxcx_panic(mlxp, "failed to delete flow "
936 				    "entry %u on table %u", i,
937 				    ft->mlft_num);
938 			}
939 		}
940 	}
941 
942 	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
943 		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
944 		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
945 			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
946 				mlxcx_panic(mlxp, "failed to destroy flow "
947 				    "group %u", fg->mlfg_num);
948 			}
949 		}
950 		kmem_free(fg, sizeof (mlxcx_flow_group_t));
951 	}
952 	list_destroy(&ft->mlft_groups);
953 	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
954 	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
955 		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
956 			mlxcx_panic(mlxp, "failed to destroy flow table %u",
957 			    ft->mlft_num);
958 		}
959 	}
960 	kmem_free(ft->mlft_ent, ft->mlft_entsize);
961 	ft->mlft_ent = NULL;
962 	mutex_exit(&ft->mlft_mtx);
963 	mutex_destroy(&ft->mlft_mtx);
964 	kmem_free(ft, sizeof (mlxcx_flow_table_t));
965 }
966 
967 static void
968 mlxcx_teardown_ports(mlxcx_t *mlxp)
969 {
970 	uint_t i;
971 	mlxcx_port_t *p;
972 	mlxcx_flow_table_t *ft;
973 
974 	for (i = 0; i < mlxp->mlx_nports; ++i) {
975 		p = &mlxp->mlx_ports[i];
976 		if (!(p->mlp_init & MLXCX_PORT_INIT))
977 			continue;
978 		mutex_enter(&p->mlp_mtx);
979 		if ((ft = p->mlp_rx_flow) != NULL) {
980 			mutex_enter(&ft->mlft_mtx);
981 			/*
982 			 * teardown_flow_table() will destroy the mutex, so
983 			 * we don't release it here.
984 			 */
985 			mlxcx_teardown_flow_table(mlxp, ft);
986 		}
987 		mutex_exit(&p->mlp_mtx);
988 		mutex_destroy(&p->mlp_mtx);
989 		mutex_destroy(&p->mlx_port_event.mla_mtx);
990 		p->mlx_port_event.mla_mlx = NULL;
991 		p->mlx_port_event.mla_port = NULL;
992 		p->mlp_init &= ~MLXCX_PORT_INIT;
993 	}
994 
995 	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
996 	mlxp->mlx_ports = NULL;
997 }
998 
999 static void
1000 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1001 {
1002 	mlxcx_work_queue_t *mlwq;
1003 
1004 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1005 		mlxcx_wq_teardown(mlxp, mlwq);
1006 	}
1007 	list_destroy(&mlxp->mlx_wqs);
1008 }
1009 
1010 static void
1011 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1012 {
1013 	mlxcx_completion_queue_t *mlcq;
1014 
1015 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1016 		mlxcx_cq_teardown(mlxp, mlcq);
1017 	}
1018 	list_destroy(&mlxp->mlx_cqs);
1019 }
1020 
1021 static void
1022 mlxcx_teardown_eqs(mlxcx_t *mlxp)
1023 {
1024 	mlxcx_event_queue_t *mleq;
1025 	uint_t i;
1026 
1027 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1028 		mleq = &mlxp->mlx_eqs[i];
1029 		mutex_enter(&mleq->mleq_mtx);
1030 		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
1031 		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
1032 			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
1033 				mlxcx_warn(mlxp, "failed to destroy "
1034 				    "event queue idx %u eqn %u",
1035 				    i, mleq->mleq_num);
1036 			}
1037 		}
1038 		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
1039 			mlxcx_eq_rele_dma(mlxp, mleq);
1040 		}
1041 		mutex_exit(&mleq->mleq_mtx);
1042 	}
1043 }
1044 
1045 static void
1046 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1047 {
1048 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1049 		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1050 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1051 		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1052 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1053 		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1054 }
1055 
1056 static void
1057 mlxcx_teardown(mlxcx_t *mlxp)
1058 {
1059 	uint_t i;
1060 	dev_info_t *dip = mlxp->mlx_dip;
1061 
1062 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1063 		/*
1064 		 * Disable interrupts and let any active vectors quiesce.
1065 		 */
1066 		mlxcx_intr_disable(mlxp);
1067 	}
1068 
1069 	if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
1070 		mlxcx_teardown_sensors(mlxp);
1071 		mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
1072 	}
1073 
1074 	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
1075 		mlxcx_teardown_checktimers(mlxp);
1076 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
1077 	}
1078 
1079 	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
1080 		mlxcx_teardown_groups(mlxp);
1081 		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
1082 	}
1083 
1084 	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
1085 		mlxcx_teardown_wqs(mlxp);
1086 		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
1087 	}
1088 
1089 	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
1090 		mlxcx_teardown_cqs(mlxp);
1091 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
1092 	}
1093 
1094 	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
1095 		mlxcx_teardown_bufs(mlxp);
1096 		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
1097 	}
1098 
1099 	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
1100 		mlxcx_teardown_ports(mlxp);
1101 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
1102 	}
1103 
1104 	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1105 		mlxcx_teardown_eqs(mlxp);
1106 		mlxcx_intr_teardown(mlxp);
1107 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
1108 	}
1109 
1110 	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
1111 		if (mlxp->mlx_uar.mlu_allocated) {
1112 			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
1113 				mlxcx_warn(mlxp, "failed to release UAR");
1114 			}
1115 			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
1116 				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
1117 		}
1118 		if (mlxp->mlx_pd.mlpd_allocated &&
1119 		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
1120 			mlxcx_warn(mlxp, "failed to release PD");
1121 		}
1122 		if (mlxp->mlx_tdom.mltd_allocated &&
1123 		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
1124 			mlxcx_warn(mlxp, "failed to release TDOM");
1125 		}
1126 		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
1127 	}
1128 
1129 	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
1130 		if (!mlxcx_cmd_teardown_hca(mlxp)) {
1131 			mlxcx_warn(mlxp, "failed to send teardown HCA "
1132 			    "command during device detach");
1133 		}
1134 		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
1135 	}
1136 
1137 	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
1138 		mlxcx_teardown_pages(mlxp);
1139 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
1140 	}
1141 
1142 	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
1143 		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
1144 			mlxp->mlx_npages_req[i].mla_mlx = NULL;
1145 			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
1146 		}
1147 		taskq_destroy(mlxp->mlx_async_tq);
1148 		mlxp->mlx_async_tq = NULL;
1149 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
1150 	}
1151 
1152 	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
1153 		if (!mlxcx_cmd_disable_hca(mlxp)) {
1154 			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
1155 			    "during device detach");
1156 		}
1157 		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
1158 	}
1159 
1160 	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
1161 		mlxcx_cmd_queue_fini(mlxp);
1162 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
1163 	}
1164 
1165 	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
1166 		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
1167 		mlxp->mlx_caps = NULL;
1168 		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
1169 	}
1170 
1171 	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
1172 		ddi_regs_map_free(&mlxp->mlx_regs_handle);
1173 		mlxp->mlx_regs_handle = NULL;
1174 		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
1175 	}
1176 
1177 	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
1178 		pci_config_teardown(&mlxp->mlx_cfg_handle);
1179 		mlxp->mlx_cfg_handle = NULL;
1180 		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
1181 	}
1182 
1183 	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
1184 		mlxcx_fm_fini(mlxp);
1185 		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
1186 	}
1187 
1188 	VERIFY3S(mlxp->mlx_attach, ==, 0);
1189 	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
1190 	ddi_set_driver_private(dip, NULL);
1191 }
1192 
1193 static boolean_t
1194 mlxcx_regs_map(mlxcx_t *mlxp)
1195 {
1196 	off_t memsize;
1197 	int ret;
1198 	ddi_device_acc_attr_t da;
1199 
1200 	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
1201 	    DDI_SUCCESS) {
1202 		mlxcx_warn(mlxp, "failed to get register set size");
1203 		return (B_FALSE);
1204 	}
1205 
1206 	/*
1207 	 * All data in the main BAR is kept in big-endian even though it's a PCI
1208 	 * device.
1209 	 */
1210 	bzero(&da, sizeof (ddi_device_acc_attr_t));
1211 	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1212 	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1213 	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1214 	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1215 		da.devacc_attr_access = DDI_FLAGERR_ACC;
1216 	} else {
1217 		da.devacc_attr_access = DDI_DEFAULT_ACC;
1218 	}
1219 
1220 	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1221 	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1222 
1223 	if (ret != DDI_SUCCESS) {
1224 		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1225 		return (B_FALSE);
1226 	}
1227 
1228 	return (B_TRUE);
1229 }
1230 
1231 static boolean_t
1232 mlxcx_check_issi(mlxcx_t *mlxp)
1233 {
1234 	uint32_t issi;
1235 
1236 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1237 		mlxcx_warn(mlxp, "failed to get ISSI");
1238 		return (B_FALSE);
1239 	}
1240 
1241 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1242 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1243 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1244 		return (B_FALSE);
1245 	}
1246 
1247 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1248 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1249 		    MLXCX_CURRENT_ISSI);
1250 		return (B_FALSE);
1251 	}
1252 
1253 	return (B_TRUE);
1254 }
1255 
1256 boolean_t
1257 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
1258 {
1259 	ddi_device_acc_attr_t acc;
1260 	ddi_dma_attr_t attr;
1261 	int32_t i;
1262 	list_t plist;
1263 	mlxcx_dev_page_t *mdp;
1264 	mlxcx_dev_page_t **pages;
1265 	const ddi_dma_cookie_t *ck;
1266 
1267 	/*
1268 	 * If there are no pages required, then we're done here.
1269 	 */
1270 	if (npages <= 0) {
1271 		*ngiven = 0;
1272 		return (B_TRUE);
1273 	}
1274 
1275 	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
1276 
1277 	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);
1278 
1279 	list_create(&plist, sizeof (mlxcx_dev_page_t),
1280 	    offsetof(mlxcx_dev_page_t, mxdp_list));
1281 
1282 	for (i = 0; i < npages; i++) {
1283 		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1284 		mlxcx_dma_acc_attr(mlxp, &acc);
1285 		mlxcx_dma_page_attr(mlxp, &attr);
1286 		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1287 		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1288 			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1289 			    npages);
1290 			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1291 			goto cleanup_npages;
1292 		}
1293 		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1294 		mdp->mxdp_pa = ck->dmac_laddress;
1295 
1296 		list_insert_tail(&plist, mdp);
1297 	}
1298 
1299 	/*
1300 	 * Now that all of the pages have been allocated, given them to hardware
1301 	 * in chunks.
1302 	 */
1303 	for (i = 0; i < npages; i++) {
1304 		pages[i] = list_remove_head(&plist);
1305 	}
1306 
1307 	if (!mlxcx_cmd_give_pages(mlxp,
1308 	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
1309 		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1310 		    "pages!", npages);
1311 		for (i = 0; i < npages; i++) {
1312 			list_insert_tail(&plist, pages[i]);
1313 		}
1314 		goto cleanup_npages;
1315 	}
1316 
1317 	mutex_enter(&mlxp->mlx_pagemtx);
1318 	for (i = 0; i < npages; i++) {
1319 		avl_add(&mlxp->mlx_pages, pages[i]);
1320 	}
1321 	mlxp->mlx_npages += npages;
1322 	mutex_exit(&mlxp->mlx_pagemtx);
1323 
1324 	list_destroy(&plist);
1325 	kmem_free(pages, sizeof (*pages) * npages);
1326 
1327 	*ngiven = npages;
1328 
1329 	return (B_TRUE);
1330 
1331 cleanup_npages:
1332 	kmem_free(pages, sizeof (*pages) * npages);
1333 	while ((mdp = list_remove_head(&plist)) != NULL) {
1334 		mlxcx_dma_free(&mdp->mxdp_dma);
1335 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1336 	}
1337 	list_destroy(&plist);
1338 	return (B_FALSE);
1339 }
1340 
1341 static boolean_t
1342 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1343 {
1344 	int32_t npages, given;
1345 
1346 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1347 		mlxcx_warn(mlxp, "failed to determine boot pages");
1348 		return (B_FALSE);
1349 	}
1350 
1351 	while (npages > 0) {
1352 		if (!mlxcx_give_pages(mlxp, npages, &given))
1353 			return (B_FALSE);
1354 
1355 		npages -= given;
1356 	}
1357 
1358 	return (B_TRUE);
1359 }
1360 
1361 static int
1362 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1363 {
1364 	mlxcx_t *mlxp = cookie;
1365 	mlxcx_buffer_t *b = arg;
1366 
1367 	bzero(b, sizeof (mlxcx_buffer_t));
1368 	b->mlb_mlx = mlxp;
1369 	b->mlb_state = MLXCX_BUFFER_INIT;
1370 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1371 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1372 
1373 	return (0);
1374 }
1375 
1376 static void
1377 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1378 {
1379 	mlxcx_t *mlxp = cookie;
1380 	mlxcx_buffer_t *b = arg;
1381 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1382 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1383 	list_destroy(&b->mlb_tx_chain);
1384 }
1385 
1386 mlxcx_buf_shard_t *
1387 mlxcx_mlbs_create(mlxcx_t *mlxp)
1388 {
1389 	mlxcx_buf_shard_t *s;
1390 
1391 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1392 
1393 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1394 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1395 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1396 	    offsetof(mlxcx_buffer_t, mlb_entry));
1397 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1398 	    offsetof(mlxcx_buffer_t, mlb_entry));
1399 	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1400 	    offsetof(mlxcx_buffer_t, mlb_entry));
1401 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1402 
1403 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1404 
1405 	return (s);
1406 }
1407 
1408 static boolean_t
1409 mlxcx_setup_bufs(mlxcx_t *mlxp)
1410 {
1411 	char namebuf[KSTAT_STRLEN];
1412 
1413 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1414 	    ddi_get_instance(mlxp->mlx_dip));
1415 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1416 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1417 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1418 	    NULL, mlxp, NULL, 0);
1419 
1420 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1421 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1422 
1423 	return (B_TRUE);
1424 }
1425 
1426 static void
1427 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1428     const char *state, uint8_t statenum)
1429 {
1430 	uint64_t ena;
1431 	char buf[FM_MAX_CLASS];
1432 
1433 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1434 		return;
1435 
1436 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1437 	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1438 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1439 
1440 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1441 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1442 	    "state", DATA_TYPE_STRING, state,
1443 	    "state_num", DATA_TYPE_UINT8, statenum,
1444 	    "qtype", DATA_TYPE_STRING, qtype,
1445 	    "qnum", DATA_TYPE_UINT32, qnum,
1446 	    NULL);
1447 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1448 }
1449 
1450 /*
1451  * The following set of routines are for monitoring the health of
1452  * event, completion and work queues. They run infrequently peeking at
1453  * the structs to catch stalls and inconsistent state.
1454  *
1455  * They peek at the structs *without* acquiring locks - we don't want
1456  * to impede flow of data. Driver start up and shutdown semantics
1457  * guarantee the structs are present and won't disappear underneath
1458  * these routines.
1459  *
1460  * As previously noted, the routines peek at active data in the structs and
1461  * they will store some values for comparison on next invocation. To
1462  * maintain integrity of the saved values, these values are only modified
1463  * within these routines.
1464  */
1465 static void
1466 mlxcx_eq_check(void *arg)
1467 {
1468 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1469 	mlxcx_event_queue_t *eq;
1470 	mlxcx_eventq_ctx_t ctx;
1471 	const char *str;
1472 
1473 	uint_t i;
1474 
1475 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1476 		eq = &mlxp->mlx_eqs[i];
1477 
1478 		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
1479 			continue;
1480 
1481 		/*
1482 		 * If the event queue was successfully created in the HCA,
1483 		 * then initialization and shutdown sequences guarantee
1484 		 * the queue exists.
1485 		 */
1486 		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);
1487 
1488 		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
1489 			continue;
1490 
1491 		str = "???";
1492 		switch (ctx.mleqc_status) {
1493 		case MLXCX_EQ_STATUS_OK:
1494 			break;
1495 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
1496 			str = "WRITE_FAILURE";
1497 			break;
1498 		}
1499 
1500 		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1501 			mlxcx_fm_qstate_ereport(mlxp, "event",
1502 			    eq->mleq_num, str, ctx.mleqc_status);
1503 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1504 			    eq->mleq_intr_index, ctx.mleqc_status, str);
1505 		}
1506 
1507 		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1508 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
1509 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1510 			    ++eq->mleq_check_disarm_cnt >= 3) {
1511 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1512 				mlxcx_warn(mlxp, "EQ %u isn't armed",
1513 				    eq->mleq_intr_index);
1514 			}
1515 			eq->mleq_check_disarm_cc = eq->mleq_cc;
1516 		} else {
1517 			eq->mleq_check_disarm_cc = 0;
1518 			eq->mleq_check_disarm_cnt = 0;
1519 		}
1520 	}
1521 }
1522 
1523 static void
1524 mlxcx_cq_check(void *arg)
1525 {
1526 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1527 	mlxcx_completion_queue_t *cq;
1528 	mlxcx_completionq_ctx_t ctx;
1529 	const char *str, *type;
1530 	uint_t v;
1531 
1532 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1533 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
1534 
1535 		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
1536 			continue;
1537 
1538 		/*
1539 		 * If the completion queue was successfully created in the HCA,
1540 		 * then initialization and shutdown sequences guarantee
1541 		 * the queue exists.
1542 		 */
1543 		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
1544 		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);
1545 
1546 		if (cq->mlcq_fm_repd_qstate)
1547 			continue;
1548 
1549 		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
1550 			continue;
1551 
1552 		if (cq->mlcq_wq != NULL) {
1553 			mlxcx_work_queue_t *wq = cq->mlcq_wq;
1554 			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1555 				type = "rx ";
1556 			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1557 				type = "tx ";
1558 			else
1559 				type = "";
1560 		} else {
1561 			type = "";
1562 		}
1563 
1564 		str = "???";
1565 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1566 		switch (v) {
1567 		case MLXCX_CQC_STATUS_OK:
1568 			break;
1569 		case MLXCX_CQC_STATUS_OVERFLOW:
1570 			str = "OVERFLOW";
1571 			break;
1572 		case MLXCX_CQC_STATUS_WRITE_FAIL:
1573 			str = "WRITE_FAIL";
1574 			break;
1575 		case MLXCX_CQC_STATUS_INVALID:
1576 			str = "INVALID";
1577 			break;
1578 		}
1579 
1580 		if (v != MLXCX_CQC_STATUS_OK) {
1581 			mlxcx_fm_qstate_ereport(mlxp, "completion",
1582 			    cq->mlcq_num, str, v);
1583 			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1584 			    type, cq->mlcq_num, v, str);
1585 			cq->mlcq_fm_repd_qstate = B_TRUE;
1586 		}
1587 
1588 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1589 		if (v != MLXCX_CQC_STATE_ARMED &&
1590 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1591 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1592 			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1593 			    ++cq->mlcq_check_disarm_cnt >= 3) {
1594 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1595 				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1596 				    type, cq->mlcq_num, cq);
1597 			}
1598 			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1599 		} else {
1600 			cq->mlcq_check_disarm_cnt = 0;
1601 			cq->mlcq_check_disarm_cc = 0;
1602 		}
1603 	}
1604 }
1605 
1606 void
1607 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1608 {
1609 	mlxcx_sq_ctx_t ctx;
1610 	mlxcx_sq_state_t state;
1611 
1612 	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1613 		return;
1614 
1615 	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1616 	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1617 	switch (state) {
1618 	case MLXCX_SQ_STATE_RST:
1619 		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1620 			mlxcx_fm_qstate_ereport(mlxp, "send",
1621 			    sq->mlwq_num, "RST", state);
1622 			sq->mlwq_fm_repd_qstate = B_TRUE;
1623 		}
1624 		break;
1625 	case MLXCX_SQ_STATE_RDY:
1626 		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1627 			mlxcx_fm_qstate_ereport(mlxp, "send",
1628 			    sq->mlwq_num, "RDY", state);
1629 			sq->mlwq_fm_repd_qstate = B_TRUE;
1630 		}
1631 		break;
1632 	case MLXCX_SQ_STATE_ERR:
1633 		mlxcx_fm_qstate_ereport(mlxp, "send",
1634 		    sq->mlwq_num, "ERR", state);
1635 		sq->mlwq_fm_repd_qstate = B_TRUE;
1636 		break;
1637 	default:
1638 		mlxcx_fm_qstate_ereport(mlxp, "send",
1639 		    sq->mlwq_num, "???", state);
1640 		sq->mlwq_fm_repd_qstate = B_TRUE;
1641 		break;
1642 	}
1643 }
1644 
1645 void
1646 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1647 {
1648 	mlxcx_rq_ctx_t ctx;
1649 	mlxcx_rq_state_t state;
1650 
1651 
1652 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1653 		return;
1654 
1655 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1656 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1657 	switch (state) {
1658 	case MLXCX_RQ_STATE_RST:
1659 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1660 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1661 			    rq->mlwq_num, "RST", state);
1662 			rq->mlwq_fm_repd_qstate = B_TRUE;
1663 		}
1664 		break;
1665 	case MLXCX_RQ_STATE_RDY:
1666 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1667 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1668 			    rq->mlwq_num, "RDY", state);
1669 			rq->mlwq_fm_repd_qstate = B_TRUE;
1670 		}
1671 		break;
1672 	case MLXCX_RQ_STATE_ERR:
1673 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1674 		    rq->mlwq_num, "ERR", state);
1675 		rq->mlwq_fm_repd_qstate = B_TRUE;
1676 		break;
1677 	default:
1678 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1679 		    rq->mlwq_num, "???", state);
1680 		rq->mlwq_fm_repd_qstate = B_TRUE;
1681 		break;
1682 	}
1683 }
1684 
1685 static void
1686 mlxcx_wq_check(void *arg)
1687 {
1688 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1689 	mlxcx_work_queue_t *wq;
1690 
1691 	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1692 	    wq = list_next(&mlxp->mlx_wqs, wq)) {
1693 
1694 		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
1695 			continue;
1696 
1697 		/*
1698 		 * If the work queue was successfully created in the HCA,
1699 		 * then initialization and shutdown sequences guarantee
1700 		 * the queue exists.
1701 		 */
1702 		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
1703 		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);
1704 
1705 		if (wq->mlwq_fm_repd_qstate)
1706 			continue;
1707 
1708 		switch (wq->mlwq_type) {
1709 		case MLXCX_WQ_TYPE_SENDQ:
1710 			mlxcx_check_sq(mlxp, wq);
1711 			break;
1712 		case MLXCX_WQ_TYPE_RECVQ:
1713 			mlxcx_check_rq(mlxp, wq);
1714 			break;
1715 		}
1716 	}
1717 }
1718 
1719 static boolean_t
1720 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1721 {
1722 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1723 		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1724 		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1725 		    DDI_IPL_0);
1726 	}
1727 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1728 		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1729 		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1730 		    DDI_IPL_0);
1731 	}
1732 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1733 		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1734 		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1735 		    DDI_IPL_0);
1736 	}
1737 	return (B_TRUE);
1738 }
1739 
1740 int
1741 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1742 {
1743 	const mlxcx_flow_entry_t *left = arg0;
1744 	const mlxcx_flow_entry_t *right = arg1;
1745 	int bcmpr;
1746 
1747 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1748 	    sizeof (left->mlfe_dmac));
1749 	if (bcmpr < 0)
1750 		return (-1);
1751 	if (bcmpr > 0)
1752 		return (1);
1753 	if (left->mlfe_vid < right->mlfe_vid)
1754 		return (-1);
1755 	if (left->mlfe_vid > right->mlfe_vid)
1756 		return (1);
1757 	return (0);
1758 }
1759 
1760 int
1761 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1762 {
1763 	const mlxcx_group_mac_t *left = arg0;
1764 	const mlxcx_group_mac_t *right = arg1;
1765 	int bcmpr;
1766 
1767 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1768 	    sizeof (left->mlgm_mac));
1769 	if (bcmpr < 0)
1770 		return (-1);
1771 	if (bcmpr > 0)
1772 		return (1);
1773 	return (0);
1774 }
1775 
1776 int
1777 mlxcx_page_compare(const void *arg0, const void *arg1)
1778 {
1779 	const mlxcx_dev_page_t *p0 = arg0;
1780 	const mlxcx_dev_page_t *p1 = arg1;
1781 
1782 	if (p0->mxdp_pa < p1->mxdp_pa)
1783 		return (-1);
1784 	if (p0->mxdp_pa > p1->mxdp_pa)
1785 		return (1);
1786 	return (0);
1787 }
1788 
1789 static boolean_t
1790 mlxcx_setup_ports(mlxcx_t *mlxp)
1791 {
1792 	uint_t i, j;
1793 	mlxcx_port_t *p;
1794 	mlxcx_flow_table_t *ft;
1795 	mlxcx_flow_group_t *fg;
1796 	mlxcx_flow_entry_t *fe;
1797 
1798 	VERIFY3U(mlxp->mlx_nports, >, 0);
1799 	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1800 	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1801 
1802 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1803 		p = &mlxp->mlx_ports[i];
1804 		p->mlp_num = i;
1805 		p->mlx_port_event.mla_mlx = mlxp;
1806 		p->mlx_port_event.mla_port = p;
1807 		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
1808 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
1809 		p->mlp_init |= MLXCX_PORT_INIT;
1810 		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1811 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1812 		mutex_enter(&p->mlp_mtx);
1813 		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1814 			mutex_exit(&p->mlp_mtx);
1815 			goto err;
1816 		}
1817 		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1818 			mutex_exit(&p->mlp_mtx);
1819 			goto err;
1820 		}
1821 		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1822 			mutex_exit(&p->mlp_mtx);
1823 			goto err;
1824 		}
1825 		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1826 			mutex_exit(&p->mlp_mtx);
1827 			goto err;
1828 		}
1829 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1830 		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1831 			mutex_exit(&p->mlp_mtx);
1832 			goto err;
1833 		}
1834 		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
1835 			mutex_exit(&p->mlp_mtx);
1836 			goto err;
1837 		}
1838 		p->mlp_fec_requested = LINK_FEC_AUTO;
1839 
1840 		mutex_exit(&p->mlp_mtx);
1841 	}
1842 
1843 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1844 		p = &mlxp->mlx_ports[i];
1845 		mutex_enter(&p->mlp_mtx);
1846 		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1847 		    KM_SLEEP));
1848 		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1849 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1850 
1851 		mutex_enter(&ft->mlft_mtx);
1852 
1853 		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1854 		ft->mlft_port = p;
1855 		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1856 		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1857 			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1858 		ft->mlft_nents = (1 << ft->mlft_entshift);
1859 		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1860 		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1861 		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1862 		    offsetof(mlxcx_flow_group_t, mlfg_entry));
1863 
1864 		for (j = 0; j < ft->mlft_nents; ++j) {
1865 			ft->mlft_ent[j].mlfe_table = ft;
1866 			ft->mlft_ent[j].mlfe_index = j;
1867 		}
1868 
1869 		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1870 			mutex_exit(&ft->mlft_mtx);
1871 			mutex_exit(&p->mlp_mtx);
1872 			goto err;
1873 		}
1874 
1875 		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1876 			mutex_exit(&ft->mlft_mtx);
1877 			mutex_exit(&p->mlp_mtx);
1878 			goto err;
1879 		}
1880 
1881 		/*
1882 		 * We match broadcast at the top of the root flow table, then
1883 		 * all multicast/unicast MACs, then the promisc entry is down
1884 		 * the very bottom.
1885 		 *
1886 		 * This way when promisc is on, that entry simply catches any
1887 		 * remaining traffic that earlier flows haven't matched.
1888 		 */
1889 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1890 		list_insert_tail(&ft->mlft_groups, fg);
1891 		fg->mlfg_table = ft;
1892 		fg->mlfg_size = 1;
1893 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1894 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1895 			mutex_exit(&ft->mlft_mtx);
1896 			mutex_exit(&p->mlp_mtx);
1897 			goto err;
1898 		}
1899 		p->mlp_bcast = fg;
1900 		fe = list_head(&fg->mlfg_entries);
1901 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1902 		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1903 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1904 
1905 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1906 		list_insert_tail(&ft->mlft_groups, fg);
1907 		fg->mlfg_table = ft;
1908 		fg->mlfg_size = ft->mlft_nents - 2;
1909 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1910 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1911 			mutex_exit(&ft->mlft_mtx);
1912 			mutex_exit(&p->mlp_mtx);
1913 			goto err;
1914 		}
1915 		p->mlp_umcast = fg;
1916 
1917 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1918 		list_insert_tail(&ft->mlft_groups, fg);
1919 		fg->mlfg_table = ft;
1920 		fg->mlfg_size = 1;
1921 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1922 			mutex_exit(&ft->mlft_mtx);
1923 			mutex_exit(&p->mlp_mtx);
1924 			goto err;
1925 		}
1926 		p->mlp_promisc = fg;
1927 		fe = list_head(&fg->mlfg_entries);
1928 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1929 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1930 
1931 		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1932 		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1933 		    mlfe_dmac_entry));
1934 
1935 		mutex_exit(&ft->mlft_mtx);
1936 		mutex_exit(&p->mlp_mtx);
1937 	}
1938 
1939 	return (B_TRUE);
1940 
1941 err:
1942 	mlxcx_teardown_ports(mlxp);
1943 	return (B_FALSE);
1944 }
1945 
1946 void
1947 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1948 {
1949 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1950 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1951 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1952 	mlxcx_flow_entry_t *fe;
1953 	mlxcx_group_vlan_t *v;
1954 
1955 	ASSERT(mutex_owned(&g->mlg_mtx));
1956 
1957 	mutex_enter(&ft->mlft_mtx);
1958 
1959 	if (!list_is_empty(&g->mlg_rx_vlans)) {
1960 		fe = list_head(&dfg->mlfg_entries);
1961 		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
1962 	}
1963 
1964 	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
1965 		fe = v->mlgv_fe;
1966 		ASSERT3P(fe->mlfe_table, ==, ft);
1967 		ASSERT3P(fe->mlfe_group, ==, fg);
1968 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
1969 
1970 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1971 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1972 	}
1973 
1974 	mutex_exit(&ft->mlft_mtx);
1975 }
1976 
1977 boolean_t
1978 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1979     boolean_t tagged, uint16_t vid)
1980 {
1981 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1982 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1983 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1984 	mlxcx_flow_entry_t *fe;
1985 	mlxcx_group_vlan_t *v;
1986 	boolean_t found = B_FALSE;
1987 
1988 	ASSERT(mutex_owned(&g->mlg_mtx));
1989 
1990 	mutex_enter(&ft->mlft_mtx);
1991 
1992 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
1993 	    v = list_next(&g->mlg_rx_vlans, v)) {
1994 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
1995 			found = B_TRUE;
1996 			break;
1997 		}
1998 	}
1999 	if (!found) {
2000 		mutex_exit(&ft->mlft_mtx);
2001 		return (B_FALSE);
2002 	}
2003 
2004 	list_remove(&g->mlg_rx_vlans, v);
2005 
2006 	/*
2007 	 * If this is the last VLAN entry, we have to go back to accepting
2008 	 * any VLAN (which means re-enabling the default entry).
2009 	 *
2010 	 * Do this before we remove the flow entry for the last specific
2011 	 * VLAN so that we don't lose any traffic in the transition.
2012 	 */
2013 	if (list_is_empty(&g->mlg_rx_vlans)) {
2014 		fe = list_head(&dfg->mlfg_entries);
2015 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2016 			list_insert_tail(&g->mlg_rx_vlans, v);
2017 			mutex_exit(&ft->mlft_mtx);
2018 			return (B_FALSE);
2019 		}
2020 	}
2021 
2022 	fe = v->mlgv_fe;
2023 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
2024 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
2025 	ASSERT3P(fe->mlfe_table, ==, ft);
2026 	ASSERT3P(fe->mlfe_group, ==, fg);
2027 
2028 	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
2029 		list_insert_tail(&g->mlg_rx_vlans, v);
2030 		fe = list_head(&dfg->mlfg_entries);
2031 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
2032 			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2033 		}
2034 		mutex_exit(&ft->mlft_mtx);
2035 		return (B_FALSE);
2036 	}
2037 
2038 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2039 
2040 	kmem_free(v, sizeof (mlxcx_group_vlan_t));
2041 
2042 	mutex_exit(&ft->mlft_mtx);
2043 	return (B_TRUE);
2044 }
2045 
2046 boolean_t
2047 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
2048     uint16_t vid)
2049 {
2050 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2051 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2052 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2053 	mlxcx_flow_entry_t *fe;
2054 	mlxcx_group_vlan_t *v;
2055 	boolean_t found = B_FALSE;
2056 	boolean_t first = B_FALSE;
2057 
2058 	ASSERT(mutex_owned(&g->mlg_mtx));
2059 
2060 	mutex_enter(&ft->mlft_mtx);
2061 
2062 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2063 	    v = list_next(&g->mlg_rx_vlans, v)) {
2064 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2065 			mutex_exit(&ft->mlft_mtx);
2066 			return (B_TRUE);
2067 		}
2068 	}
2069 	if (list_is_empty(&g->mlg_rx_vlans))
2070 		first = B_TRUE;
2071 
2072 	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2073 	    fe = list_next(&fg->mlfg_entries, fe)) {
2074 		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2075 			found = B_TRUE;
2076 			break;
2077 		}
2078 	}
2079 	if (!found) {
2080 		mutex_exit(&ft->mlft_mtx);
2081 		return (B_FALSE);
2082 	}
2083 
2084 	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
2085 	v->mlgv_fe = fe;
2086 	v->mlgv_tagged = tagged;
2087 	v->mlgv_vid = vid;
2088 
2089 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2090 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2091 	fe->mlfe_vid = vid;
2092 	if (tagged) {
2093 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
2094 	} else {
2095 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
2096 	}
2097 
2098 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2099 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2100 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2101 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
2102 		mutex_exit(&ft->mlft_mtx);
2103 		return (B_FALSE);
2104 	}
2105 
2106 	list_insert_tail(&g->mlg_rx_vlans, v);
2107 
2108 	/*
2109 	 * If the vlan list was empty for this group before adding this one,
2110 	 * then we no longer want the "default" entry to allow all VLANs
2111 	 * through.
2112 	 */
2113 	if (first) {
2114 		fe = list_head(&dfg->mlfg_entries);
2115 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2116 	}
2117 
2118 	mutex_exit(&ft->mlft_mtx);
2119 	return (B_TRUE);
2120 }
2121 
2122 void
2123 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
2124     mlxcx_ring_group_t *group)
2125 {
2126 	mlxcx_flow_entry_t *fe;
2127 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2128 	mlxcx_group_mac_t *gm, *ngm;
2129 
2130 	ASSERT(mutex_owned(&port->mlp_mtx));
2131 	ASSERT(mutex_owned(&group->mlg_mtx));
2132 
2133 	mutex_enter(&ft->mlft_mtx);
2134 
2135 	gm = avl_first(&group->mlg_rx_macs);
2136 	for (; gm != NULL; gm = ngm) {
2137 		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
2138 
2139 		ASSERT3P(gm->mlgm_group, ==, group);
2140 		fe = gm->mlgm_fe;
2141 		ASSERT3P(fe->mlfe_table, ==, ft);
2142 
2143 		avl_remove(&group->mlg_rx_macs, gm);
2144 		list_remove(&fe->mlfe_ring_groups, gm);
2145 		kmem_free(gm, sizeof (mlxcx_group_mac_t));
2146 
2147 		fe->mlfe_ndest = 0;
2148 		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2149 		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2150 			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2151 			    gm->mlgm_group->mlg_rx_vlan_ft;
2152 		}
2153 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2154 
2155 		if (fe->mlfe_ndest > 0) {
2156 			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2157 			continue;
2158 		}
2159 
2160 		/*
2161 		 * There are no more ring groups left for this MAC (it wasn't
2162 		 * attached to any other groups since ndest == 0), so clean up
2163 		 * its flow entry.
2164 		 */
2165 		avl_remove(&port->mlp_dmac_fe, fe);
2166 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2167 		list_destroy(&fe->mlfe_ring_groups);
2168 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2169 	}
2170 
2171 	mutex_exit(&ft->mlft_mtx);
2172 }
2173 
2174 boolean_t
2175 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2176     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2177 {
2178 	mlxcx_flow_entry_t *fe;
2179 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2180 	mlxcx_group_mac_t *gm, probe;
2181 
2182 	ASSERT(mutex_owned(&port->mlp_mtx));
2183 	ASSERT(mutex_owned(&group->mlg_mtx));
2184 
2185 	bzero(&probe, sizeof (probe));
2186 	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2187 
2188 	mutex_enter(&ft->mlft_mtx);
2189 
2190 	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2191 	if (gm == NULL) {
2192 		mutex_exit(&ft->mlft_mtx);
2193 		return (B_FALSE);
2194 	}
2195 	ASSERT3P(gm->mlgm_group, ==, group);
2196 	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2197 
2198 	fe = gm->mlgm_fe;
2199 	ASSERT3P(fe->mlfe_table, ==, ft);
2200 	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2201 
2202 	list_remove(&fe->mlfe_ring_groups, gm);
2203 	avl_remove(&group->mlg_rx_macs, gm);
2204 	kmem_free(gm, sizeof (mlxcx_group_mac_t));
2205 
2206 	fe->mlfe_ndest = 0;
2207 	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2208 	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2209 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2210 		    gm->mlgm_group->mlg_rx_vlan_ft;
2211 	}
2212 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2213 
2214 	if (fe->mlfe_ndest > 0) {
2215 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2216 			mutex_exit(&ft->mlft_mtx);
2217 			return (B_FALSE);
2218 		}
2219 		mutex_exit(&ft->mlft_mtx);
2220 		return (B_TRUE);
2221 	}
2222 
2223 	/*
2224 	 * There are no more ring groups left for this MAC (it wasn't attached
2225 	 * to any other groups since ndest == 0), so clean up its flow entry.
2226 	 */
2227 	avl_remove(&port->mlp_dmac_fe, fe);
2228 	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2229 	list_destroy(&fe->mlfe_ring_groups);
2230 
2231 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2232 
2233 	mutex_exit(&ft->mlft_mtx);
2234 
2235 	return (B_TRUE);
2236 }
2237 
2238 boolean_t
2239 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2240     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2241 {
2242 	mlxcx_flow_group_t *fg;
2243 	mlxcx_flow_entry_t *fe, probe;
2244 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2245 	mlxcx_group_mac_t *gm;
2246 	boolean_t found = B_FALSE;
2247 
2248 	ASSERT(mutex_owned(&port->mlp_mtx));
2249 	ASSERT(mutex_owned(&group->mlg_mtx));
2250 
2251 	bzero(&probe, sizeof (probe));
2252 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2253 
2254 	mutex_enter(&ft->mlft_mtx);
2255 
2256 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2257 
2258 	if (fe == NULL) {
2259 		fg = port->mlp_umcast;
2260 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2261 		    fe = list_next(&fg->mlfg_entries, fe)) {
2262 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2263 				found = B_TRUE;
2264 				break;
2265 			}
2266 		}
2267 		if (!found) {
2268 			mutex_exit(&ft->mlft_mtx);
2269 			return (B_FALSE);
2270 		}
2271 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2272 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2273 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2274 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2275 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2276 
2277 		avl_add(&port->mlp_dmac_fe, fe);
2278 	}
2279 
2280 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2281 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2282 
2283 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2284 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2285 		if (--fe->mlfe_ndest == 0) {
2286 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2287 		}
2288 		mutex_exit(&ft->mlft_mtx);
2289 		return (B_FALSE);
2290 	}
2291 
2292 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2293 	gm->mlgm_group = group;
2294 	gm->mlgm_fe = fe;
2295 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2296 	avl_add(&group->mlg_rx_macs, gm);
2297 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2298 
2299 	mutex_exit(&ft->mlft_mtx);
2300 
2301 	return (B_TRUE);
2302 }
2303 
2304 boolean_t
2305 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2306     mlxcx_flow_group_t *fg)
2307 {
2308 	mlxcx_flow_entry_t *fe;
2309 	uint_t i, idx;
2310 
2311 	ASSERT(mutex_owned(&ft->mlft_mtx));
2312 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2313 	ASSERT3P(fg->mlfg_table, ==, ft);
2314 
2315 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2316 		return (B_FALSE);
2317 	fg->mlfg_start_idx = ft->mlft_next_ent;
2318 
2319 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2320 		return (B_FALSE);
2321 	}
2322 
2323 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2324 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2325 	for (i = 0; i < fg->mlfg_size; ++i) {
2326 		idx = fg->mlfg_start_idx + i;
2327 		fe = &ft->mlft_ent[idx];
2328 		fe->mlfe_group = fg;
2329 		list_insert_tail(&fg->mlfg_entries, fe);
2330 	}
2331 	fg->mlfg_avail = fg->mlfg_size;
2332 	ft->mlft_next_ent += fg->mlfg_size;
2333 
2334 	return (B_TRUE);
2335 }
2336 
2337 static boolean_t
2338 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
2339 {
2340 	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];
2341 
2342 	mutex_enter(&mleq->mleq_mtx);
2343 	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2344 		/* mlxcx_teardown_eqs() will clean this up */
2345 		mutex_exit(&mleq->mleq_mtx);
2346 		return (B_FALSE);
2347 	}
2348 	mleq->mleq_mlx = mlxp;
2349 	mleq->mleq_uar = &mlxp->mlx_uar;
2350 	mleq->mleq_events = events;
2351 	mleq->mleq_intr_index = vec;
2352 
2353 	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2354 		/* mlxcx_teardown_eqs() will clean this up */
2355 		mutex_exit(&mleq->mleq_mtx);
2356 		return (B_FALSE);
2357 	}
2358 
2359 	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
2360 		/*
2361 		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2362 		 * eq_rele_dma
2363 		 */
2364 		mutex_exit(&mleq->mleq_mtx);
2365 		return (B_FALSE);
2366 	}
2367 	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2368 	mleq->mleq_state |= MLXCX_EQ_ATTACHING;
2369 	mlxcx_arm_eq(mlxp, mleq);
2370 	mutex_exit(&mleq->mleq_mtx);
2371 
2372 	return (B_TRUE);
2373 }
2374 
2375 static void
2376 mlxcx_eq_set_attached(mlxcx_t *mlxp)
2377 {
2378 	uint_t vec;
2379 	mlxcx_event_queue_t *mleq;
2380 
2381 	for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
2382 		mleq = &mlxp->mlx_eqs[vec];
2383 
2384 		mutex_enter(&mleq->mleq_mtx);
2385 		mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
2386 		mutex_exit(&mleq->mleq_mtx);
2387 	}
2388 }
2389 
2390 static boolean_t
2391 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2392 {
2393 	boolean_t ret;
2394 
2395 	ret = mlxcx_setup_eq(mlxp, 0,
2396 	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2397 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2398 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2399 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2400 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2401 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2402 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2403 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2404 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2405 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2406 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2407 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2408 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2409 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2410 
2411 	if (ret)
2412 		mlxcx_cmd_eq_enable(mlxp);
2413 
2414 	return (ret);
2415 }
2416 
2417 int
2418 mlxcx_cq_compare(const void *arg0, const void *arg1)
2419 {
2420 	const mlxcx_completion_queue_t *left = arg0;
2421 	const mlxcx_completion_queue_t *right = arg1;
2422 
2423 	if (left->mlcq_num < right->mlcq_num) {
2424 		return (-1);
2425 	}
2426 	if (left->mlcq_num > right->mlcq_num) {
2427 		return (1);
2428 	}
2429 	return (0);
2430 }
2431 
2432 static boolean_t
2433 mlxcx_setup_eqs(mlxcx_t *mlxp)
2434 {
2435 	uint_t i;
2436 	mlxcx_event_queue_t *mleq;
2437 
2438 	ASSERT3S(mlxp->mlx_intr_count, >, 0);
2439 
2440 	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
2441 		mleq = &mlxp->mlx_eqs[i];
2442 		mutex_enter(&mleq->mleq_mtx);
2443 		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2444 			mutex_exit(&mleq->mleq_mtx);
2445 			return (B_FALSE);
2446 		}
2447 		mleq->mleq_uar = &mlxp->mlx_uar;
2448 		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2449 			/* mlxcx_teardown() will handle calling eq_rele_dma */
2450 			mutex_exit(&mleq->mleq_mtx);
2451 			return (B_FALSE);
2452 		}
2453 		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2454 		    !mlxcx_cmd_set_int_mod(mlxp, i,
2455 		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
2456 			mutex_exit(&mleq->mleq_mtx);
2457 			return (B_FALSE);
2458 		}
2459 		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2460 			mutex_exit(&mleq->mleq_mtx);
2461 			return (B_FALSE);
2462 		}
2463 		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2464 		mlxcx_arm_eq(mlxp, mleq);
2465 		mutex_exit(&mleq->mleq_mtx);
2466 	}
2467 
2468 	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
2469 
2470 	return (B_TRUE);
2471 }
2472 
2473 /*
2474  * Snapshot all of the hardware capabilities that we care about and then modify
2475  * the HCA capabilities to get things moving.
2476  */
2477 static boolean_t
2478 mlxcx_init_caps(mlxcx_t *mlxp)
2479 {
2480 	mlxcx_caps_t *c;
2481 
2482 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2483 
2484 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2485 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2486 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2487 	}
2488 
2489 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2490 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2491 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2492 	}
2493 
2494 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2495 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2496 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2497 	}
2498 
2499 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2500 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2501 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2502 	}
2503 
2504 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2505 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2506 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2507 	}
2508 
2509 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2510 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2511 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2512 	}
2513 
2514 	/*
2515 	 * Check the caps meet our requirements.
2516 	 */
2517 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2518 
2519 	if (gen->mlcap_general_log_pg_sz != 12) {
2520 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2521 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2522 		goto err;
2523 	}
2524 	if (gen->mlcap_general_cqe_version != 1) {
2525 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2526 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2527 		goto err;
2528 	}
2529 	if (gen->mlcap_general_port_type !=
2530 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2531 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2532 		goto err;
2533 	}
2534 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2535 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2536 
2537 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2538 
2539 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2540 	    MLXCX_ETH_CAP_CSUM_CAP);
2541 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2542 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2543 
2544 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2545 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2546 	if (c->mlc_max_lso_size == 1) {
2547 		c->mlc_max_lso_size = 0;
2548 		c->mlc_lso = B_FALSE;
2549 	} else {
2550 		c->mlc_lso = B_TRUE;
2551 	}
2552 
2553 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2554 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2555 
2556 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2557 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2558 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2559 		goto err;
2560 	}
2561 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2562 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2563 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2564 		    "flow table entries");
2565 		goto err;
2566 	}
2567 
2568 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2569 	    mlcap_flow_prop_log_max_ft_size;
2570 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2571 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2572 	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2573 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2574 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2575 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2576 
2577 	return (B_TRUE);
2578 
2579 err:
2580 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2581 	return (B_FALSE);
2582 }
2583 
2584 static int
2585 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2586 {
2587 	mlxcx_t *mlxp;
2588 
2589 	if (cmd != DDI_DETACH)
2590 		return (DDI_FAILURE);
2591 
2592 	mlxp = ddi_get_driver_private(dip);
2593 	if (mlxp == NULL) {
2594 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2595 		    "private data");
2596 		return (DDI_FAILURE);
2597 	}
2598 
2599 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2600 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2601 			return (DDI_FAILURE);
2602 		}
2603 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2604 	}
2605 
2606 	mlxcx_teardown(mlxp);
2607 	return (DDI_SUCCESS);
2608 }
2609 
2610 static size_t
2611 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2612 {
2613 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2614 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2615 	size_t tirlim, flowlim, gflowlim;
2616 
2617 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2618 	if (tirlim < ngroups) {
2619 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2620 		    "on number of TIRs available", tirlim);
2621 		ngroups = tirlim;
2622 	}
2623 
2624 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2625 	if (flowlim < ngroups) {
2626 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2627 		    "on max size of RX flow tables", flowlim);
2628 		ngroups = flowlim;
2629 	}
2630 
2631 	/*
2632 	 * Restrict the number of groups not to exceed the max flow
2633 	 * table number from the devices capabilities.
2634 	 * There is one root table entry per port and 2 entries per
2635 	 * group.
2636 	 */
2637 	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2638 	if (flowlim < ngroups) {
2639 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2640 		    "on max number of RX flow tables",
2641 		    flowlim);
2642 		ngroups = flowlim;
2643 	}
2644 
2645 	do {
2646 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2647 		if (gflowlim < ngroups) {
2648 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2649 			    "based on max total RX flows", gflowlim);
2650 			--ngroups;
2651 		}
2652 	} while (gflowlim < ngroups);
2653 
2654 	return (ngroups);
2655 }
2656 
2657 static int
2658 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2659 {
2660 	mlxcx_t *mlxp;
2661 	char tq_name[TASKQ_NAMELEN];
2662 	uint_t i;
2663 	int inst, ret;
2664 
2665 	if (cmd != DDI_ATTACH)
2666 		return (DDI_FAILURE);
2667 
2668 	inst = ddi_get_instance(dip);
2669 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2670 	if (ret != 0)
2671 		return (ret);
2672 
2673 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2674 	if (mlxp == NULL)
2675 		return (DDI_FAILURE);
2676 	mlxp->mlx_dip = dip;
2677 	mlxp->mlx_inst = inst;
2678 	ddi_set_driver_private(dip, mlxp);
2679 
2680 	mlxcx_load_props(mlxp);
2681 
2682 	mlxcx_fm_init(mlxp);
2683 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2684 
2685 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2686 	    DDI_SUCCESS) {
2687 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2688 		goto err;
2689 	}
2690 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2691 
2692 	if (!mlxcx_regs_map(mlxp)) {
2693 		goto err;
2694 	}
2695 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2696 
2697 	if (!mlxcx_cmd_queue_init(mlxp)) {
2698 		goto err;
2699 	}
2700 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2701 
2702 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2703 		goto err;
2704 	}
2705 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2706 
2707 	if (!mlxcx_check_issi(mlxp)) {
2708 		goto err;
2709 	}
2710 
2711 	/*
2712 	 * We have to get our interrupts now so we know what priority to
2713 	 * create pagemtx with.
2714 	 */
2715 	if (!mlxcx_intr_setup(mlxp)) {
2716 		goto err;
2717 	}
2718 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2719 
2720 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2721 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2722 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2723 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2724 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2725 
2726 	/*
2727 	 * Taskq for asynchronous events which may interact with the HCA
2728 	 * via the command interface. Single threaded FIFO.
2729 	 */
2730 	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2731 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2732 	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2733 	    TASKQ_PREPOPULATE);
2734 	/*
2735 	 * Initialize any pre-allocated taskq param structs.
2736 	 */
2737 	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2738 		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2739 		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2740 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2741 	}
2742 	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2743 
2744 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2745 		goto err;
2746 	}
2747 
2748 	if (!mlxcx_init_caps(mlxp)) {
2749 		goto err;
2750 	}
2751 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2752 
2753 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2754 		goto err;
2755 	}
2756 
2757 	if (!mlxcx_cmd_init_hca(mlxp)) {
2758 		goto err;
2759 	}
2760 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2761 
2762 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2763 		goto err;
2764 	}
2765 
2766 	/*
2767 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2768 	 * doorbells.
2769 	 */
2770 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2771 		goto err;
2772 	}
2773 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2774 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2775 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2776 	}
2777 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2778 
2779 	/*
2780 	 * Set up asynchronous event queue which handles control type events
2781 	 * like PAGE_REQUEST and CMD completion events.
2782 	 *
2783 	 * This will enable and arm the interrupt on EQ 0. Note that only page
2784 	 * reqs and cmd completions will be handled until we call
2785 	 * mlxcx_eq_set_attached further down (this way we don't need an extra
2786 	 * set of locks over the mlxcx_t sub-structs not allocated yet)
2787 	 */
2788 	if (!mlxcx_setup_async_eqs(mlxp)) {
2789 		goto err;
2790 	}
2791 
2792 	/*
2793 	 * Allocate a protection and transport domain. These don't really do
2794 	 * anything for us (they're IB concepts), but we need to give their
2795 	 * ID numbers in other commands.
2796 	 */
2797 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2798 		goto err;
2799 	}
2800 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2801 		goto err;
2802 	}
2803 	/*
2804 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2805 	 * work queue entries, rather than having to mess with the NIC's
2806 	 * internal MMU.
2807 	 */
2808 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2809 		goto err;
2810 	}
2811 
2812 	/*
2813 	 * Query our port information and current state, populate the
2814 	 * mlxcx_port_t structs.
2815 	 *
2816 	 * This also sets up the root flow tables and flow groups.
2817 	 */
2818 	if (!mlxcx_setup_ports(mlxp)) {
2819 		goto err;
2820 	}
2821 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2822 
2823 	mlxcx_load_model_props(mlxp);
2824 
2825 	/*
2826 	 * Set up, enable and arm the rest of the interrupt EQs which will
2827 	 * service events from CQs.
2828 	 *
2829 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2830 	 * cleaned up.
2831 	 */
2832 	if (!mlxcx_setup_eqs(mlxp)) {
2833 		goto err;
2834 	}
2835 
2836 	/* Completion queues */
2837 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2838 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2839 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2840 
2841 	/* Work queues (send queues, receive queues) */
2842 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2843 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2844 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2845 
2846 	/*
2847 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2848 	 * "groups" we advertise to MAC.
2849 	 */
2850 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2851 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2852 	    sizeof (mlxcx_ring_group_t);
2853 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2854 
2855 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2856 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2857 	    sizeof (mlxcx_ring_group_t);
2858 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2859 
2860 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2861 
2862 	/*
2863 	 * Sets up the free/busy buffers list for keeping track of packet
2864 	 * buffers.
2865 	 */
2866 	if (!mlxcx_setup_bufs(mlxp))
2867 		goto err;
2868 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2869 
2870 	/*
2871 	 * Before we tell MAC about our rings/groups, we need to do enough
2872 	 * setup on them to be sure about the numbers and configuration that
2873 	 * we have. This will do basically everything short of allocating
2874 	 * packet buffers and starting the rings up.
2875 	 */
2876 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2877 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2878 			goto err;
2879 	}
2880 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2881 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2882 			goto err;
2883 	}
2884 
2885 	/*
	/*
	 * Set up the periodic fault check timers, which check the queue
	 * states. This must happen after all the queues have been
	 * initialized; consequently, the timers must be torn down before
	 * the queues are.
	 */
2891 	if (!mlxcx_setup_checktimers(mlxp)) {
2892 		goto err;
2893 	}
2894 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2895 
2896 	/*
2897 	 * Some devices may not have a working temperature sensor; however,
2898 	 * there isn't a great way for us to know. We shouldn't fail attach if
2899 	 * this doesn't work.
2900 	 */
2901 	if (mlxcx_setup_sensors(mlxp)) {
2902 		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
2903 	}
2904 
2905 	/*
2906 	 * Finally, tell MAC that we exist!
2907 	 */
2908 	if (!mlxcx_register_mac(mlxp)) {
2909 		goto err;
2910 	}
2911 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2912 
2913 	/*
2914 	 * This tells the interrupt handlers they can start processing events
2915 	 * other than cmd completions and page requests.
2916 	 */
2917 	mlxcx_eq_set_attached(mlxp);
2918 
2919 	return (DDI_SUCCESS);
2920 
2921 err:
2922 	mlxcx_teardown(mlxp);
2923 	return (DDI_FAILURE);
2924 }
2925 
2926 static struct cb_ops mlxcx_cb_ops = {
2927 	.cb_open = nulldev,
2928 	.cb_close = nulldev,
2929 	.cb_strategy = nodev,
2930 	.cb_print = nodev,
2931 	.cb_dump = nodev,
2932 	.cb_read = nodev,
2933 	.cb_write = nodev,
2934 	.cb_ioctl = nodev,
2935 	.cb_devmap = nodev,
2936 	.cb_mmap = nodev,
2937 	.cb_segmap = nodev,
2938 	.cb_chpoll = nochpoll,
2939 	.cb_prop_op = ddi_prop_op,
2940 	.cb_flag = D_MP,
2941 	.cb_rev = CB_REV,
2942 	.cb_aread = nodev,
2943 	.cb_awrite = nodev
2944 };
2945 
2946 static struct dev_ops mlxcx_dev_ops = {
2947 	.devo_rev = DEVO_REV,
2948 	.devo_refcnt = 0,
2949 	.devo_getinfo = NULL,
2950 	.devo_identify = nulldev,
2951 	.devo_probe = nulldev,
2952 	.devo_attach = mlxcx_attach,
2953 	.devo_detach = mlxcx_detach,
2954 	.devo_reset = nodev,
2955 	.devo_quiesce = ddi_quiesce_not_supported,
2956 	.devo_cb_ops = &mlxcx_cb_ops
2957 };
2958 
2959 static struct modldrv mlxcx_modldrv = {
2960 	.drv_modops = &mod_driverops,
2961 	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
2962 	.drv_dev_ops = &mlxcx_dev_ops
2963 };
2964 
2965 static struct modlinkage mlxcx_modlinkage = {
2966 	.ml_rev = MODREV_1,
2967 	.ml_linkage = { &mlxcx_modldrv, NULL }
2968 };
2969 
2970 int
2971 _init(void)
2972 {
2973 	int ret;
2974 
2975 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
2976 	if (ret != 0) {
2977 		return (ret);
2978 	}
2979 
2980 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
2981 
2982 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2983 		mac_fini_ops(&mlxcx_dev_ops);
2984 		ddi_soft_state_fini(&mlxcx_softstate);
2985 		return (ret);
2986 	}
2987 
2988 	return (DDI_SUCCESS);
2989 }
2990 
2991 int
2992 _info(struct modinfo *modinfop)
2993 {
2994 	return (mod_info(&mlxcx_modlinkage, modinfop));
2995 }
2996 
2997 int
2998 _fini(void)
2999 {
3000 	int ret;
3001 
3002 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3003 		return (ret);
3004 	}
3005 
3006 	mac_fini_ops(&mlxcx_dev_ops);
3007 
3008 	ddi_soft_state_fini(&mlxcx_softstate);
3009 
3010 	return (DDI_SUCCESS);
3011 }
3012