1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2023, The University of Queensland
14 * Copyright (c) 2018, Joyent, Inc.
15 * Copyright 2023 RackTop Systems, Inc.
16 * Copyright 2023 MNX Cloud, Inc.
17 */
18
19 /*
20 * Mellanox Connect-X 4/5/6 driver.
21 */
22
23 /*
24 * The PRM for this family of parts was freely available at:
25 *
26 * https://www.mellanox.com/related-docs/user_manuals/ \
27 * Ethernet_Adapters_Programming_Manual.pdf
28 *
29 * but has since disappeared.
30 */
31 /*
32 * ConnectX glossary
33 * -----------------
34 *
35 * WR Work Request: something we've asked the hardware to do by
36 * creating a Work Queue Entry (WQE), e.g. send or recv a packet
37 *
38 * WQE Work Queue Entry: a descriptor on a work queue descriptor ring
39 *
40 * WQ Work Queue: a descriptor ring that we can place WQEs on, usually
41 * either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
42 * types have different WQE structures, different commands for
43 * creating and destroying them, etc, but share a common context
44 * structure, counter setup and state graph.
45 * SQ Send Queue, a specific type of WQ that sends packets
46 * RQ Receive Queue, a specific type of WQ that receives packets
47 *
48 * CQ	Completion Queue: completions of WRs from a WQ are reported to
49 * one of these, as a CQE on its entry ring.
50 * CQE Completion Queue Entry: an entry in a CQ ring. Contains error
51 * info, as well as packet size, the ID of the WQ, and the index
52 * of the WQE which completed. Does not contain any packet data.
53 *
54 * EQ Event Queue: a ring of event structs from the hardware informing
55 * us when particular events happen. Many events can point at a
56 * particular CQ which we should then go look at.
57 * EQE Event Queue Entry: an entry on the EQ ring
58 *
59 * UAR User Access Region, a page of the device's PCI BAR which is
60 * tied to particular EQ/CQ/WQ sets and contains doorbells to
61 * ring to arm them for interrupts or wake them up for new work
62 *
63 * RQT RQ Table, a collection of indexed RQs used to refer to the group
64 * as a single unit (e.g. for hashing/RSS).
65 *
66 * TIR Transport Interface Receive, a bucket of resources for the
67 * reception of packets. TIRs have to point at either a single RQ
68 * or a table of RQs (RQT). They then serve as a target for flow
69 * table entries (FEs). TIRs that point at an RQT also contain the
70 * settings for hashing for RSS.
71 *
72 * TIS Transport Interface Send, a bucket of resources associated with
73 * the transmission of packets. In particular, the temporary
74 * resources used for LSO internally in the card are accounted to
75 * a TIS.
76 *
77 * FT Flow Table, a collection of FEs and FGs that can be referred to
78 * as a single entity (e.g. used as a target from another flow
79 * entry or set as the "root" table to handle incoming or outgoing
80 * packets). Packets arriving at a FT are matched against the
81 * FEs in the table until either one matches with a terminating
82 * action or all FEs are exhausted (it's first-match-wins but with
83 * some actions that are non-terminal, like counting actions).
84 *
85 * FG Flow Group, a group of FEs which share a common "mask" (i.e.
86 * they match on the same attributes of packets coming into the
87 * flow).
88 *
89 * FE Flow Entry, an individual set of values to match against
90 * packets entering the flow table, combined with an action to
91 * take upon a successful match. The action we use most is
92 * "forward", which sends the packets to a TIR or another flow
93 * table and then stops further processing within the FE's FT.
94 *
95 * lkey/mkey A reference to something similar to a page table but in the
96 * device's internal onboard MMU. Since Connect-X parts double as
97 * IB cards (lots of RDMA) they have extensive onboard memory mgmt
98 * features which we try very hard not to use. For our WQEs we use
99 * the "reserved" lkey, which is a special value which indicates
100 * that addresses we give are linear addresses and should not be
101 * translated.
102 *
103 * PD Protection Domain, an IB concept. We have to allocate one to
104 * provide as a parameter for new WQs, but we don't do anything
105 * with it.
106 *
107 * TDOM/TD Transport Domain, an IB concept. We allocate one in order to
108 * provide it as a parameter to TIR/TIS creation, but we don't do
109 * anything with it.
110 */
111 /*
112 *
113 * Data flow overview
114 * ------------------
115 *
116 * This driver is a MAC ring-enabled driver which maps rings to send and recv
117 * queues in hardware on the device.
118 *
119 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
120 * sufficient space, and simplify the logic needed to work out which buffer
121 * was completed.
122 *
123 * The CQs are then round-robin allocated onto EQs, of which we set up one per
124 * interrupt that the system gives us for the device. Normally this means we
125 * have 8 EQs.
126 *
127 * When we have >= 8 EQs available, we try to allocate only RX or only TX
128 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
129 *
130 * EQ #0 is reserved for all event types other than completion events, and has
131 * no CQs associated with it at any time. EQs #1 and upwards are only used for
132 * handling CQ completion events.
133 *
134 * +------+ +------+ +------+ +---------+
135 * | SQ 0 |---->| CQ 0 |-----+ | EQ 0 |------> | MSI-X 0 | mlxcx_intr_0
136 * +------+ +------+ | +------+ +---------+
137 * |
138 * +------+ +------+ |
139 * | SQ 1 |---->| CQ 1 |---+ | +------+
140 * +------+ +------+ | +---> | |
141 * | | |
142 * +------+ +------+ | | EQ 1 | +---------+
143 * | SQ 2 |---->| CQ 2 |---------> | |------> | MSI-X 1 | mlxcx_intr_n
144 * +------+ +------+ | +---> | | +---------+
145 * | | +------+
146 * | |
147 * ... | |
148 * | | +------+
149 * +------+ +------+ +-----> | |
150 * | RQ 0 |---->| CQ 3 |---------> | | +---------+
151 * +------+ +------+ | | EQ 2 |------> | MSI-X 2 | mlxcx_intr_n
152 * | | | +---------+
153 * +------+ +------+ | +-> | |
154 * | RQ 1 |---->| CQ 4 |-----+ | +------+
155 * +------+ +------+ |
156 * | ....
157 * +------+ +------+ |
158 * | RQ 2 |---->| CQ 5 |-------+
159 * +------+ +------+
160 *
161 * ... (note this diagram does not show RX-only or TX-only EQs)
162 *
163 * For TX, we advertise all of the SQs we create as plain rings to MAC with
164 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
165 * and use the rings as it sees fit.
166 *
167 * For RX, we advertise actual groups in order to make use of hardware
168 * classification.
169 *
170 * The hardware classification we use is based around Flow Tables, and we
171 * currently ignore all of the eswitch features of the card. The NIC VPORT
172 * is always set to promisc mode so that the eswitch sends us all of the
173 * traffic that arrives on the NIC, and we use flow entries to manage
174 * everything.
175 *
176 * We use 2 layers of flow tables for classification: traffic arrives at the
177 * root RX flow table which contains MAC address filters. Those then send
178 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
179 * presence and VID filters.
180 *
181 * Since these parts only support doing RSS hashing on a single protocol at a
182 * time, we have to use a third layer of flow tables as well to break traffic
183 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
184 * so that it can be sent to the appropriate TIR for hashing.
185 *
186 * Incoming packets
187 * + +---------+ +---------+
188 * | +->| group 0 | | group 0 |
189 * | | | vlan ft | +-->| hash ft |
190 * v | | L1 | | | L2 |
191 * +----+----+ | +---------+ | +---------+ +-----+ +-----+------+
192 * | eswitch | | | | | | TCPv6 |--->| TIR |--->| | RQ0 |
193 * +----+----+ | | | | +---------+ +-----+ | +------+
194 * | | | | | | UDPv6 |--->| TIR |--->| | RQ1 |
195 * | | | | | +---------+ +-----+ | +------+
196 * | | | | | | TCPv4 |--->| TIR |--->| | RQ2 |
197 * v | | | | +---------+ +-----+ | RQT +------+
198 * +----+----+ | +---------+ | | UDPv4 |--->| TIR |--->| | ... |
199 * | root rx | | | default |--+ +---------+ +-----+ | | |
200 * | flow tb | | +---------+ | | IPv6 |--->| TIR |--->| | |
201 * | L0 | | | promisc |--+ +---------+ +-----+ | | |
202 * +---------+ | +---------+ ^ | IPv4 |--->| TIR |--->| | |
203 * | bcast |---|---------------+ +---------+ +-----+ +-----+------+
204 * +---------+ | ^ | other |-+
205 * | MAC 0 |---+ | +---------+ | +-----+ +-----+
206 * +---------+ | +->| TIR |--->| RQ0 |
207 * | MAC 1 |-+ | +-----+ +-----+
208 * +---------+ | +---------------+
209 * | MAC 2 |-+ | ^
210 * +---------+ | | |
211 * | MAC 3 |-+ | +---------+ | +---------+
212 * +---------+ | | | group 1 | | | group 1 |
213 * | ..... | +--->| vlan ft | | +>| hash ft |
214 * | | | | L1 | | | | L2 |
215 * +---------+ | +---------+ | | +---------+ +-----+ +-----+------+
216 * | promisc |---+ | VLAN 0 |----+ | TCPv6 |--->| TIR |--->| | RQ3 |
217 * +---------+ +---------+ | +---------+ +-----+ | +------+
218 * | ..... | | | UDPv6 |--->| TIR |--->| | RQ4 |
219 * | | | +---------+ +-----+ | +------+
220 * | | | | TCPv4 |--->| TIR |--->| | RQ5 |
221 * | | | +---------+ +-----+ | RQT +------+
222 * +---------+ | | UDPv4 |--->| TIR |--->| | ... |
223 * | | | +---------+ +-----+ | | |
224 * +---------+ | | IPv6 |--->| TIR |--->| | |
225 * | promisc |--+ +---------+ +-----+ | | |
226 * +---------+ | IPv4 |--->| TIR |--->| | |
227 * +---------+ +-----+ +-----+------+
228 * | other |-+
229 * +---------+ |
230 * ....... | +-----+ +-----+
231 * +->| TIR |--->| RQ3 |
232 * +-----+ +-----+
233 *
234 * Note that the "promisc" flow entries are only set/enabled when promisc
235 * mode is enabled for the NIC. All promisc flow entries point directly at
236 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
237 * the "default group" in MAC).
238 *
239 * The "default" entry in the L1 VLAN filter flow tables is used when there
240 * are no VLANs set for the group, to accept any traffic regardless of tag. It
241 * is deleted as soon as a VLAN filter is added (and re-instated if the
242 * last VLAN filter is removed).
243 *
244 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
245 * space for packet data (they're a collection of scatter pointers only). TX
246 * descriptors contain some space for "inline headers" (and the card requires
247 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
248 * but all the rest of the data comes from the gather pointers.
249 *
250 * When we get completions back they simply contain the ring index number of
251 * the WR (work request) which completed. So, we manage the buffers for actual
252 * packet data completely independently of the descriptors in this driver. When
253 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
254 * with the WQE index that we put it at, and therefore don't have to look at
255 * the original descriptor at all when handling completions.
256 *
257 * For RX, we create sufficient packet data buffers to fill 150% of the
258 * available descriptors for each ring. These all are pre-set-up for DMA and
259 * have an mblk_t associated with them (with desballoc()).
260 *
261 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
262 * large enough), or we copy it into a pre-allocated buffer set up in the same
263 * way as for RX.
264 */
265
266 /*
267 * Buffer lifecycle: RX
268 * --------------------
269 *
270 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
271 * straightforward.
272 *
273 * It is created (and has all its memory allocated) at the time of starting up
274 * the RX ring it belongs to. Then it is placed on the "free" list in the
275 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
276 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
277 * before making a WQE for it.
278 *
279 * After a completion event occurs, the packet is either discarded (and the
280 * buffer_t returned to the free list), or it is readied for loaning to MAC
281 * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
282 *
283 * Once MAC and the rest of the system have finished with the packet, they call
284 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
285 * the fate of the buffer_t is determined by the state of the
286 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
287 * will be returned to the free list, potentially to be recycled and used
288 * again. But if the shard is draining (e.g. after a ring stop) there will be
289 * no recycling and the buffer_t is immediately destroyed.
290 *
291 * At detach/teardown time, buffers are only ever destroyed from the free list.
292 *
293 *
294 * +
295 * |
296 * | mlxcx_buf_create
297 * |
298 * v
299 * +----+----+
300 * | created |
301 * +----+----+ +------+
302 * | | dead |
303 * | +------+
304 * | mlxcx_buf_return ^
305 * | |
306 * v | mlxcx_buf_destroy
307 * mlxcx_buf_destroy +----+----+ +-----------+ |
308 * +---------| free |<------no-| draining? |-yes-+
309 * | +----+----+ +-----------+
310 * | | ^
311 * | | |
312 * v | mlxcx_buf_take | mlxcx_buf_return
313 * +---+--+ v |
314 * | dead | +---+---+ |
315 * +------+ | on WQ |- - - - - - - - >O
316 * +---+---+ ^
317 * | |
318 * | |
319 * | mlxcx_buf_loan | mlxcx_buf_mp_return
320 * v |
321 * +-------+--------+ |
322 * | on loan to MAC |----------->O
323 * +----------------+ freemsg()
324 *
325 */
326
327 /*
328 * Buffer lifecycle: TX
329 * --------------------
330 *
331 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
332 * "foreign" buffers.
333 *
334 * The former have their memory allocated and DMA bound by this driver, while
335 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
336 * not owned by us, though we do DMA bind it (and take responsibility for
337 * un-binding it when we're done with them).
338 *
339 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
340 * SQ. Thus, there is a separate free list and mutex for each kind.
341 *
342 * Since a TX packet might consist of multiple mblks, we translate each mblk
343 * into exactly one buffer_t. The buffer_ts are chained together in the same
344 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
345 *
346 * Each chain of TX buffers may consist of foreign or driver buffers, in any
347 * mixture.
348 *
349 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
350 * it from the rest of the chain buffers.
351 *
352 * TX buffer chains are always returned to the free list by
353 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
354 * freeing all of the members.
355 *
356 * We only call freemsg() once, on the head of the TX buffer chain's original
357 * mblk. This is true whether we copied it or bound it in a foreign buffer.
358 */
359
360 /*
361 * Startup and command interface
362 * -----------------------------
363 *
364 * The command interface is the primary way in which we give control orders to
365 * the hardware (e.g. actions like "create this queue" or "delete this flow
366 * entry"). The command interface is never used to transmit or receive packets
367 * -- that takes place only on the queues that are set up through it.
368 *
369 * In mlxcx_cmd.c we implement our use of the command interface on top of a
370 * simple taskq. As commands are submitted from the taskq they choose a
371 * "slot"; if there are no free slots, execution of the command is
372 * paused until one is free. The hardware permits up to 32 independent
373 * slots for concurrent command execution.
374 *
375 * Before interrupts are enabled, command completion is polled; once
376 * interrupts are up, command completions become asynchronous and are
377 * wired to EQ 0. A caveat to this is that commands cannot be submitted
378 * directly from EQ 0's completion handler, and any processing resulting from
379 * an asynchronous event which requires further use of the command interface
380 * is posted through a taskq.
381 *
382 * The startup/attach process for this card involves a bunch of different steps
383 * which are summarised pretty well in the PRM. We have to send a number of
384 * commands which do different things to start the card up, give it some pages
385 * of our own memory for it to use, then start creating all the entities that
386 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
387 * and TDoms.
388 */
389
390 /*
391 * UARs
392 * ----
393 *
394 * The pages of the PCI BAR other than the first few are reserved for use as
395 * "UAR" sections in this device. Each UAR section can be used as a set of
396 * doorbells for our queues.
397 *
398 * Currently we just make one single UAR for all of our queues. It doesn't
399 * seem to be a major limitation yet.
400 *
401 * When we're sending packets through an SQ, the PRM is not awfully clear about
402 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
403 * (it's clear on the pattern of alternation you're expected to use between
404 * even and odd for Blueflame sends, but not for regular doorbells).
405 *
406 * Currently we don't do the even-odd alternating pattern for ordinary
407 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
408 * least on Connect-X4 Lx.
409 */
410
411 /*
412 * Lock ordering
413 * -------------
414 *
415 * Interrupt side:
416 *
417 * - mleq_mtx
418 * - mlcq_arm_mtx
419 * - mlcq_mtx
420 * - mlcq_bufbmtx
421 * - mlwq_mtx
422 * - mlbs_mtx
423 * - mlp_mtx
424 *
425 * GLD side:
426 *
427 * - mlp_mtx
428 * - mlg_mtx
429 * - mlg_*.mlft_mtx
430 * - mlp_*.mlft_mtx
431 * - mlwq_mtx
432 * - mlbs_mtx
433 * - mlcq_bufbmtx
434 * - mleq_mtx
435 * - mlcq_arm_mtx
436 * - mlcq_mtx
437 *
438 */
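/*
 * For example, GLD-side code that needs both a port's lock and a group's
 * lock must take them in the order listed above (illustrative sketch only):
 *
 *	mutex_enter(&port->mlp_mtx);
 *	mutex_enter(&g->mlg_mtx);
 *	...
 *	mutex_exit(&g->mlg_mtx);
 *	mutex_exit(&port->mlp_mtx);
 */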
439
440 #include <sys/modctl.h>
441 #include <sys/conf.h>
442 #include <sys/devops.h>
443 #include <sys/sysmacros.h>
444 #include <sys/time.h>
445 #include <sys/pci.h>
446 #include <sys/mac_provider.h>
447
448 #include <mlxcx.h>
449
450 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
451
452 #define MLXCX_MODULE_NAME "mlxcx"
453 /*
454 * We give this to the firmware, so it has to be in a fixed format that it
455 * understands.
456 */
457 #define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000"
458
459 /*
460 * Firmware may take a while to reclaim pages. Try a set number of times.
461 */
462 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
463 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
464
465 static void *mlxcx_softstate;
466
467 /*
468 * Fault detection thresholds.
469 */
470 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
471 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
472
473 static void
474 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
475 {
476 mlxcx_drv_props_t *p = &mlxp->mlx_props;
477 mlxcx_port_t *port = &mlxp->mlx_ports[0];
478
479 VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
480 VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
481
482 /*
483 * Currently we have two sets of queue size defaults: one for devices
484 * which support a maximum speed of 10Gb/s or below, and another for
485 * those above that.
486 */
487 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
488 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 ||
489 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G |
490 MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G |
491 MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) {
492 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
493 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
494 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
495 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
496 MLXCX_PROTO_10G)) != 0 ||
497 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M |
498 MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) {
499 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
500 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
501 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
502 } else {
503 mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
504 "recognize. Proto: 0x%x", port->mlp_max_proto);
505 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
506 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
507 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
508 }
509 }
510
511 /*
512 * Properties which may have different defaults based on hardware
513 * characteristics.
514 */
515 static void
516 mlxcx_load_model_props(mlxcx_t *mlxp)
517 {
518 mlxcx_drv_props_t *p = &mlxp->mlx_props;
519
520 mlxcx_load_prop_defaults(mlxp);
521
522 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
523 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
524 p->mldp_cq_size_shift_default);
525 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
526 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
527 p->mldp_sq_size_shift_default);
528 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
529 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
530 p->mldp_rq_size_shift_default);
531 }
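/*
 * The *_size_shift properties are log2 of the number of ring entries, so
 * e.g. a cq_size_shift of 10 produces a 1024-entry CQ. A minimal sketch of
 * how such a shift is consumed (cf. mlxcx_eq_alloc_dma() below for the EQ
 * equivalent):
 *
 *	nents = 1 << p->mldp_cq_size_shift;
 */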
532
533 static void
534 mlxcx_load_props(mlxcx_t *mlxp)
535 {
536 mlxcx_drv_props_t *p = &mlxp->mlx_props;
537
538 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
540 MLXCX_EQ_SIZE_SHIFT_DFLT);
541 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
542 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
543 MLXCX_CQEMOD_PERIOD_USEC_DFLT);
544 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
545 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
546 MLXCX_CQEMOD_COUNT_DFLT);
547 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
548 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
549 MLXCX_INTRMOD_PERIOD_USEC_DFLT);
550
551 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
552 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
553 MLXCX_TX_NGROUPS_DFLT);
554 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
555 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
556 MLXCX_TX_NRINGS_PER_GROUP_DFLT);
557
558 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
559 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
560 MLXCX_RX_NGROUPS_LARGE_DFLT);
561 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
562 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
563 MLXCX_RX_NGROUPS_SMALL_DFLT);
564 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
565 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
566 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
567 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
568 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
569 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
570
571 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
572 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
573 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
574
575 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
576 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
577 MLXCX_TX_BIND_THRESHOLD_DFLT);
578
579 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
580 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
581 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
582
583 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
584 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
585 "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
586 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
587 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
588 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
589 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
590 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
591 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
592
593 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
594 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
595 MLXCX_RX_PER_CQ_DEFAULT);
596
597 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
598 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
599 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
600 "out of range. Defaulting to: %d. Valid values are from "
601 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
602 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
603 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
604 }
605
606 p->mldp_rx_p50_loan_min_size = ddi_getprop(DDI_DEV_T_ANY,
607 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
608 "rx_p50_loan_min_size", MLXCX_P50_LOAN_MIN_SIZE_DFLT);
609 }
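/*
 * Because all of the above are fetched with ddi_getprop(9F), they can be
 * overridden from mlxcx.conf. An illustrative example (values are made up,
 * not recommendations):
 *
 *	cq_size_shift = 12;
 *	rx_ngroups_large = 4;
 *	tx_bind_threshold = 2048;
 */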
610
611 void
612 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
613 {
614 va_list ap;
615
616 va_start(ap, fmt);
617 if (mlxp != NULL && mlxp->mlx_dip != NULL) {
618 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
619 } else {
620 vcmn_err(CE_NOTE, fmt, ap);
621 }
622 va_end(ap);
623 }
624
625 void
626 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
627 {
628 va_list ap;
629
630 va_start(ap, fmt);
631 if (mlxp != NULL && mlxp->mlx_dip != NULL) {
632 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
633 } else {
634 vcmn_err(CE_WARN, fmt, ap);
635 }
636 va_end(ap);
637 }
638
639 void
640 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
641 {
642 va_list ap;
643
644 va_start(ap, fmt);
645 if (mlxp != NULL && mlxp->mlx_dip != NULL) {
646 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
647 } else {
648 vcmn_err(CE_PANIC, fmt, ap);
649 }
650 va_end(ap);
651 }
652
653 uint16_t
654 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
655 {
656 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
657 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
658 }
659
660 uint32_t
661 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
662 {
663 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
664 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
665 }
666
667 uint64_t
668 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
669 {
670 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
671 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
672 }
673
674 void
675 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
676 {
677 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
678 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
679 }
680
681 void
682 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
683 {
684 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
685 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
686 }
687
688 void
689 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
690 {
691 /*
692 * The UAR is always inside the first BAR, which we mapped as
693 * mlx_regs.
694 */
695 uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
696 (uintptr_t)mlxp->mlx_regs_base;
697 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
698 }
699
700 void
701 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
702 {
703 uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
704 (uintptr_t)mlxp->mlx_regs_base;
705 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
706 }
707
708 static void
709 mlxcx_fm_fini(mlxcx_t *mlxp)
710 {
711 if (mlxp->mlx_fm_caps == 0)
712 return;
713
714 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
715 ddi_fm_handler_unregister(mlxp->mlx_dip);
716
717 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
718 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
719 pci_ereport_teardown(mlxp->mlx_dip);
720
721 ddi_fm_fini(mlxp->mlx_dip);
722
723 mlxp->mlx_fm_caps = 0;
724 }
725
726 void
727 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
728 {
729 uint64_t ena;
730 char buf[FM_MAX_CLASS];
731
732 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
733 return;
734
735 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
736 ena = fm_ena_generate(0, FM_ENA_FMT1);
737 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
738 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
739 NULL);
740 }
741
742 static int
743 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
744 {
745 /*
746 * As the driver can always deal with an error in any DMA or
747 * access handle, we can just return the fme_status value.
748 */
749 pci_ereport_post(dip, err, NULL);
750 return (err->fme_status);
751 }
752
753 static void
754 mlxcx_fm_init(mlxcx_t *mlxp)
755 {
756 ddi_iblock_cookie_t iblk;
757 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
758 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;
759
760 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
761 DDI_PROP_DONTPASS, "fm_capable", def);
762
763 if (mlxp->mlx_fm_caps < 0) {
764 mlxp->mlx_fm_caps = 0;
765 }
766 mlxp->mlx_fm_caps &= def;
767
768 if (mlxp->mlx_fm_caps == 0)
769 return;
770
771 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
772 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
773 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
774 pci_ereport_setup(mlxp->mlx_dip);
775 }
776 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
777 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
778 (void *)mlxp);
779 }
780 }
781
782 static void
783 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
784 {
785 mlxcx_buffer_t *buf;
786
787 mutex_enter(&s->mlbs_mtx);
788
789 while (!list_is_empty(&s->mlbs_busy))
790 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
791
792 while (!list_is_empty(&s->mlbs_loaned))
793 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
794
795 while ((buf = list_head(&s->mlbs_free)) != NULL)
796 mlxcx_buf_destroy(mlxp, buf);
797
798 list_destroy(&s->mlbs_free);
799 list_destroy(&s->mlbs_busy);
800 list_destroy(&s->mlbs_loaned);
801 mutex_exit(&s->mlbs_mtx);
802
803 cv_destroy(&s->mlbs_free_nonempty);
804 mutex_destroy(&s->mlbs_mtx);
805 }
806
807 static void
808 mlxcx_teardown_bufs(mlxcx_t *mlxp)
809 {
810 mlxcx_buf_shard_t *s;
811
812 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
813 mlxcx_mlbs_teardown(mlxp, s);
814 kmem_free(s, sizeof (mlxcx_buf_shard_t));
815 }
816 list_destroy(&mlxp->mlx_buf_shards);
817
818 kmem_cache_destroy(mlxp->mlx_bufs_cache);
819 }
820
821 static void
822 mlxcx_teardown_pages(mlxcx_t *mlxp)
823 {
824 uint_t nzeros = 0;
825 uint64_t *pas;
826
827 pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
828 KM_SLEEP);
829
830 mutex_enter(&mlxp->mlx_pagemtx);
831
832 while (mlxp->mlx_npages > 0) {
833 int32_t req, ret;
834
835 ASSERT0(avl_is_empty(&mlxp->mlx_pages));
836 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
837
838 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
839 mlxcx_warn(mlxp, "hardware refused to return pages, "
840 "leaking %u remaining pages", mlxp->mlx_npages);
841 goto out;
842 }
843
844 for (int32_t i = 0; i < ret; i++) {
845 mlxcx_dev_page_t *mdp, probe;
846 bzero(&probe, sizeof (probe));
847 probe.mxdp_pa = pas[i];
848
849 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);
850
851 if (mdp != NULL) {
852 avl_remove(&mlxp->mlx_pages, mdp);
853 mlxp->mlx_npages--;
854 mlxcx_dma_free(&mdp->mxdp_dma);
855 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
856 } else {
857 mlxcx_panic(mlxp, "hardware returned a page "
858 "with PA 0x%" PRIx64 " but we have no "
859 "record of giving out such a page", pas[i]);
860 }
861 }
862
863 /*
864 * If no pages were returned, note that fact; we give up after too many.
865 */
866 if (ret == 0) {
867 nzeros++;
868 if (nzeros > mlxcx_reclaim_tries) {
869 mlxcx_warn(mlxp, "hardware refused to return "
870 "pages, leaking %u remaining pages",
871 mlxp->mlx_npages);
872 goto out;
873 }
874 delay(drv_usectohz(mlxcx_reclaim_delay));
875 }
876 }
877
878 avl_destroy(&mlxp->mlx_pages);
879
880 out:
881 mutex_exit(&mlxp->mlx_pagemtx);
882 mutex_destroy(&mlxp->mlx_pagemtx);
883
884 kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
885 }
886
887 static boolean_t
888 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
889 {
890 ddi_device_acc_attr_t acc;
891 ddi_dma_attr_t attr;
892 boolean_t ret;
893 size_t sz, i;
894
895 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
896
897 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
898 mleq->mleq_nents = (1 << mleq->mleq_entshift);
899 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
900 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);
901
902 mlxcx_dma_acc_attr(mlxp, &acc);
903 mlxcx_dma_queue_attr(mlxp, &attr);
904
905 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
906 B_TRUE, sz, B_TRUE);
907 if (!ret) {
908 mlxcx_warn(mlxp, "failed to allocate EQ memory");
909 return (B_FALSE);
910 }
911
912 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;
913
914 for (i = 0; i < mleq->mleq_nents; ++i)
915 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;
916
917 mleq->mleq_state |= MLXCX_EQ_ALLOC;
918
919 return (B_TRUE);
920 }
921
922 static void
923 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
924 {
925 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
926 if (mleq->mleq_state & MLXCX_EQ_CREATED)
927 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
928
929 mlxcx_dma_free(&mleq->mleq_dma);
930 mleq->mleq_ent = NULL;
931
932 mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
933 }
934
935 void
936 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
937 {
938 mlxcx_flow_group_t *fg;
939 mlxcx_flow_entry_t *fe;
940 int i;
941
942 ASSERT(mutex_owned(&ft->mlft_mtx));
943
944 for (i = ft->mlft_nents - 1; i >= 0; --i) {
945 fe = &ft->mlft_ent[i];
946 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
947 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
948 mlxcx_panic(mlxp, "failed to delete flow "
949 "entry %u on table %u", i,
950 ft->mlft_num);
951 }
952 }
953 }
954
955 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
956 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
957 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
958 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
959 mlxcx_panic(mlxp, "failed to destroy flow "
960 "group %u", fg->mlfg_num);
961 }
962 }
963 kmem_free(fg, sizeof (mlxcx_flow_group_t));
964 }
965 list_destroy(&ft->mlft_groups);
966 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
967 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
968 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
969 mlxcx_panic(mlxp, "failed to destroy flow table %u",
970 ft->mlft_num);
971 }
972 }
973 kmem_free(ft->mlft_ent, ft->mlft_entsize);
974 ft->mlft_ent = NULL;
975 mutex_exit(&ft->mlft_mtx);
976 mutex_destroy(&ft->mlft_mtx);
977 kmem_free(ft, sizeof (mlxcx_flow_table_t));
978 }
979
980 static void
981 mlxcx_teardown_ports(mlxcx_t *mlxp)
982 {
983 uint_t i;
984 mlxcx_port_t *p;
985 mlxcx_flow_table_t *ft;
986
987 for (i = 0; i < mlxp->mlx_nports; ++i) {
988 p = &mlxp->mlx_ports[i];
989 if (!(p->mlp_init & MLXCX_PORT_INIT))
990 continue;
991 mutex_enter(&p->mlp_mtx);
992 if ((ft = p->mlp_rx_flow) != NULL) {
993 mutex_enter(&ft->mlft_mtx);
994 /*
995 * teardown_flow_table() will destroy the mutex, so
996 * we don't release it here.
997 */
998 mlxcx_teardown_flow_table(mlxp, ft);
999 }
1000 mutex_exit(&p->mlp_mtx);
1001 mutex_destroy(&p->mlp_mtx);
1002 mutex_destroy(&p->mlx_port_event.mla_mtx);
1003 p->mlx_port_event.mla_mlx = NULL;
1004 p->mlx_port_event.mla_port = NULL;
1005 p->mlp_init &= ~MLXCX_PORT_INIT;
1006 }
1007
1008 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
1009 mlxp->mlx_ports = NULL;
1010 }
1011
1012 static void
1013 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1014 {
1015 mlxcx_work_queue_t *mlwq;
1016
1017 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1018 mlxcx_wq_teardown(mlxp, mlwq);
1019 }
1020 list_destroy(&mlxp->mlx_wqs);
1021 }
1022
1023 static void
1024 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1025 {
1026 mlxcx_completion_queue_t *mlcq;
1027
1028 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1029 mlxcx_cq_teardown(mlxp, mlcq);
1030 }
1031 list_destroy(&mlxp->mlx_cqs);
1032 }
1033
1034 static void
1035 mlxcx_teardown_eqs(mlxcx_t *mlxp)
1036 {
1037 mlxcx_event_queue_t *mleq;
1038 uint_t i;
1039
1040 for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1041 mleq = &mlxp->mlx_eqs[i];
1042 mutex_enter(&mleq->mleq_mtx);
1043 if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
1044 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
1045 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
1046 mlxcx_warn(mlxp, "failed to destroy "
1047 "event queue idx %u eqn %u",
1048 i, mleq->mleq_num);
1049 }
1050 }
1051 if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
1052 mlxcx_eq_rele_dma(mlxp, mleq);
1053 }
1054 mutex_exit(&mleq->mleq_mtx);
1055 }
1056 }
1057
1058 static void
1059 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1060 {
1061 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1062 ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1063 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1064 ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1065 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1066 ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1067 }
1068
1069 static void
1070 mlxcx_teardown(mlxcx_t *mlxp)
1071 {
1072 uint_t i;
1073 dev_info_t *dip = mlxp->mlx_dip;
1074
1075 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1076 /*
1077 * Disable interrupts and let any active vectors quiesce.
1078 */
1079 mlxcx_intr_disable(mlxp);
1080 }
1081
1082 if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
1083 mlxcx_teardown_sensors(mlxp);
1084 mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
1085 }
1086
1087 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
1088 mlxcx_teardown_checktimers(mlxp);
1089 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
1090 }
1091
1092 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
1093 mlxcx_teardown_groups(mlxp);
1094 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
1095 }
1096
1097 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
1098 mlxcx_teardown_wqs(mlxp);
1099 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
1100 }
1101
1102 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
1103 mlxcx_teardown_cqs(mlxp);
1104 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
1105 }
1106
1107 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
1108 mlxcx_teardown_bufs(mlxp);
1109 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
1110 }
1111
1112 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
1113 mlxcx_teardown_ports(mlxp);
1114 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
1115 }
1116
1117 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
1118 mlxcx_teardown_eqs(mlxp);
1119 mlxcx_intr_teardown(mlxp);
1120 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
1121 }
1122
1123 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
1124 if (mlxp->mlx_uar.mlu_allocated) {
1125 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
1126 mlxcx_warn(mlxp, "failed to release UAR");
1127 }
1128 for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
1129 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
1130 }
1131 if (mlxp->mlx_pd.mlpd_allocated &&
1132 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
1133 mlxcx_warn(mlxp, "failed to release PD");
1134 }
1135 if (mlxp->mlx_tdom.mltd_allocated &&
1136 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
1137 mlxcx_warn(mlxp, "failed to release TDOM");
1138 }
1139 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
1140 }
1141
1142 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
1143 if (!mlxcx_cmd_teardown_hca(mlxp)) {
1144 mlxcx_warn(mlxp, "failed to send teardown HCA "
1145 "command during device detach");
1146 }
1147 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
1148 }
1149
1150 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
1151 mlxcx_teardown_pages(mlxp);
1152 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
1153 }
1154
1155 if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
1156 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
1157 mlxp->mlx_npages_req[i].mla_mlx = NULL;
1158 mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
1159 }
1160 taskq_destroy(mlxp->mlx_async_tq);
1161 mlxp->mlx_async_tq = NULL;
1162 mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
1163 }
1164
1165 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
1166 if (!mlxcx_cmd_disable_hca(mlxp)) {
1167 mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
1168 "during device detach");
1169 }
1170 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
1171 }
1172
1173 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
1174 mlxcx_cmd_queue_fini(mlxp);
1175 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
1176 }
1177
1178 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
1179 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
1180 mlxp->mlx_caps = NULL;
1181 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
1182 }
1183
1184 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
1185 ddi_regs_map_free(&mlxp->mlx_regs_handle);
1186 mlxp->mlx_regs_handle = NULL;
1187 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
1188 }
1189
1190 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
1191 pci_config_teardown(&mlxp->mlx_cfg_handle);
1192 mlxp->mlx_cfg_handle = NULL;
1193 mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
1194 }
1195
1196 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
1197 mlxcx_fm_fini(mlxp);
1198 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
1199 }
1200
1201 VERIFY3S(mlxp->mlx_attach, ==, 0);
1202 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
1203 ddi_set_driver_private(dip, NULL);
1204 }
1205
1206 static void
1207 mlxcx_get_model(mlxcx_t *mlxp)
1208 {
1209 uint16_t venid;
1210 uint16_t devid;
1211
1212 venid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_VENID);
1213 if (venid != MLXCX_VENDOR_ID) {
1214 /* Currently, all supported cards have a Mellanox vendor id. */
1215 mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
1216 return;
1217 }
1218
1219 devid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_DEVID);
1220 switch (devid) {
1221 case MLXCX_CX4_DEVID:
1222 case MLXCX_CX4_VF_DEVID:
1223 case MLXCX_CX4_LX_VF_DEVID:
1224 mlxp->mlx_type = MLXCX_DEV_CX4;
1225 break;
1226 case MLXCX_CX5_DEVID:
1227 case MLXCX_CX5_VF_DEVID:
1228 case MLXCX_CX5_EX_DEVID:
1229 case MLXCX_CX5_EX_VF_DEVID:
1230 case MLXCX_CX5_GEN_VF_DEVID:
1231 mlxp->mlx_type = MLXCX_DEV_CX5;
1232 break;
1233 case MLXCX_CX6_DEVID:
1234 case MLXCX_CX6_VF_DEVID:
1235 case MLXCX_CX6_DF_DEVID:
1236 case MLXCX_CX6_LX_DEVID:
1237 mlxp->mlx_type = MLXCX_DEV_CX6;
1238 break;
1239 default:
1240 mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
1241 }
1242 }
1243
1244 static boolean_t
1245 mlxcx_regs_map(mlxcx_t *mlxp)
1246 {
1247 off_t memsize;
1248 int ret;
1249 ddi_device_acc_attr_t da;
1250
1251 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
1252 DDI_SUCCESS) {
1253 mlxcx_warn(mlxp, "failed to get register set size");
1254 return (B_FALSE);
1255 }
1256
1257 /*
1258 * All data in the main BAR is kept in big-endian even though it's a PCI
1259 * device.
1260 */
1261 bzero(&da, sizeof (ddi_device_acc_attr_t));
1262 da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1263 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1264 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1265 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1266 da.devacc_attr_access = DDI_FLAGERR_ACC;
1267 } else {
1268 da.devacc_attr_access = DDI_DEFAULT_ACC;
1269 }
1270
1271 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1272 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1273
1274 if (ret != DDI_SUCCESS) {
1275 mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1276 return (B_FALSE);
1277 }
1278
1279 return (B_TRUE);
1280 }
1281
1282 static boolean_t
1283 mlxcx_check_issi(mlxcx_t *mlxp)
1284 {
1285 uint32_t issi;
1286
1287 if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1288 mlxcx_warn(mlxp, "failed to get ISSI");
1289 return (B_FALSE);
1290 }
1291
1292 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1293 mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1294 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1295 return (B_FALSE);
1296 }
1297
1298 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1299 mlxcx_warn(mlxp, "failed to set ISSI to %u",
1300 MLXCX_CURRENT_ISSI);
1301 return (B_FALSE);
1302 }
1303
1304 return (B_TRUE);
1305 }
1306
1307 boolean_t
1308 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
1309 {
1310 ddi_device_acc_attr_t acc;
1311 ddi_dma_attr_t attr;
1312 int32_t i;
1313 list_t plist;
1314 mlxcx_dev_page_t *mdp;
1315 mlxcx_dev_page_t **pages;
1316 const ddi_dma_cookie_t *ck;
1317
1318 /*
1319 * If there are no pages required, then we're done here.
1320 */
1321 if (npages <= 0) {
1322 *ngiven = 0;
1323 return (B_TRUE);
1324 }
1325
1326 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);
1327
1328 pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);
1329
1330 list_create(&plist, sizeof (mlxcx_dev_page_t),
1331 offsetof(mlxcx_dev_page_t, mxdp_list));
1332
1333 for (i = 0; i < npages; i++) {
1334 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1335 mlxcx_dma_acc_attr(mlxp, &acc);
1336 mlxcx_dma_page_attr(mlxp, &attr);
1337 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1338 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1339 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1340 npages);
1341 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1342 goto cleanup_npages;
1343 }
1344 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1345 mdp->mxdp_pa = ck->dmac_laddress;
1346
1347 list_insert_tail(&plist, mdp);
1348 }
1349
1350 /*
1351 * Now that all of the pages have been allocated, give them to hardware
1352 * in chunks.
1353 */
1354 for (i = 0; i < npages; i++) {
1355 pages[i] = list_remove_head(&plist);
1356 }
1357
1358 if (!mlxcx_cmd_give_pages(mlxp,
1359 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
1360 mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1361 "pages!", npages);
1362 for (i = 0; i < npages; i++) {
1363 list_insert_tail(&plist, pages[i]);
1364 }
1365 goto cleanup_npages;
1366 }
1367
1368 mutex_enter(&mlxp->mlx_pagemtx);
1369 for (i = 0; i < npages; i++) {
1370 avl_add(&mlxp->mlx_pages, pages[i]);
1371 }
1372 mlxp->mlx_npages += npages;
1373 mutex_exit(&mlxp->mlx_pagemtx);
1374
1375 list_destroy(&plist);
1376 kmem_free(pages, sizeof (*pages) * npages);
1377
1378 *ngiven = npages;
1379
1380 return (B_TRUE);
1381
1382 cleanup_npages:
1383 kmem_free(pages, sizeof (*pages) * npages);
1384 while ((mdp = list_remove_head(&plist)) != NULL) {
1385 mlxcx_dma_free(&mdp->mxdp_dma);
1386 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1387 }
1388 list_destroy(&plist);
1389 return (B_FALSE);
1390 }
1391
1392 static boolean_t
1393 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1394 {
1395 int32_t npages, given;
1396
1397 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1398 mlxcx_warn(mlxp, "failed to determine boot pages");
1399 return (B_FALSE);
1400 }
1401
1402 while (npages > 0) {
1403 if (!mlxcx_give_pages(mlxp, npages, &given))
1404 return (B_FALSE);
1405
1406 npages -= given;
1407 }
1408
1409 return (B_TRUE);
1410 }
1411
1412 static int
1413 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1414 {
1415 mlxcx_t *mlxp = cookie;
1416 mlxcx_buffer_t *b = arg;
1417
1418 bzero(b, sizeof (mlxcx_buffer_t));
1419 b->mlb_mlx = mlxp;
1420 b->mlb_state = MLXCX_BUFFER_INIT;
1421 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1422 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1423
1424 return (0);
1425 }
1426
1427 static void
1428 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1429 {
1430 mlxcx_t *mlxp = cookie;
1431 mlxcx_buffer_t *b = arg;
1432 VERIFY3P(b->mlb_mlx, ==, mlxp);
1433 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1434 list_destroy(&b->mlb_tx_chain);
1435 }
1436
1437 mlxcx_buf_shard_t *
1438 mlxcx_mlbs_create(mlxcx_t *mlxp)
1439 {
1440 mlxcx_buf_shard_t *s;
1441
1442 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1443
1444 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1445 DDI_INTR_PRI(mlxp->mlx_intr_pri));
1446 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1447 offsetof(mlxcx_buffer_t, mlb_entry));
1448 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1449 offsetof(mlxcx_buffer_t, mlb_entry));
1450 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1451 offsetof(mlxcx_buffer_t, mlb_entry));
1452 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1453
1454 list_insert_tail(&mlxp->mlx_buf_shards, s);
1455
1456 return (s);
1457 }
1458
1459 static boolean_t
1460 mlxcx_setup_bufs(mlxcx_t *mlxp)
1461 {
1462 char namebuf[KSTAT_STRLEN];
1463
1464 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1465 ddi_get_instance(mlxp->mlx_dip));
1466 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1467 sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1468 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1469 NULL, mlxp, NULL, 0);
1470
1471 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1472 offsetof(mlxcx_buf_shard_t, mlbs_entry));
1473
1474 return (B_TRUE);
1475 }
1476
1477 static void
1478 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1479 const char *state, uint8_t statenum)
1480 {
1481 uint64_t ena;
1482 char buf[FM_MAX_CLASS];
1483
1484 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1485 return;
1486
1487 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1488 MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1489 ena = fm_ena_generate(0, FM_ENA_FMT1);
1490
1491 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1492 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1493 "state", DATA_TYPE_STRING, state,
1494 "state_num", DATA_TYPE_UINT8, statenum,
1495 "qtype", DATA_TYPE_STRING, qtype,
1496 "qnum", DATA_TYPE_UINT32, qnum,
1497 NULL);
1498 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1499 }
1500
1501 /*
1502 * The following set of routines are for monitoring the health of
1503 * event, completion and work queues. They run infrequently peeking at
1504 * the structs to catch stalls and inconsistent state.
1505 *
1506 * They peek at the structs *without* acquiring locks - we don't want
1507 * to impede flow of data. Driver start up and shutdown semantics
1508 * guarantee the structs are present and won't disappear underneath
1509 * these routines.
1510 *
1511 * As previously noted, the routines peek at active data in the structs and
1512 * they will store some values for comparison on next invocation. To
1513 * maintain integrity of the saved values, these values are only modified
1514 * within these routines.
1515 */
1516 static void
1517 mlxcx_eq_check(void *arg)
1518 {
1519 mlxcx_t *mlxp = (mlxcx_t *)arg;
1520 mlxcx_event_queue_t *eq;
1521 mlxcx_eventq_ctx_t ctx;
1522 const char *str;
1523
1524 uint_t i;
1525
1526 for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1527 eq = &mlxp->mlx_eqs[i];
1528
1529 if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
1530 continue;
1531
1532 /*
1533 * If the event queue was successfully created in the HCA,
1534 * then initialization and shutdown sequences guarantee
1535 * the queue exists.
1536 */
1537 ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);
1538
1539 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
1540 continue;
1541
1542 str = "???";
1543 switch (ctx.mleqc_status) {
1544 case MLXCX_EQ_STATUS_OK:
1545 break;
1546 case MLXCX_EQ_STATUS_WRITE_FAILURE:
1547 str = "WRITE_FAILURE";
1548 break;
1549 }
1550
1551 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1552 mlxcx_fm_qstate_ereport(mlxp, "event",
1553 eq->mleq_num, str, ctx.mleqc_status);
1554 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1555 eq->mleq_intr_index, ctx.mleqc_status, str);
1556 }
1557
1558 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1559 (eq->mleq_state & MLXCX_EQ_ARMED)) {
1560 if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1561 ++eq->mleq_check_disarm_cnt >= 3) {
1562 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1563 mlxcx_warn(mlxp, "EQ %u isn't armed",
1564 eq->mleq_intr_index);
1565 }
1566 eq->mleq_check_disarm_cc = eq->mleq_cc;
1567 } else {
1568 eq->mleq_check_disarm_cc = 0;
1569 eq->mleq_check_disarm_cnt = 0;
1570 }
1571 }
1572 }
1573
1574 static void
1575 mlxcx_cq_check(void *arg)
1576 {
1577 mlxcx_t *mlxp = (mlxcx_t *)arg;
1578 mlxcx_completion_queue_t *cq;
1579 mlxcx_completionq_ctx_t ctx;
1580 const char *str, *type;
1581 uint_t v;
1582
1583 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1584 cq = list_next(&mlxp->mlx_cqs, cq)) {
1585
1586 if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
1587 continue;
1588
1589 /*
1590 * If the completion queue was successfully created in the HCA,
1591 * then initialization and shutdown sequences guarantee
1592 * the queue exists.
1593 */
1594 ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
1595 ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);
1596
1597 if (cq->mlcq_fm_repd_qstate)
1598 continue;
1599
1600 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
1601 continue;
1602
1603 if (cq->mlcq_wq != NULL) {
1604 mlxcx_work_queue_t *wq = cq->mlcq_wq;
1605 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1606 type = "rx ";
1607 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1608 type = "tx ";
1609 else
1610 type = "";
1611 } else {
1612 type = "";
1613 }
1614
1615 str = "???";
1616 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1617 switch (v) {
1618 case MLXCX_CQC_STATUS_OK:
1619 break;
1620 case MLXCX_CQC_STATUS_OVERFLOW:
1621 str = "OVERFLOW";
1622 break;
1623 case MLXCX_CQC_STATUS_WRITE_FAIL:
1624 str = "WRITE_FAIL";
1625 break;
1626 case MLXCX_CQC_STATUS_INVALID:
1627 str = "INVALID";
1628 break;
1629 }
1630
1631 if (v != MLXCX_CQC_STATUS_OK) {
1632 mlxcx_fm_qstate_ereport(mlxp, "completion",
1633 cq->mlcq_num, str, v);
1634 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1635 type, cq->mlcq_num, v, str);
1636 cq->mlcq_fm_repd_qstate = B_TRUE;
1637 }
1638
1639 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1640 if (v != MLXCX_CQC_STATE_ARMED &&
1641 (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1642 !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1643 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1644 ++cq->mlcq_check_disarm_cnt >= 3) {
1645 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1646 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1647 type, cq->mlcq_num, cq);
1648 }
1649 cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1650 } else {
1651 cq->mlcq_check_disarm_cnt = 0;
1652 cq->mlcq_check_disarm_cc = 0;
1653 }
1654 }
1655 }
1656
1657 void
1658 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1659 {
1660 mlxcx_sq_ctx_t ctx;
1661 mlxcx_sq_state_t state;
1662
1663 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1664 return;
1665
1666 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1667 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1668 switch (state) {
1669 case MLXCX_SQ_STATE_RST:
1670 if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1671 mlxcx_fm_qstate_ereport(mlxp, "send",
1672 sq->mlwq_num, "RST", state);
1673 sq->mlwq_fm_repd_qstate = B_TRUE;
1674 }
1675 break;
1676 case MLXCX_SQ_STATE_RDY:
1677 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1678 mlxcx_fm_qstate_ereport(mlxp, "send",
1679 sq->mlwq_num, "RDY", state);
1680 sq->mlwq_fm_repd_qstate = B_TRUE;
1681 }
1682 break;
1683 case MLXCX_SQ_STATE_ERR:
1684 mlxcx_fm_qstate_ereport(mlxp, "send",
1685 sq->mlwq_num, "ERR", state);
1686 sq->mlwq_fm_repd_qstate = B_TRUE;
1687 break;
1688 default:
1689 mlxcx_fm_qstate_ereport(mlxp, "send",
1690 sq->mlwq_num, "???", state);
1691 sq->mlwq_fm_repd_qstate = B_TRUE;
1692 break;
1693 }
1694 }
1695
1696 void
1697 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1698 {
1699 mlxcx_rq_ctx_t ctx;
1700 mlxcx_rq_state_t state;
1701
1702
1703 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1704 return;
1705
1706 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1707 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1708 switch (state) {
1709 case MLXCX_RQ_STATE_RST:
1710 if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1711 mlxcx_fm_qstate_ereport(mlxp, "receive",
1712 rq->mlwq_num, "RST", state);
1713 rq->mlwq_fm_repd_qstate = B_TRUE;
1714 }
1715 break;
1716 case MLXCX_RQ_STATE_RDY:
1717 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1718 mlxcx_fm_qstate_ereport(mlxp, "receive",
1719 rq->mlwq_num, "RDY", state);
1720 rq->mlwq_fm_repd_qstate = B_TRUE;
1721 }
1722 break;
1723 case MLXCX_RQ_STATE_ERR:
1724 mlxcx_fm_qstate_ereport(mlxp, "receive",
1725 rq->mlwq_num, "ERR", state);
1726 rq->mlwq_fm_repd_qstate = B_TRUE;
1727 break;
1728 default:
1729 mlxcx_fm_qstate_ereport(mlxp, "receive",
1730 rq->mlwq_num, "???", state);
1731 rq->mlwq_fm_repd_qstate = B_TRUE;
1732 break;
1733 }
1734 }
1735
1736 static void
1737 mlxcx_wq_check(void *arg)
1738 {
1739 mlxcx_t *mlxp = (mlxcx_t *)arg;
1740 mlxcx_work_queue_t *wq;
1741
1742 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1743 wq = list_next(&mlxp->mlx_wqs, wq)) {
1744
1745 if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
1746 continue;
1747
1748 /*
1749 * If the work queue was successfully created in the HCA,
1750 * then initialization and shutdown sequences guarantee
1751 * the queue exists.
1752 */
1753 ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
1754 ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);
1755
1756 if (wq->mlwq_fm_repd_qstate)
1757 continue;
1758
1759 switch (wq->mlwq_type) {
1760 case MLXCX_WQ_TYPE_SENDQ:
1761 mlxcx_check_sq(mlxp, wq);
1762 break;
1763 case MLXCX_WQ_TYPE_RECVQ:
1764 mlxcx_check_rq(mlxp, wq);
1765 break;
1766 }
1767 }
1768 }
1769
1770 static boolean_t
1771 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1772 {
1773 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1774 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1775 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1776 DDI_IPL_0);
1777 }
1778 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1779 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1780 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1781 DDI_IPL_0);
1782 }
1783 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1784 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1785 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1786 DDI_IPL_0);
1787 }
1788 return (B_TRUE);
1789 }
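
/*
 * As an illustrative sketch only (the interval below is hypothetical, not
 * a driver default): with mldp_eq_check_interval_sec set to 30, the EQ
 * timer registration above is effectively
 *
 *	ddi_periodic_add(mlxcx_eq_check, mlxp, 30 * NANOSEC, DDI_IPL_0);
 *
 * i.e. mlxcx_eq_check() runs every 30 seconds at the base priority level,
 * and likewise for the CQ and WQ checks. Setting one of these properties
 * to 0 disables that particular check timer entirely.
 */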
1790
1791 int
1792 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1793 {
1794 const mlxcx_flow_entry_t *left = arg0;
1795 const mlxcx_flow_entry_t *right = arg1;
1796 int bcmpr;
1797
1798 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1799 sizeof (left->mlfe_dmac));
1800 if (bcmpr < 0)
1801 return (-1);
1802 if (bcmpr > 0)
1803 return (1);
1804 if (left->mlfe_vid < right->mlfe_vid)
1805 return (-1);
1806 if (left->mlfe_vid > right->mlfe_vid)
1807 return (1);
1808 return (0);
1809 }
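
/*
 * A small illustrative example of the ordering this comparator produces
 * (the MAC addresses and VIDs here are made up): entries sort first by
 * DMAC bytes, then by VID, so
 *
 *	{ 00:11:22:33:44:55, vid 0 }
 *	{ 00:11:22:33:44:55, vid 100 }
 *	{ 66:77:88:99:aa:bb, vid 5 }
 *
 * appear in that order. This is the comparator handed to avl_create()
 * for the per-port mlp_dmac_fe tree in mlxcx_setup_ports() below.
 */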
1810
1811 int
1812 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1813 {
1814 const mlxcx_group_mac_t *left = arg0;
1815 const mlxcx_group_mac_t *right = arg1;
1816 int bcmpr;
1817
1818 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1819 sizeof (left->mlgm_mac));
1820 if (bcmpr < 0)
1821 return (-1);
1822 if (bcmpr > 0)
1823 return (1);
1824 return (0);
1825 }
1826
1827 int
1828 mlxcx_page_compare(const void *arg0, const void *arg1)
1829 {
1830 const mlxcx_dev_page_t *p0 = arg0;
1831 const mlxcx_dev_page_t *p1 = arg1;
1832
1833 if (p0->mxdp_pa < p1->mxdp_pa)
1834 return (-1);
1835 if (p0->mxdp_pa > p1->mxdp_pa)
1836 return (1);
1837 return (0);
1838 }
1839
1840 static boolean_t
1841 mlxcx_setup_ports(mlxcx_t *mlxp)
1842 {
1843 uint_t i, j;
1844 mlxcx_port_t *p;
1845 mlxcx_flow_table_t *ft;
1846 mlxcx_flow_group_t *fg;
1847 mlxcx_flow_entry_t *fe;
1848
1849 VERIFY3U(mlxp->mlx_nports, >, 0);
1850 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1851 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1852
1853 for (i = 0; i < mlxp->mlx_nports; ++i) {
1854 p = &mlxp->mlx_ports[i];
1855 p->mlp_num = i;
1856 p->mlx_port_event.mla_mlx = mlxp;
1857 p->mlx_port_event.mla_port = p;
1858 mutex_init(&p->mlx_port_event.mla_mtx, NULL,
1859 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
1860 p->mlp_init |= MLXCX_PORT_INIT;
1861 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1862 DDI_INTR_PRI(mlxp->mlx_intr_pri));
1863 mutex_enter(&p->mlp_mtx);
1864 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1865 mutex_exit(&p->mlp_mtx);
1866 goto err;
1867 }
1868 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1869 mutex_exit(&p->mlp_mtx);
1870 goto err;
1871 }
1872 if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1873 mutex_exit(&p->mlp_mtx);
1874 goto err;
1875 }
1876 if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1877 mutex_exit(&p->mlp_mtx);
1878 goto err;
1879 }
1880 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1881 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1882 mutex_exit(&p->mlp_mtx);
1883 goto err;
1884 }
1885 if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
1886 mutex_exit(&p->mlp_mtx);
1887 goto err;
1888 }
1889 p->mlp_fec_requested = LINK_FEC_AUTO;
1890
1891 mutex_exit(&p->mlp_mtx);
1892 }
1893
1894 for (i = 0; i < mlxp->mlx_nports; ++i) {
1895 p = &mlxp->mlx_ports[i];
1896 mutex_enter(&p->mlp_mtx);
1897 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1898 KM_SLEEP));
1899 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1900 DDI_INTR_PRI(mlxp->mlx_intr_pri));
1901
1902 mutex_enter(&ft->mlft_mtx);
1903
1904 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1905 ft->mlft_port = p;
1906 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1907 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1908 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1909 ft->mlft_nents = (1 << ft->mlft_entshift);
1910 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1911 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1912 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1913 offsetof(mlxcx_flow_group_t, mlfg_entry));
1914
1915 for (j = 0; j < ft->mlft_nents; ++j) {
1916 ft->mlft_ent[j].mlfe_table = ft;
1917 ft->mlft_ent[j].mlfe_index = j;
1918 }
1919
1920 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1921 mutex_exit(&ft->mlft_mtx);
1922 mutex_exit(&p->mlp_mtx);
1923 goto err;
1924 }
1925
1926 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1927 mutex_exit(&ft->mlft_mtx);
1928 mutex_exit(&p->mlp_mtx);
1929 goto err;
1930 }
1931
1932 /*
1933 * We match broadcast at the top of the root flow table, then
1934 * all multicast/unicast MACs, then the promisc entry is down
1935 * the very bottom.
1936 *
1937 * This way when promisc is on, that entry simply catches any
1938 * remaining traffic that earlier flows haven't matched.
1939 */
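		/*
		 * Sketch of the resulting layout (illustrative only), for a
		 * root table with N = mlft_nents entries:
		 *
		 *	entry 0			broadcast ff:ff:ff:ff:ff:ff
		 *				(mlp_bcast, group of 1)
		 *	entries 1 .. N-2	unicast/multicast DMACs
		 *				(mlp_umcast, group of N-2)
		 *	entry N-1		promisc catch-all, no DMAC match
		 *				(mlp_promisc, group of 1)
		 */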
1940 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1941 list_insert_tail(&ft->mlft_groups, fg);
1942 fg->mlfg_table = ft;
1943 fg->mlfg_size = 1;
1944 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1945 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1946 mutex_exit(&ft->mlft_mtx);
1947 mutex_exit(&p->mlp_mtx);
1948 goto err;
1949 }
1950 p->mlp_bcast = fg;
1951 fe = list_head(&fg->mlfg_entries);
1952 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1953 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1954 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1955
1956 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1957 list_insert_tail(&ft->mlft_groups, fg);
1958 fg->mlfg_table = ft;
1959 fg->mlfg_size = ft->mlft_nents - 2;
1960 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1961 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1962 mutex_exit(&ft->mlft_mtx);
1963 mutex_exit(&p->mlp_mtx);
1964 goto err;
1965 }
1966 p->mlp_umcast = fg;
1967
1968 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1969 list_insert_tail(&ft->mlft_groups, fg);
1970 fg->mlfg_table = ft;
1971 fg->mlfg_size = 1;
1972 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1973 mutex_exit(&ft->mlft_mtx);
1974 mutex_exit(&p->mlp_mtx);
1975 goto err;
1976 }
1977 p->mlp_promisc = fg;
1978 fe = list_head(&fg->mlfg_entries);
1979 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1980 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1981
1982 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1983 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1984 mlfe_dmac_entry));
1985
1986 mutex_exit(&ft->mlft_mtx);
1987 mutex_exit(&p->mlp_mtx);
1988 }
1989
1990 return (B_TRUE);
1991
1992 err:
1993 mlxcx_teardown_ports(mlxp);
1994 return (B_FALSE);
1995 }
1996
1997 void
1998 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1999 {
2000 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2001 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2002 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2003 mlxcx_flow_entry_t *fe;
2004 mlxcx_group_vlan_t *v;
2005
2006 ASSERT(mutex_owned(&g->mlg_mtx));
2007
2008 mutex_enter(&ft->mlft_mtx);
2009
2010 if (!list_is_empty(&g->mlg_rx_vlans)) {
2011 fe = list_head(&dfg->mlfg_entries);
2012 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2013 }
2014
2015 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
2016 fe = v->mlgv_fe;
2017 ASSERT3P(fe->mlfe_table, ==, ft);
2018 ASSERT3P(fe->mlfe_group, ==, fg);
2019 kmem_free(v, sizeof (mlxcx_group_vlan_t));
2020
2021 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2022 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2023 }
2024
2025 mutex_exit(&ft->mlft_mtx);
2026 }
2027
2028 boolean_t
2029 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
2030 boolean_t tagged, uint16_t vid)
2031 {
2032 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2033 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2034 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2035 mlxcx_flow_entry_t *fe;
2036 mlxcx_group_vlan_t *v;
2037 boolean_t found = B_FALSE;
2038
2039 ASSERT(mutex_owned(&g->mlg_mtx));
2040
2041 mutex_enter(&ft->mlft_mtx);
2042
2043 for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2044 v = list_next(&g->mlg_rx_vlans, v)) {
2045 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2046 found = B_TRUE;
2047 break;
2048 }
2049 }
2050 if (!found) {
2051 mutex_exit(&ft->mlft_mtx);
2052 return (B_FALSE);
2053 }
2054
2055 list_remove(&g->mlg_rx_vlans, v);
2056
2057 /*
2058 * If this is the last VLAN entry, we have to go back to accepting
2059 * any VLAN (which means re-enabling the default entry).
2060 *
2061 * Do this before we remove the flow entry for the last specific
2062 * VLAN so that we don't lose any traffic in the transition.
2063 */
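	/*
	 * Sketch of the transition when the last VLAN is removed
	 * (illustrative): with only { vid 5 } left on the group,
	 *
	 *	1. re-add the default (match-any) entry	-> both entries match
	 *	2. delete the vid 5 entry		-> default entry only
	 *
	 * so there is never a window where neither entry exists and traffic
	 * would be dropped.
	 */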
2064 if (list_is_empty(&g->mlg_rx_vlans)) {
2065 fe = list_head(&dfg->mlfg_entries);
2066 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2067 list_insert_tail(&g->mlg_rx_vlans, v);
2068 mutex_exit(&ft->mlft_mtx);
2069 return (B_FALSE);
2070 }
2071 }
2072
2073 fe = v->mlgv_fe;
2074 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
2075 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
2076 ASSERT3P(fe->mlfe_table, ==, ft);
2077 ASSERT3P(fe->mlfe_group, ==, fg);
2078
2079 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
2080 list_insert_tail(&g->mlg_rx_vlans, v);
2081 fe = list_head(&dfg->mlfg_entries);
2082 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
2083 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2084 }
2085 mutex_exit(&ft->mlft_mtx);
2086 return (B_FALSE);
2087 }
2088
2089 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2090
2091 kmem_free(v, sizeof (mlxcx_group_vlan_t));
2092
2093 mutex_exit(&ft->mlft_mtx);
2094 return (B_TRUE);
2095 }
2096
2097 boolean_t
2098 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
2099 uint16_t vid)
2100 {
2101 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
2102 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
2103 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
2104 mlxcx_flow_entry_t *fe;
2105 mlxcx_group_vlan_t *v;
2106 boolean_t found = B_FALSE;
2107 boolean_t first = B_FALSE;
2108
2109 ASSERT(mutex_owned(&g->mlg_mtx));
2110
2111 mutex_enter(&ft->mlft_mtx);
2112
2113 for (v = list_head(&g->mlg_rx_vlans); v != NULL;
2114 v = list_next(&g->mlg_rx_vlans, v)) {
2115 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
2116 mutex_exit(&ft->mlft_mtx);
2117 return (B_TRUE);
2118 }
2119 }
2120 if (list_is_empty(&g->mlg_rx_vlans))
2121 first = B_TRUE;
2122
2123 for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2124 fe = list_next(&fg->mlfg_entries, fe)) {
2125 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2126 found = B_TRUE;
2127 break;
2128 }
2129 }
2130 if (!found) {
2131 mutex_exit(&ft->mlft_mtx);
2132 return (B_FALSE);
2133 }
2134
2135 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
2136 v->mlgv_fe = fe;
2137 v->mlgv_tagged = tagged;
2138 v->mlgv_vid = vid;
2139
2140 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2141 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2142 fe->mlfe_vid = vid;
2143 if (tagged) {
2144 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
2145 } else {
2146 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
2147 }
2148
2149 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2150 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2151 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2152 kmem_free(v, sizeof (mlxcx_group_vlan_t));
2153 mutex_exit(&ft->mlft_mtx);
2154 return (B_FALSE);
2155 }
2156
2157 list_insert_tail(&g->mlg_rx_vlans, v);
2158
2159 /*
2160 * If the vlan list was empty for this group before adding this one,
2161 * then we no longer want the "default" entry to allow all VLANs
2162 * through.
2163 */
2164 if (first) {
2165 fe = list_head(&dfg->mlfg_entries);
2166 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2167 }
2168
2169 mutex_exit(&ft->mlft_mtx);
2170 return (B_TRUE);
2171 }
2172
2173 void
2174 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
2175 mlxcx_ring_group_t *group)
2176 {
2177 mlxcx_flow_entry_t *fe;
2178 mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2179 mlxcx_group_mac_t *gm, *ngm;
2180
2181 ASSERT(mutex_owned(&port->mlp_mtx));
2182 ASSERT(mutex_owned(&group->mlg_mtx));
2183
2184 mutex_enter(&ft->mlft_mtx);
2185
2186 gm = avl_first(&group->mlg_rx_macs);
2187 for (; gm != NULL; gm = ngm) {
2188 ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
2189
2190 ASSERT3P(gm->mlgm_group, ==, group);
2191 fe = gm->mlgm_fe;
2192 ASSERT3P(fe->mlfe_table, ==, ft);
2193
2194 avl_remove(&group->mlg_rx_macs, gm);
2195 list_remove(&fe->mlfe_ring_groups, gm);
2196 kmem_free(gm, sizeof (mlxcx_group_mac_t));
2197
2198 fe->mlfe_ndest = 0;
2199 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2200 gm = list_next(&fe->mlfe_ring_groups, gm)) {
2201 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2202 gm->mlgm_group->mlg_rx_vlan_ft;
2203 }
2204 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2205
2206 if (fe->mlfe_ndest > 0) {
2207 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2208 continue;
2209 }
2210
2211 /*
2212 * There are no more ring groups left for this MAC (it wasn't
2213 * attached to any other groups since ndest == 0), so clean up
2214 * its flow entry.
2215 */
2216 avl_remove(&port->mlp_dmac_fe, fe);
2217 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2218 list_destroy(&fe->mlfe_ring_groups);
2219 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2220 }
2221
2222 mutex_exit(&ft->mlft_mtx);
2223 }
2224
2225 boolean_t
2226 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2227 mlxcx_ring_group_t *group, const uint8_t *macaddr)
2228 {
2229 mlxcx_flow_entry_t *fe;
2230 mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2231 mlxcx_group_mac_t *gm, probe;
2232
2233 ASSERT(mutex_owned(&port->mlp_mtx));
2234 ASSERT(mutex_owned(&group->mlg_mtx));
2235
2236 bzero(&probe, sizeof (probe));
2237 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2238
2239 mutex_enter(&ft->mlft_mtx);
2240
2241 gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2242 if (gm == NULL) {
2243 mutex_exit(&ft->mlft_mtx);
2244 return (B_FALSE);
2245 }
2246 ASSERT3P(gm->mlgm_group, ==, group);
2247 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2248
2249 fe = gm->mlgm_fe;
2250 ASSERT3P(fe->mlfe_table, ==, ft);
2251 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2252
2253 list_remove(&fe->mlfe_ring_groups, gm);
2254 avl_remove(&group->mlg_rx_macs, gm);
2255 kmem_free(gm, sizeof (mlxcx_group_mac_t));
2256
2257 fe->mlfe_ndest = 0;
2258 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2259 gm = list_next(&fe->mlfe_ring_groups, gm)) {
2260 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2261 gm->mlgm_group->mlg_rx_vlan_ft;
2262 }
2263 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2264
2265 if (fe->mlfe_ndest > 0) {
2266 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2267 mutex_exit(&ft->mlft_mtx);
2268 return (B_FALSE);
2269 }
2270 mutex_exit(&ft->mlft_mtx);
2271 return (B_TRUE);
2272 }
2273
2274 /*
2275 * There are no more ring groups left for this MAC (it wasn't attached
2276 * to any other groups since ndest == 0), so clean up its flow entry.
2277 */
2278 avl_remove(&port->mlp_dmac_fe, fe);
2279 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2280 list_destroy(&fe->mlfe_ring_groups);
2281
2282 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2283
2284 mutex_exit(&ft->mlft_mtx);
2285
2286 return (B_TRUE);
2287 }
2288
2289 boolean_t
2290 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2291 mlxcx_ring_group_t *group, const uint8_t *macaddr)
2292 {
2293 mlxcx_flow_group_t *fg;
2294 mlxcx_flow_entry_t *fe, probe;
2295 mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2296 mlxcx_group_mac_t *gm;
2297 boolean_t found = B_FALSE;
2298
2299 ASSERT(mutex_owned(&port->mlp_mtx));
2300 ASSERT(mutex_owned(&group->mlg_mtx));
2301
2302 bzero(&probe, sizeof (probe));
2303 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2304
2305 mutex_enter(&ft->mlft_mtx);
2306
2307 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2308
2309 if (fe == NULL) {
2310 fg = port->mlp_umcast;
2311 for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2312 fe = list_next(&fg->mlfg_entries, fe)) {
2313 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2314 found = B_TRUE;
2315 break;
2316 }
2317 }
2318 if (!found) {
2319 mutex_exit(&ft->mlft_mtx);
2320 return (B_FALSE);
2321 }
2322 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2323 offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2324 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2325 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2326 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2327
2328 avl_add(&port->mlp_dmac_fe, fe);
2329 }
2330
2331 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2332 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2333
2334 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2335 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2336 if (--fe->mlfe_ndest == 0) {
2337 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2338 }
2339 mutex_exit(&ft->mlft_mtx);
2340 return (B_FALSE);
2341 }
2342
2343 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2344 gm->mlgm_group = group;
2345 gm->mlgm_fe = fe;
2346 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2347 avl_add(&group->mlg_rx_macs, gm);
2348 list_insert_tail(&fe->mlfe_ring_groups, gm);
2349
2350 mutex_exit(&ft->mlft_mtx);
2351
2352 return (B_TRUE);
2353 }
2354
2355 boolean_t
2356 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2357 mlxcx_flow_group_t *fg)
2358 {
2359 mlxcx_flow_entry_t *fe;
2360 uint_t i, idx;
2361
2362 ASSERT(mutex_owned(&ft->mlft_mtx));
2363 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2364 ASSERT3P(fg->mlfg_table, ==, ft);
2365
2366 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2367 return (B_FALSE);
2368 fg->mlfg_start_idx = ft->mlft_next_ent;
2369
2370 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2371 return (B_FALSE);
2372 }
2373
2374 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2375 offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2376 for (i = 0; i < fg->mlfg_size; ++i) {
2377 idx = fg->mlfg_start_idx + i;
2378 fe = &ft->mlft_ent[idx];
2379 fe->mlfe_group = fg;
2380 list_insert_tail(&fg->mlfg_entries, fe);
2381 }
2382 fg->mlfg_avail = fg->mlfg_size;
2383 ft->mlft_next_ent += fg->mlfg_size;
2384
2385 return (B_TRUE);
2386 }
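
/*
 * Illustrative usage sketch (the sizes here are hypothetical): groups are
 * carved out of a table's entry array in order of creation, e.g. for a
 * freshly created table ft with mlft_nents == 8:
 *
 *	fg_a->mlfg_size = 1;	mlxcx_setup_flow_group(...)	=> entry [0]
 *	fg_b->mlfg_size = 6;	mlxcx_setup_flow_group(...)	=> entries [1..6]
 *	fg_c->mlfg_size = 1;	mlxcx_setup_flow_group(...)	=> entry [7]
 *
 * A further group of any non-zero size would then fail the mlft_next_ent
 * bounds check above and return B_FALSE.
 */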
2387
2388 static boolean_t
2389 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
2390 {
2391 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];
2392
2393 mutex_enter(&mleq->mleq_mtx);
2394 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2395 /* mlxcx_teardown_eqs() will clean this up */
2396 mutex_exit(&mleq->mleq_mtx);
2397 return (B_FALSE);
2398 }
2399 mleq->mleq_mlx = mlxp;
2400 mleq->mleq_uar = &mlxp->mlx_uar;
2401 mleq->mleq_events = events;
2402 mleq->mleq_intr_index = vec;
2403
2404 if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2405 /* mlxcx_teardown_eqs() will clean this up */
2406 mutex_exit(&mleq->mleq_mtx);
2407 return (B_FALSE);
2408 }
2409
2410 if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
2411 /*
2412 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2413 * eq_rele_dma
2414 */
2415 mutex_exit(&mleq->mleq_mtx);
2416 return (B_FALSE);
2417 }
2418 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2419 mleq->mleq_state |= MLXCX_EQ_ATTACHING;
2420 mlxcx_arm_eq(mlxp, mleq);
2421 mutex_exit(&mleq->mleq_mtx);
2422
2423 return (B_TRUE);
2424 }
2425
2426 static void
2427 mlxcx_eq_set_attached(mlxcx_t *mlxp)
2428 {
2429 uint_t vec;
2430 mlxcx_event_queue_t *mleq;
2431
2432 for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
2433 mleq = &mlxp->mlx_eqs[vec];
2434
2435 mutex_enter(&mleq->mleq_mtx);
2436 mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
2437 mutex_exit(&mleq->mleq_mtx);
2438 }
2439 }
2440
2441 static boolean_t
2442 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2443 {
2444 boolean_t ret;
2445
2446 ret = mlxcx_setup_eq(mlxp, 0,
2447 (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2448 (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2449 (1ULL << MLXCX_EVENT_PORT_STATE) |
2450 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2451 (1ULL << MLXCX_EVENT_PORT_MODULE) |
2452 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2453 (1ULL << MLXCX_EVENT_LAST_WQE) |
2454 (1ULL << MLXCX_EVENT_CQ_ERROR) |
2455 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2456 (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2457 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2458 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2459 (1ULL << MLXCX_EVENT_NIC_VPORT) |
2460 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2461
2462 if (ret)
2463 mlxcx_cmd_eq_enable(mlxp);
2464
2465 return (ret);
2466 }
2467
2468 int
2469 mlxcx_cq_compare(const void *arg0, const void *arg1)
2470 {
2471 const mlxcx_completion_queue_t *left = arg0;
2472 const mlxcx_completion_queue_t *right = arg1;
2473
2474 if (left->mlcq_num < right->mlcq_num) {
2475 return (-1);
2476 }
2477 if (left->mlcq_num > right->mlcq_num) {
2478 return (1);
2479 }
2480 return (0);
2481 }
2482
2483 static boolean_t
2484 mlxcx_setup_eqs(mlxcx_t *mlxp)
2485 {
2486 uint_t i;
2487 mlxcx_event_queue_t *mleq;
2488
2489 ASSERT3S(mlxp->mlx_intr_count, >, 0);
2490
2491 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
2492 mleq = &mlxp->mlx_eqs[i];
2493 mutex_enter(&mleq->mleq_mtx);
2494 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2495 mutex_exit(&mleq->mleq_mtx);
2496 return (B_FALSE);
2497 }
2498 mleq->mleq_uar = &mlxp->mlx_uar;
2499 if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2500 /* mlxcx_teardown() will handle calling eq_rele_dma */
2501 mutex_exit(&mleq->mleq_mtx);
2502 return (B_FALSE);
2503 }
2504 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2505 !mlxcx_cmd_set_int_mod(mlxp, i,
2506 mlxp->mlx_props.mldp_intrmod_period_usec)) {
2507 mutex_exit(&mleq->mleq_mtx);
2508 return (B_FALSE);
2509 }
2510 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2511 mutex_exit(&mleq->mleq_mtx);
2512 return (B_FALSE);
2513 }
2514 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
2515 mlxcx_arm_eq(mlxp, mleq);
2516 mutex_exit(&mleq->mleq_mtx);
2517 }
2518
2519 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;
2520
2521 return (B_TRUE);
2522 }
2523
2524 /*
2525 * A more recent ConnectX part will have the Ports Capability Mask (PCAM)
2525 * register. Explore it and note things here.
2527 */
2528 static void
2529 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
2530 {
2531 mlxcx_register_data_t data;
2532 mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;
2533
2534 ASSERT(c->mlc_pcam);
2535 bzero(&data, sizeof (data));
2536
2537 /*
2538 * Okay, so we have access to the Ports Capability Mask (PCAM).
2539 * There are various things we need to check about it.
2540 */
2541
2542 VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
2543 MLXCX_REG_PCAM, &data));
2544
2545 /*
2546 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts.
2547 * As of now, only 0 is valid, and 1-255 are reserved. A future part
2548 * may return non-zero in these fields.
2549 */
2550 ASSERT0(pcam->mlrd_pcam_feature_group);
2551 ASSERT0(pcam->mlrd_pcam_access_reg_group);
2552
2553 c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
2554 MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
2555 }
2556
2557 /*
2558 * Snapshot all of the hardware capabilities that we care about and then modify
2559 * the HCA capabilities to get things moving.
2560 */
2561 static boolean_t
2562 mlxcx_init_caps(mlxcx_t *mlxp)
2563 {
2564 mlxcx_caps_t *c;
2565
2566 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2567
2568 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2569 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2570 mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2571 }
2572
2573 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2574 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2575 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2576 }
2577
2578 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2579 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2580 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2581 }
2582
2583 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2584 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2585 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2586 }
2587
2588 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2589 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2590 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2591 }
2592
2593 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2594 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2595 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2596 }
2597
2598 /*
2599 * Check the caps meet our requirements.
2600 */
2601 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2602
2603 if (gen->mlcap_general_log_pg_sz != 12) {
2604 mlxcx_warn(mlxp, "!hardware has page size != 4k "
2605 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2606 goto err;
2607 }
2608 if (gen->mlcap_general_cqe_version != 1) {
2609 mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2610 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2611 goto err;
2612 }
2613 if (gen->mlcap_general_port_type !=
2614 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2615 mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2616 goto err;
2617 }
2618 mlxp->mlx_nports = gen->mlcap_general_num_ports;
2619 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2620
2621 if (mlxp->mlx_type >= MLXCX_DEV_CX5 &&
2622 get_bit16(gen->mlcap_general_flags_c,
2623 MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) {
2624 c->mlc_pcam = B_TRUE;
2625 }
2626
2627 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2628
2629 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2630 MLXCX_ETH_CAP_CSUM_CAP);
2631 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2632 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2633
2634 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2635 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2636 if (c->mlc_max_lso_size == 1) {
2637 c->mlc_max_lso_size = 0;
2638 c->mlc_lso = B_FALSE;
2639 } else {
2640 c->mlc_lso = B_TRUE;
2641 }
2642
2643 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2644 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2645
2646 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2647 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2648 mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2649 goto err;
2650 }
2651 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2652 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2653 mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2654 "flow table entries");
2655 goto err;
2656 }
2657
2658 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2659 mlcap_flow_prop_log_max_ft_size;
2660 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2661 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2662 c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2663 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2664 c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2665 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2666
2667 return (B_TRUE);
2668
2669 err:
2670 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2671 return (B_FALSE);
2672 }
2673
2674 static int
2675 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2676 {
2677 mlxcx_t *mlxp;
2678
2679 if (cmd != DDI_DETACH)
2680 return (DDI_FAILURE);
2681
2682 mlxp = ddi_get_driver_private(dip);
2683 if (mlxp == NULL) {
2684 mlxcx_warn(NULL, "asked to detach, but missing instance "
2685 "private data");
2686 return (DDI_FAILURE);
2687 }
2688
2689 if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2690 if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2691 return (DDI_FAILURE);
2692 }
2693 mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2694 }
2695
2696 mlxcx_teardown(mlxp);
2697 return (DDI_SUCCESS);
2698 }
2699
2700 static size_t
2701 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2702 {
2703 size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2704 mlxp->mlx_props.mldp_rx_ngroups_small;
2705 size_t tirlim, flowlim, gflowlim;
2706
2707 tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2708 if (tirlim < ngroups) {
2709 mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2710 "on number of TIRs available", tirlim);
2711 ngroups = tirlim;
2712 }
2713
2714 flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2715 if (flowlim < ngroups) {
2716 mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2717 "on max size of RX flow tables", flowlim);
2718 ngroups = flowlim;
2719 }
2720
2721 /*
2722 * Restrict the number of groups so as not to exceed the maximum
2723 * number of flow tables from the device's capabilities. One root
2724 * flow table is used per port, and two flow tables per ring
2725 * group.
2726 */
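	/*
	 * Worked example with hypothetical capability values: with
	 * mlc_max_rx_ft == 64 and two ports,
	 *
	 *	flowlim = (64 - 2) / 2 == 31
	 *
	 * so at most 31 rx groups can be backed by the available flow
	 * tables.
	 */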
2727 flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2728 if (flowlim < ngroups) {
2729 mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2730 "on max number of RX flow tables",
2731 flowlim);
2732 ngroups = flowlim;
2733 }
2734
2735 do {
2736 gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2737 if (gflowlim < ngroups) {
2738 mlxcx_note(mlxp, "limiting number of rx groups to %u "
2739 "based on max total RX flows", gflowlim);
2740 --ngroups;
2741 }
2742 } while (gflowlim < ngroups);
2743
2744 return (ngroups);
2745 }
2746
2747 static int
2748 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2749 {
2750 mlxcx_t *mlxp;
2751 char tq_name[TASKQ_NAMELEN];
2752 uint_t i;
2753 int inst, ret;
2754
2755 if (cmd != DDI_ATTACH)
2756 return (DDI_FAILURE);
2757
2758 inst = ddi_get_instance(dip);
2759 ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2760 if (ret != 0)
2761 return (ret);
2762
2763 mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2764 if (mlxp == NULL)
2765 return (DDI_FAILURE);
2766 mlxp->mlx_dip = dip;
2767 mlxp->mlx_inst = inst;
2768 ddi_set_driver_private(dip, mlxp);
2769
2770 mlxcx_load_props(mlxp);
2771
2772 mlxcx_fm_init(mlxp);
2773 mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2774
2775 if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2776 DDI_SUCCESS) {
2777 mlxcx_warn(mlxp, "failed to initialize PCI config space");
2778 goto err;
2779 }
2780 mlxcx_get_model(mlxp);
2781 mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2782
2783 if (!mlxcx_regs_map(mlxp)) {
2784 goto err;
2785 }
2786 mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2787
2788 if (!mlxcx_cmd_queue_init(mlxp)) {
2789 goto err;
2790 }
2791 mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2792
2793 if (!mlxcx_cmd_enable_hca(mlxp)) {
2794 goto err;
2795 }
2796 mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2797
2798 if (!mlxcx_check_issi(mlxp)) {
2799 goto err;
2800 }
2801
2802 /*
2803 * We have to get our interrupts now so we know what priority to
2804 * create pagemtx with.
2805 */
2806 if (!mlxcx_intr_setup(mlxp)) {
2807 goto err;
2808 }
2809 mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2810
2811 mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2812 DDI_INTR_PRI(mlxp->mlx_intr_pri));
2813 avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2814 sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2815 mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2816
2817 /*
2818 * Taskq for asynchronous events which may interact with the HCA
2819 * via the command interface. Single threaded FIFO.
2820 */
2821 (void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2822 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2823 mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2824 TASKQ_PREPOPULATE);
2825 /*
2826 * Initialize any pre-allocated taskq param structs.
2827 */
2828 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2829 mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2830 mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2831 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2832 }
2833 mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2834
2835 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2836 goto err;
2837 }
2838
2839 if (!mlxcx_init_caps(mlxp)) {
2840 goto err;
2841 }
2842 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2843
2844 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2845 goto err;
2846 }
2847
2848 if (!mlxcx_cmd_init_hca(mlxp)) {
2849 goto err;
2850 }
2851 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2852
2853 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2854 goto err;
2855 }
2856
2857 if (mlxp->mlx_caps->mlc_pcam) {
2858 mlxcx_explore_pcam(mlxp, mlxp->mlx_caps);
2859 }
2860
2861 /*
2862 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2863 * doorbells.
2864 */
2865 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2866 goto err;
2867 }
2868 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2869 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2870 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2871 }
2872 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2873
2874 /*
2875 * Set up asynchronous event queue which handles control type events
2876 * like PAGE_REQUEST and CMD completion events.
2877 *
2878 * This will enable and arm the interrupt on EQ 0. Note that only page
2879 * reqs and cmd completions will be handled until we call
2880 * mlxcx_eq_set_attached further down (this way we don't need an extra
2881 * set of locks over the mlxcx_t sub-structs not allocated yet)
2882 */
2883 if (!mlxcx_setup_async_eqs(mlxp)) {
2884 goto err;
2885 }
2886
2887 /*
2888 * Allocate a protection and transport domain. These don't really do
2889 * anything for us (they're IB concepts), but we need to give their
2890 * ID numbers in other commands.
2891 */
2892 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2893 goto err;
2894 }
2895 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2896 goto err;
2897 }
2898 /*
2899 * Fetch the "reserved" lkey that lets us give linear addresses in
2900 * work queue entries, rather than having to mess with the NIC's
2901 * internal MMU.
2902 */
2903 if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2904 goto err;
2905 }
2906
2907 /*
2908 * Query our port information and current state, populate the
2909 * mlxcx_port_t structs.
2910 *
2911 * This also sets up the root flow tables and flow groups.
2912 */
2913 if (!mlxcx_setup_ports(mlxp)) {
2914 goto err;
2915 }
2916 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2917
2918 mlxcx_load_model_props(mlxp);
2919
2920 /*
2921 * Set up, enable and arm the rest of the interrupt EQs which will
2922 * service events from CQs.
2923 *
2924 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2925 * cleaned up.
2926 */
2927 if (!mlxcx_setup_eqs(mlxp)) {
2928 goto err;
2929 }
2930
2931 /* Completion queues */
2932 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2933 offsetof(mlxcx_completion_queue_t, mlcq_entry));
2934 mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2935
2936 /* Work queues (send queues, receive queues) */
2937 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2938 offsetof(mlxcx_work_queue_t, mlwq_entry));
2939 mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2940
2941 /*
2942 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2943 * "groups" we advertise to MAC.
2944 */
2945 mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2946 mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2947 sizeof (mlxcx_ring_group_t);
2948 mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2949
2950 mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2951 mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2952 sizeof (mlxcx_ring_group_t);
2953 mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2954
2955 mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2956
2957 /*
2958 * Sets up the free/busy buffers list for keeping track of packet
2959 * buffers.
2960 */
2961 if (!mlxcx_setup_bufs(mlxp))
2962 goto err;
2963 mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2964
2965 /*
2966 * Before we tell MAC about our rings/groups, we need to do enough
2967 * setup on them to be sure about the numbers and configuration that
2968 * we have. This will do basically everything short of allocating
2969 * packet buffers and starting the rings up.
2970 */
2971 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2972 if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2973 goto err;
2974 }
2975 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2976 if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2977 goto err;
2978 }
2979
2980 /*
2981 * Set up the periodic fault-check timers which check the queue
2982 * states. This must be done after all the queues have been
2983 * initialized; consequently, the timers must be torn down before
2984 * the queues are.
2985 */
2986 if (!mlxcx_setup_checktimers(mlxp)) {
2987 goto err;
2988 }
2989 mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2990
2991 /*
2992 * Some devices may not have a working temperature sensor; however,
2993 * there isn't a great way for us to know. We shouldn't fail attach if
2994 * this doesn't work.
2995 */
2996 if (mlxcx_setup_sensors(mlxp)) {
2997 mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
2998 }
2999
3000 /*
3001 * Finally, tell MAC that we exist!
3002 */
3003 if (!mlxcx_register_mac(mlxp)) {
3004 goto err;
3005 }
3006 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
3007
3008 /*
3009 * This tells the interrupt handlers they can start processing events
3010 * other than cmd completions and page requests.
3011 */
3012 mlxcx_eq_set_attached(mlxp);
3013
3014 return (DDI_SUCCESS);
3015
3016 err:
3017 mlxcx_teardown(mlxp);
3018 return (DDI_FAILURE);
3019 }
3020
3021 static struct cb_ops mlxcx_cb_ops = {
3022 .cb_open = nulldev,
3023 .cb_close = nulldev,
3024 .cb_strategy = nodev,
3025 .cb_print = nodev,
3026 .cb_dump = nodev,
3027 .cb_read = nodev,
3028 .cb_write = nodev,
3029 .cb_ioctl = nodev,
3030 .cb_devmap = nodev,
3031 .cb_mmap = nodev,
3032 .cb_segmap = nodev,
3033 .cb_chpoll = nochpoll,
3034 .cb_prop_op = ddi_prop_op,
3035 .cb_flag = D_MP,
3036 .cb_rev = CB_REV,
3037 .cb_aread = nodev,
3038 .cb_awrite = nodev
3039 };
3040
3041 static struct dev_ops mlxcx_dev_ops = {
3042 .devo_rev = DEVO_REV,
3043 .devo_refcnt = 0,
3044 .devo_getinfo = NULL,
3045 .devo_identify = nulldev,
3046 .devo_probe = nulldev,
3047 .devo_attach = mlxcx_attach,
3048 .devo_detach = mlxcx_detach,
3049 .devo_reset = nodev,
3050 .devo_quiesce = ddi_quiesce_not_supported,
3051 .devo_cb_ops = &mlxcx_cb_ops
3052 };
3053
3054 static struct modldrv mlxcx_modldrv = {
3055 .drv_modops = &mod_driverops,
3056 .drv_linkinfo = "Mellanox Connect-X 4/5/6",
3057 .drv_dev_ops = &mlxcx_dev_ops
3058 };
3059
3060 static struct modlinkage mlxcx_modlinkage = {
3061 .ml_rev = MODREV_1,
3062 .ml_linkage = { &mlxcx_modldrv, NULL }
3063 };
3064
3065 int
3066 _init(void)
3067 {
3068 int ret;
3069
3070 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
3071 if (ret != 0) {
3072 return (ret);
3073 }
3074
3075 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
3076
3077 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3078 mac_fini_ops(&mlxcx_dev_ops);
3079 ddi_soft_state_fini(&mlxcx_softstate);
3080 return (ret);
3081 }
3082
3083 return (DDI_SUCCESS);
3084 }
3085
3086 int
3087 _info(struct modinfo *modinfop)
3088 {
3089 return (mod_info(&mlxcx_modlinkage, modinfop));
3090 }
3091
3092 int
3093 _fini(void)
3094 {
3095 int ret;
3096
3097 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3098 return (ret);
3099 }
3100
3101 mac_fini_ops(&mlxcx_dev_ops);
3102
3103 ddi_soft_state_fini(&mlxcx_softstate);
3104
3105 return (DDI_SUCCESS);
3106 }
3107