/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

/*
 * The PRM for this family of parts is freely available, and can be found at:
 * https://www.mellanox.com/related-docs/user_manuals/ \
 *   Ethernet_Adapters_Programming_Manual.pdf
 */
/*
 * ConnectX glossary
 * -----------------
 *
 * WR		Work Request: something we've asked the hardware to do by
 *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
 *
 * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
 *
 * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
 *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
 *		types have different WQE structures, different commands for
 *		creating and destroying them, etc, but share a common context
 *		structure, counter setup and state graph.
 * SQ		Send Queue, a specific type of WQ that sends packets
 * RQ		Receive Queue, a specific type of WQ that receives packets
 *
 * CQ		Completion Queue: completion of WRs from a WQ are reported to
 *		one of these, as a CQE on its entry ring.
 * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
 *		info, as well as packet size, the ID of the WQ, and the index
 *		of the WQE which completed. Does not contain any packet data.
 *
 * EQ		Event Queue: a ring of event structs from the hardware informing
 *		us when particular events happen. Many events can point at a
 *		particular CQ which we should then go look at.
 * EQE		Event Queue Entry: an entry on the EQ ring
 *
 * UAR		User Access Region, a page of the device's PCI BAR which is
 *		tied to particular EQ/CQ/WQ sets and contains doorbells to
 *		ring to arm them for interrupts or wake them up for new work
 *
 * RQT		RQ Table, a collection of indexed RQs used to refer to the group
 *		as a single unit (for e.g. hashing/RSS).
 *
 * TIR		Transport Interface Receive, a bucket of resources for the
 *		reception of packets. TIRs have to point at either a single RQ
 *		or a table of RQs (RQT). They then serve as a target for flow
 *		table entries (FEs). TIRs that point at an RQT also contain the
 *		settings for hashing for RSS.
 *
 * TIS		Transport Interface Send, a bucket of resources associated with
 *		the transmission of packets. In particular, the temporary
 *		resources used for LSO internally in the card are accounted to
 *		a TIS.
 *
 * FT		Flow Table, a collection of FEs and FGs that can be referred to
 *		as a single entity (e.g. used as a target from another flow
 *		entry or set as the "root" table to handle incoming or outgoing
 *		packets). Packets arriving at a FT are matched against the
 *		FEs in the table until either one matches with a terminating
 *		action or all FEs are exhausted (it's first-match-wins but with
 *		some actions that are non-terminal, like counting actions).
 *
 * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
 *		they match on the same attributes of packets coming into the
 *		flow).
 *
 * FE		Flow Entry, an individual set of values to match against
 *		packets entering the flow table, combined with an action to
 *		take upon a successful match. The action we use most is
 *		"forward", which sends the packets to a TIR or another flow
 *		table and then stops further processing within the FE's FT.
 *
 * lkey/mkey	A reference to something similar to a page table but in the
 *		device's internal onboard MMU. Since Connect-X parts double as
 *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
 *		features which we try very hard not to use. For our WQEs we use
 *		the "reserved" lkey, which is a special value which indicates
 *		that addresses we give are linear addresses and should not be
 *		translated.
 *
 * PD		Protection Domain, an IB concept. We have to allocate one to
 *		provide as a parameter for new WQs, but we don't do anything
 *		with it.
 *
 * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
 *		provide it as a parameter to TIR/TIS creation, but we don't do
 *		anything with it.
 */
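
/*
 * To make the glossary above concrete, the sketch below shows -- in heavily
 * simplified, hypothetical form (none of these structs exist in the driver
 * with these names or layouts) -- how the objects point at one another: an
 * SQ or RQ reports completions into a CQ, a CQ reports events into an EQ,
 * and a TIR targets either a single RQ or an RQT full of RQs.  It is
 * illustrative only and is not compiled.
 */
#if 0
typedef struct example_eq {
	uint_t		eq_intr_vector;	/* MSI-X vector this EQ reports to */
} example_eq_t;

typedef struct example_cq {
	example_eq_t	*cq_eq;		/* EQ that receives our events */
} example_cq_t;

typedef struct example_wq {
	boolean_t	wq_is_send;	/* SQ if true, RQ if false */
	example_cq_t	*wq_cq;		/* CQ that receives our completions */
} example_wq_t;

typedef struct example_tir {
	example_wq_t	*tir_rq;	/* either a single RQ... */
	example_wq_t	**tir_rqt;	/* ...or an indexed table of RQs (RQT) */
	uint_t		tir_rqt_size;
} example_tir_t;
#endif
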
/*
 *
 * Data flow overview
 * ------------------
 *
 * This driver is a MAC ring-enabled driver which maps rings to send and recv
 * queues in hardware on the device.
 *
 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
 * sufficient space, and simplify the logic needed to work out which buffer
 * was completed.
 *
 * The CQs are then round-robin allocated onto EQs, of which we set up one per
 * interrupt that the system gives us for the device. Normally this means we
 * have 8 EQs.
 *
 * When we have >= 8 EQs available, we try to allocate only RX or only TX
 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
 *
 * EQ #0 is reserved for all event types other than completion events, and has
 * no CQs associated with it at any time. EQs #1 and upwards are only used for
 * handling CQ completion events.
 *
 *   +------+     +------+           +------+        +---------+
 *   | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
 *   +------+     +------+     |     +------+        +---------+
 *                             |
 *   +------+     +------+     |
 *   | SQ 1 |---->| CQ 1 |---+ |     +------+
 *   +------+     +------+   | +---> |      |
 *                           |       |      |
 *   +------+     +------+   |       | EQ 1 |        +---------+
 *   | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
 *   +------+     +------+   | +---> |      |        +---------+
 *                           | |     +------+
 *                           | |
 *     ...                   | |
 *                           | |     +------+
 *   +------+     +------+   +-----> |      |
 *   | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
 *   +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
 *                             |     |      |        +---------+
 *   +------+     +------+     | +-> |      |
 *   | RQ 1 |---->| CQ 4 |-----+ |   +------+
 *   +------+     +------+       |
 *                               |      ....
 *   +------+     +------+       |
 *   | RQ 2 |---->| CQ 5 |-------+
 *   +------+     +------+
 *
 *   ... (note this diagram does not show RX-only or TX-only EQs)
 *
 * For TX, we advertise all of the SQs we create as plain rings to MAC with
 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
 * and use the rings as it sees fit.
 *
 * For RX, we advertise actual groups in order to make use of hardware
 * classification.
 *
 * The hardware classification we use is based around Flow Tables, and we
 * currently ignore all of the eswitch features of the card. The NIC VPORT
 * is always set to promisc mode so that the eswitch sends us all of the
 * traffic that arrives on the NIC, and we use flow entries to manage
 * everything.
 *
 * We use 2 layers of flow tables for classification: traffic arrives at the
 * root RX flow table which contains MAC address filters. Those then send
 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
 * presence and VID filters.
 *
 * Since these parts only support doing RSS hashing on a single protocol at a
 * time, we have to use a third layer of flow tables as well to break traffic
 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
 * so that it can be sent to the appropriate TIR for hashing.
 *
 *   Incoming packets
 *          +           +---------+      +---------+
 *          |        +->| group 0 |      | group 0 |
 *          |        |  | vlan ft |  +-->| hash ft |
 *          v        |  |   L1    |  |   |   L2    |
 *     +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
 *     | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     | RQ0  |
 *     +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
 *          |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     | RQ1  |
 *          |        |  |         |  |   +---------+    +-----+    |     +------+
 *          |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     | RQ2  |
 *          v        |  |         |  |   +---------+    +-----+    | RQT +------+
 *     +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     | ...  |
 *     | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
 *     | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
 *     |   L0    |   |  | promisc |--+   +---------+    +-----+    |     |      |
 *     +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
 *     |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
 *     +---------+   |               ^   |  other  |-+
 *     |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
 *     +---------+                   |               +->| TIR |--->| RQ0 |
 *     |  MAC 1  |-+                 |                  +-----+    +-----+
 *     +---------+ |  +---------------+
 *     |  MAC 2  |-+  |              ^
 *     +---------+ |  |              |
 *     |  MAC 3  |-+  |  +---------+ |   +---------+
 *     +---------+ |  |  | group 1 | |   | group 1 |
 *     |  .....  | +--->| vlan ft | | +>| hash ft |
 *     |         |    |  |   L1    | | |  |   L2    |
 *     +---------+    |  +---------+ | |  +---------+    +-----+    +-----+------+
 *     | promisc |---+   | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     | RQ3  |
 *     +---------+       +---------+ |    +---------+    +-----+    |     +------+
 *                       |  .....  | |    |  UDPv6  |--->| TIR |--->|     | RQ4  |
 *                       |         | |    +---------+    +-----+    |     +------+
 *                       |         | |    |  TCPv4  |--->| TIR |--->|     | RQ5  |
 *                       |         | |    +---------+    +-----+    | RQT +------+
 *                       +---------+ |    |  UDPv4  |--->| TIR |--->|     | ...  |
 *                       |         | |    +---------+    +-----+    |     |      |
 *                       +---------+ |    |  IPv6   |--->| TIR |--->|     |      |
 *                       | promisc |--+   +---------+    +-----+    |     |      |
 *                       +---------+      |  IPv4   |--->| TIR |--->|     |      |
 *                                        +---------+    +-----+    +-----+------+
 *                                        |  other  |-+
 *                                        +---------+ |
 *                         .......                    |  +-----+    +-----+
 *                                                    +->| TIR |--->| RQ3 |
 *                                                       +-----+    +-----+
 *
 * Note that the "promisc" flow entries are only set/enabled when promisc
 * mode is enabled for the NIC. All promisc flow entries point directly at
 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
 * the "default group" in MAC).
 *
 * The "default" entry in the L1 VLAN filter flow tables is used when there
 * are no VLANs set for the group, to accept any traffic regardless of tag. It
 * is deleted as soon as a VLAN filter is added (and re-instated if the
 * last VLAN filter is removed).
 *
 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
 * space for packet data (they're a collection of scatter pointers only). TX
 * descriptors contain some space for "inline headers" (and the card requires
 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
 * but all the rest of the data comes from the gather pointers.
 *
 * When we get completions back they simply contain the ring index number of
 * the WR (work request) which completed. So, we manage the buffers for actual
 * packet data completely independently of the descriptors in this driver. When
 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
 * with the WQE index that we put it at, and therefore don't have to look at
 * the original descriptor at all when handling completions.
 *
 * For RX, we create sufficient packet data buffers to fill 150% of the
 * available descriptors for each ring. These all are pre-set-up for DMA and
 * have an mblk_t associated with them (with desballoc()).
 *
 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
 * large enough), or we copy it into a pre-allocated buffer set up in the same
 * way as for RX.
 */
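
/*
 * A minimal sketch (not compiled, and not the driver's real code) of the
 * completion-handling idea described above: a CQE tells us only which WQE
 * slot completed, so we keep our own per-slot record of buffers and look the
 * buffer up by that index rather than re-reading the descriptor.  All names
 * here (example_rq_t, example_buf_t, erq_bufs, cqe_wqe_counter) are
 * hypothetical.
 */
#if 0
typedef struct example_buf {
	uint16_t	eb_wqe_index;	/* stamped when posted to the ring */
} example_buf_t;

typedef struct example_rq {
	uint_t		erq_nents;	/* ring size (a power of two) */
	example_buf_t	**erq_bufs;	/* parallel array, one per WQE slot */
} example_rq_t;

static example_buf_t *
example_rq_complete(example_rq_t *rq, uint16_t cqe_wqe_counter)
{
	/* The hardware counter wraps; mask it down to a ring index. */
	uint16_t idx = cqe_wqe_counter & (rq->erq_nents - 1);
	example_buf_t *b = rq->erq_bufs[idx];

	ASSERT3U(b->eb_wqe_index, ==, idx);
	rq->erq_bufs[idx] = NULL;
	return (b);
}
#endif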

/*
 * Buffer lifecycle: RX
 * --------------------
 *
 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
 * straightforward.
 *
 * It is created (and has all its memory allocated) at the time of starting up
 * the RX ring it belongs to. Then it is placed on the "free" list in the
 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
 * before making a WQE for it.
 *
 * After a completion event occurs, the packet is either discarded (and the
 * buffer_t returned to the free list), or it is readied for loaning to MAC.
 *
 * Once MAC and the rest of the system have finished with the packet, they call
 * freemsg() on its mblk, which will call mlxcx_buf_mp_return and return the
 * buffer_t to the free list.
 *
 * At detach/teardown time, buffers are only ever destroyed from the free list.
 *
 *
 *                        +
 *                        |
 *                        | mlxcx_buf_create
 *                        |
 *                        v
 *                   +----+----+
 *                   | created |
 *                   +----+----+
 *                        |
 *                        |
 *                        | mlxcx_buf_return
 *                        |
 *                        v
 * mlxcx_buf_destroy +----+----+
 *         +---------|  free   |<---------------+
 *         |         +----+----+                |
 *         |              |                     |
 *         |              |                     | mlxcx_buf_return
 *         v              | mlxcx_buf_take      |
 *     +---+--+           v                     |
 *     | dead |        +---+---+                |
 *     +------+        | on WQ |- - - - - - - - >O
 *                     +---+---+                 ^
 *                         |                     |
 *                         |                     |
 *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
 *                         v                     |
 *                +-------+--------+             |
 *                | on loan to MAC |----------->O
 *                +----------------+   freemsg()
 *
 */
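
/*
 * The state graph above can be read as a small state machine.  The sketch
 * below (not compiled; the enum and its names are hypothetical) just
 * restates those transitions -- the real driver records the equivalent
 * state in the mlb_state field of each mlxcx_buffer_t.
 */
#if 0
typedef enum example_rx_buf_state {
	EXAMPLE_BUF_FREE,	/* on the shard's free list */
	EXAMPLE_BUF_ON_WQ,	/* posted to an RQ, awaiting completion */
	EXAMPLE_BUF_LOANED,	/* handed up to MAC as an mblk */
	EXAMPLE_BUF_DEAD	/* destroyed at detach/teardown */
} example_rx_buf_state_t;

/*
 * Transitions, matching the arrows in the diagram:
 *   mlxcx_buf_take:			FREE   -> ON_WQ
 *   mlxcx_buf_return:			ON_WQ  -> FREE	(discarded packet)
 *   mlxcx_buf_loan:			ON_WQ  -> LOANED
 *   freemsg() / mlxcx_buf_mp_return:	LOANED -> FREE
 *   mlxcx_buf_destroy:			FREE   -> DEAD	(teardown only)
 */
#endif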

/*
 * Buffer lifecycle: TX
 * --------------------
 *
 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
 * "foreign" buffers.
 *
 * The former have their memory allocated and DMA bound by this driver, while
 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
 * not owned by us, though we do DMA bind it (and take responsibility for
 * un-binding it when we're done with them).
 *
 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
 * SQ. Thus, there is a separate free list and mutex for each kind.
 *
 * Since a TX packet might consist of multiple mblks, we translate each mblk
 * into exactly one buffer_t. The buffer_ts are chained together in the same
 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
 *
 * Each chain of TX buffers may consist of foreign or driver buffers, in any
 * mixture.
 *
 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
 * it from the rest of the chain buffers.
 *
 * TX buffer chains are always returned to the free list by
 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
 * freeing all of the members.
 *
 * We only call freemsg() once, on the head of the TX buffer chain's original
 * mblk. This is true whether we copied it or bound it in a foreign buffer.
 */
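
/*
 * A rough sketch (not compiled) of how a multi-mblk TX packet maps onto a
 * chain of buffer_ts as described above: one buffer per mblk, chained in
 * order, with the head pointing at itself.  example_bind_or_copy() and the
 * example_* types are hypothetical stand-ins for the driver's real TX
 * buffer routines.
 */
#if 0
static example_buf_t *
example_chain_mblk(example_sq_t *sq, mblk_t *mp)
{
	example_buf_t *head = NULL, *b;
	mblk_t *m;

	for (m = mp; m != NULL; m = m->b_cont) {
		/* Foreign (DMA bound) or copied, depending on the fragment. */
		b = example_bind_or_copy(sq, m);
		if (head == NULL)
			head = b;
		else
			list_insert_tail(&head->eb_tx_chain, b);
		b->eb_tx_head = head;	/* the head points at itself */
	}
	return (head);
}
#endif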

/*
 * Startup and command interface
 * -----------------------------
 *
 * The command interface is the primary way in which we give control orders to
 * the hardware (e.g. actions like "create this queue" or "delete this flow
 * entry"). The command interface is never used to transmit or receive packets
 * -- that takes place only on the queues that are set up through it.
 *
 * In mlxcx_cmd.c we implement our use of the command interface on top of a
 * simple taskq. Since it's not performance critical, we busy-wait on command
 * completions and only process a single command at a time.
 *
 * If this becomes a problem later we can wire command completions up to EQ 0
 * once we have interrupts running.
 *
 * The startup/attach process for this card involves a bunch of different steps
 * which are summarised pretty well in the PRM. We have to send a number of
 * commands which do different things to start the card up, give it some pages
 * of our own memory for it to use, then start creating all the entities that
 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
 * and TDoms.
 */

/*
 * UARs
 * ----
 *
 * The pages of the PCI BAR other than the first few are reserved for use as
 * "UAR" sections in this device. Each UAR section can be used as a set of
 * doorbells for our queues.
 *
 * Currently we just make one single UAR for all of our queues. It doesn't
 * seem to be a major limitation yet.
 *
 * When we're sending packets through an SQ, the PRM is not awfully clear about
 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
 * (it's clear on the pattern of alternation you're expected to use between
 * even and odd for Blueflame sends, but not for regular doorbells).
 *
 * Currently we don't do the even-odd alternating pattern for ordinary
 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
 * least on Connect-X4 Lx.
 */
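
/*
 * An illustrative sketch (not compiled) of the busy-wait command style
 * described above: post a command, then poll for its completion with a short
 * delay until the hardware reports a status or we give up.  example_cmd_t
 * and the example_cmd_*() helpers are hypothetical; the real implementation
 * lives in mlxcx_cmd.c.
 */
#if 0
static boolean_t
example_cmd_run(example_cmd_t *cmd)
{
	uint_t tries = 5000;

	example_cmd_post(cmd);
	while (!example_cmd_done(cmd)) {
		if (--tries == 0)
			return (B_FALSE);
		/* Not performance critical: sleep briefly and re-check. */
		delay(drv_usectohz(1000));
	}
	return (example_cmd_status(cmd) == 0);
}
#endif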

/*
 * Lock ordering
 * -------------
 *
 * Interrupt side:
 *
 *  - mleq_mtx
 *  - mlcq_mtx
 *  - mlcq_bufbmtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlp_mtx
 *
 * GLD side:
 *
 *  - mlp_mtx
 *  - mlg_mtx
 *  - mlg_*.mlft_mtx
 *  - mlp_*.mlft_mtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlcq_bufbmtx
 *  - mleq_mtx
 *  - mlcq_mtx
 *
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/time.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);

#define	MLXCX_MODULE_NAME	"mlxcx"
/*
 * We give this to the firmware, so it has to be in a fixed format that it
 * understands.
 */
#define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"

/*
 * Firmware may take a while to reclaim pages. Try a set number of times.
 */
clock_t mlxcx_reclaim_delay = 1000 * 50;	/* 50 ms in us */
uint_t mlxcx_reclaim_tries = 100;		/* Wait at most 5000ms */

static void *mlxcx_softstate;

/*
 * Fault detection thresholds.
 */
uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;

static void
mlxcx_load_props(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;

	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
	    MLXCX_EQ_SIZE_SHIFT_DFLT);
	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
	    MLXCX_CQ_SIZE_SHIFT_DFLT);
	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
	    MLXCX_SQ_SIZE_SHIFT_DFLT);
	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
	    MLXCX_RQ_SIZE_SHIFT_DFLT);

	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
	    MLXCX_CQEMOD_COUNT_DFLT);
	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);

	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
	    MLXCX_TX_NGROUPS_DFLT);
	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);

	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
	    MLXCX_RX_NGROUPS_LARGE_DFLT);
	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
	    MLXCX_RX_NGROUPS_SMALL_DFLT);
	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);

	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);

	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
	    MLXCX_TX_BIND_THRESHOLD_DFLT);

	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);

	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
}
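
/*
 * The *_size_shift properties loaded above are exponents, not entry counts:
 * a queue configured with shift N has (1 << N) entries.  The hypothetical
 * helper below (not compiled) shows the arithmetic for an EQ; the same
 * pattern applies to the CQ, SQ and RQ shifts.
 */
#if 0
static size_t
example_eq_ring_bytes(mlxcx_t *mlxp)
{
	size_t nents = (size_t)1 << mlxp->mlx_props.mldp_eq_size_shift;

	return (nents * sizeof (mlxcx_eventq_ent_t));
}
#endif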

void
mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
	} else {
		vcmn_err(CE_NOTE, fmt, ap);
	}
	va_end(ap);
}

void
mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
	} else {
		vcmn_err(CE_WARN, fmt, ap);
	}
	va_end(ap);
}

void
mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
556 { 557 va_list ap; 558 559 va_start(ap, fmt); 560 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 561 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 562 } else { 563 vcmn_err(CE_PANIC, fmt, ap); 564 } 565 va_end(ap); 566 } 567 568 uint16_t 569 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 570 { 571 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 572 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 573 } 574 575 uint32_t 576 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 577 { 578 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 579 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 580 } 581 582 uint64_t 583 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 584 { 585 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 586 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 587 } 588 589 void 590 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 591 { 592 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 593 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 594 } 595 596 void 597 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 598 { 599 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 600 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 601 } 602 603 void 604 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 605 { 606 /* 607 * The UAR is always inside the first BAR, which we mapped as 608 * mlx_regs 609 */ 610 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 611 (uintptr_t)mlxp->mlx_regs_base; 612 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 613 } 614 615 void 616 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 617 { 618 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 619 (uintptr_t)mlxp->mlx_regs_base; 620 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 621 } 622 623 static void 624 mlxcx_fm_fini(mlxcx_t *mlxp) 625 { 626 if (mlxp->mlx_fm_caps == 0) 627 return; 628 629 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 630 ddi_fm_handler_unregister(mlxp->mlx_dip); 631 632 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 633 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 634 pci_ereport_teardown(mlxp->mlx_dip); 635 636 ddi_fm_fini(mlxp->mlx_dip); 637 638 mlxp->mlx_fm_caps = 0; 639 } 640 641 void 642 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail) 643 { 644 uint64_t ena; 645 char buf[FM_MAX_CLASS]; 646 647 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 648 return; 649 650 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 651 ena = fm_ena_generate(0, FM_ENA_FMT1); 652 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 653 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 654 NULL); 655 } 656 657 static int 658 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 659 { 660 /* 661 * as the driver can always deal with an error in any dma or 662 * access handle, we can just return the fme_status value. 
663 */ 664 pci_ereport_post(dip, err, NULL); 665 return (err->fme_status); 666 } 667 668 static void 669 mlxcx_fm_init(mlxcx_t *mlxp) 670 { 671 ddi_iblock_cookie_t iblk; 672 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 673 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 674 675 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 676 DDI_PROP_DONTPASS, "fm_capable", def); 677 678 if (mlxp->mlx_fm_caps < 0) { 679 mlxp->mlx_fm_caps = 0; 680 } 681 mlxp->mlx_fm_caps &= def; 682 683 if (mlxp->mlx_fm_caps == 0) 684 return; 685 686 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 687 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 688 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 689 pci_ereport_setup(mlxp->mlx_dip); 690 } 691 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 692 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 693 (void *)mlxp); 694 } 695 } 696 697 static void 698 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 699 { 700 mlxcx_buffer_t *buf; 701 702 mutex_enter(&s->mlbs_mtx); 703 while (!list_is_empty(&s->mlbs_busy)) 704 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 705 while ((buf = list_head(&s->mlbs_free)) != NULL) { 706 mlxcx_buf_destroy(mlxp, buf); 707 } 708 list_destroy(&s->mlbs_free); 709 list_destroy(&s->mlbs_busy); 710 mutex_exit(&s->mlbs_mtx); 711 712 cv_destroy(&s->mlbs_free_nonempty); 713 mutex_destroy(&s->mlbs_mtx); 714 } 715 716 static void 717 mlxcx_teardown_bufs(mlxcx_t *mlxp) 718 { 719 mlxcx_buf_shard_t *s; 720 721 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 722 mlxcx_mlbs_teardown(mlxp, s); 723 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 724 } 725 list_destroy(&mlxp->mlx_buf_shards); 726 727 kmem_cache_destroy(mlxp->mlx_bufs_cache); 728 } 729 730 static void 731 mlxcx_teardown_pages(mlxcx_t *mlxp) 732 { 733 uint_t nzeros = 0; 734 735 mutex_enter(&mlxp->mlx_pagemtx); 736 737 while (mlxp->mlx_npages > 0) { 738 int32_t req, ret; 739 uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES]; 740 741 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 742 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 743 744 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 745 mlxcx_warn(mlxp, "hardware refused to return pages, " 746 "leaking %u remaining pages", mlxp->mlx_npages); 747 goto out; 748 } 749 750 for (int32_t i = 0; i < ret; i++) { 751 mlxcx_dev_page_t *mdp, probe; 752 bzero(&probe, sizeof (probe)); 753 probe.mxdp_pa = pas[i]; 754 755 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 756 757 if (mdp != NULL) { 758 avl_remove(&mlxp->mlx_pages, mdp); 759 mlxp->mlx_npages--; 760 mlxcx_dma_free(&mdp->mxdp_dma); 761 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 762 } else { 763 mlxcx_panic(mlxp, "hardware returned a page " 764 "with PA 0x%" PRIx64 " but we have no " 765 "record of giving out such a page", pas[i]); 766 } 767 } 768 769 /* 770 * If no pages were returned, note that fact. 
771 */ 772 if (ret == 0) { 773 nzeros++; 774 if (nzeros > mlxcx_reclaim_tries) { 775 mlxcx_warn(mlxp, "hardware refused to return " 776 "pages, leaking %u remaining pages", 777 mlxp->mlx_npages); 778 goto out; 779 } 780 delay(drv_usectohz(mlxcx_reclaim_delay)); 781 } 782 } 783 784 avl_destroy(&mlxp->mlx_pages); 785 786 out: 787 mutex_exit(&mlxp->mlx_pagemtx); 788 mutex_destroy(&mlxp->mlx_pagemtx); 789 } 790 791 static boolean_t 792 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 793 { 794 ddi_device_acc_attr_t acc; 795 ddi_dma_attr_t attr; 796 boolean_t ret; 797 size_t sz, i; 798 799 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 800 801 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 802 mleq->mleq_nents = (1 << mleq->mleq_entshift); 803 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 804 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 805 806 mlxcx_dma_acc_attr(mlxp, &acc); 807 mlxcx_dma_queue_attr(mlxp, &attr); 808 809 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 810 B_TRUE, sz, B_TRUE); 811 if (!ret) { 812 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 813 return (B_FALSE); 814 } 815 816 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 817 818 for (i = 0; i < mleq->mleq_nents; ++i) 819 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 820 821 mleq->mleq_state |= MLXCX_EQ_ALLOC; 822 823 return (B_TRUE); 824 } 825 826 static void 827 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 828 { 829 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 830 if (mleq->mleq_state & MLXCX_EQ_CREATED) 831 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 832 833 mlxcx_dma_free(&mleq->mleq_dma); 834 mleq->mleq_ent = NULL; 835 836 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 837 } 838 839 void 840 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 841 { 842 mlxcx_flow_group_t *fg; 843 mlxcx_flow_entry_t *fe; 844 int i; 845 846 ASSERT(mutex_owned(&ft->mlft_mtx)); 847 848 for (i = ft->mlft_nents - 1; i >= 0; --i) { 849 fe = &ft->mlft_ent[i]; 850 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 851 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 852 mlxcx_panic(mlxp, "failed to delete flow " 853 "entry %u on table %u", i, 854 ft->mlft_num); 855 } 856 } 857 } 858 859 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 860 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 861 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 862 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 863 mlxcx_panic(mlxp, "failed to destroy flow " 864 "group %u", fg->mlfg_num); 865 } 866 } 867 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 868 } 869 list_destroy(&ft->mlft_groups); 870 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 871 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 872 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 873 mlxcx_panic(mlxp, "failed to destroy flow table %u", 874 ft->mlft_num); 875 } 876 } 877 kmem_free(ft->mlft_ent, ft->mlft_entsize); 878 ft->mlft_ent = NULL; 879 mutex_exit(&ft->mlft_mtx); 880 mutex_destroy(&ft->mlft_mtx); 881 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 882 } 883 884 static void 885 mlxcx_teardown_ports(mlxcx_t *mlxp) 886 { 887 uint_t i; 888 mlxcx_port_t *p; 889 mlxcx_flow_table_t *ft; 890 891 for (i = 0; i < mlxp->mlx_nports; ++i) { 892 p = &mlxp->mlx_ports[i]; 893 if (!(p->mlp_init & MLXCX_PORT_INIT)) 894 continue; 895 mutex_enter(&p->mlp_mtx); 896 if ((ft = p->mlp_rx_flow) != NULL) { 897 mutex_enter(&ft->mlft_mtx); 898 /* 899 * teardown_flow_table() will destroy the mutex, so 900 * we don't release it here. 
901 */ 902 mlxcx_teardown_flow_table(mlxp, ft); 903 } 904 mutex_exit(&p->mlp_mtx); 905 mutex_destroy(&p->mlp_mtx); 906 p->mlp_init &= ~MLXCX_PORT_INIT; 907 } 908 909 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 910 mlxp->mlx_ports = NULL; 911 } 912 913 static void 914 mlxcx_teardown_wqs(mlxcx_t *mlxp) 915 { 916 mlxcx_work_queue_t *mlwq; 917 918 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 919 mlxcx_wq_teardown(mlxp, mlwq); 920 } 921 list_destroy(&mlxp->mlx_wqs); 922 } 923 924 static void 925 mlxcx_teardown_cqs(mlxcx_t *mlxp) 926 { 927 mlxcx_completion_queue_t *mlcq; 928 929 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 930 mlxcx_cq_teardown(mlxp, mlcq); 931 } 932 list_destroy(&mlxp->mlx_cqs); 933 } 934 935 static void 936 mlxcx_teardown_eqs(mlxcx_t *mlxp) 937 { 938 mlxcx_event_queue_t *mleq; 939 uint_t i; 940 941 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 942 mleq = &mlxp->mlx_eqs[i]; 943 mutex_enter(&mleq->mleq_mtx); 944 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 945 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 946 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 947 mlxcx_warn(mlxp, "failed to destroy " 948 "event queue idx %u eqn %u", 949 i, mleq->mleq_num); 950 } 951 } 952 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 953 mlxcx_eq_rele_dma(mlxp, mleq); 954 } 955 mutex_exit(&mleq->mleq_mtx); 956 } 957 } 958 959 static void 960 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 961 { 962 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 963 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 964 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 965 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 966 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 967 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 968 } 969 970 static void 971 mlxcx_teardown(mlxcx_t *mlxp) 972 { 973 uint_t i; 974 dev_info_t *dip = mlxp->mlx_dip; 975 976 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 977 mlxcx_teardown_groups(mlxp); 978 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 979 } 980 981 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 982 mlxcx_teardown_checktimers(mlxp); 983 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 984 } 985 986 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 987 mlxcx_teardown_wqs(mlxp); 988 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 989 } 990 991 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 992 mlxcx_teardown_cqs(mlxp); 993 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 994 } 995 996 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 997 mlxcx_teardown_bufs(mlxp); 998 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 999 } 1000 1001 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1002 mlxcx_teardown_ports(mlxp); 1003 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1004 } 1005 1006 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1007 mlxcx_teardown_eqs(mlxp); 1008 mlxcx_intr_teardown(mlxp); 1009 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1010 } 1011 1012 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1013 if (mlxp->mlx_uar.mlu_allocated) { 1014 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1015 mlxcx_warn(mlxp, "failed to release UAR"); 1016 } 1017 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1018 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1019 } 1020 if (mlxp->mlx_pd.mlpd_allocated && 1021 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1022 mlxcx_warn(mlxp, "failed to release PD"); 1023 } 1024 if (mlxp->mlx_tdom.mltd_allocated && 1025 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1026 mlxcx_warn(mlxp, "failed to release TDOM"); 1027 } 1028 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1029 } 1030 1031 if (mlxp->mlx_attach & 
MLXCX_ATTACH_INIT_HCA) { 1032 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1033 mlxcx_warn(mlxp, "failed to send teardown HCA " 1034 "command during device detach"); 1035 } 1036 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1037 } 1038 1039 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1040 mlxcx_teardown_pages(mlxp); 1041 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1042 } 1043 1044 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1045 if (!mlxcx_cmd_disable_hca(mlxp)) { 1046 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1047 "during device detach"); 1048 } 1049 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1050 } 1051 1052 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1053 mlxcx_cmd_queue_fini(mlxp); 1054 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1055 } 1056 1057 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1058 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1059 mlxp->mlx_caps = NULL; 1060 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1061 } 1062 1063 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1064 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1065 mlxp->mlx_regs_handle = NULL; 1066 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1067 } 1068 1069 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1070 pci_config_teardown(&mlxp->mlx_cfg_handle); 1071 mlxp->mlx_cfg_handle = NULL; 1072 mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1073 } 1074 1075 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1076 mlxcx_fm_fini(mlxp); 1077 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1078 } 1079 1080 VERIFY3S(mlxp->mlx_attach, ==, 0); 1081 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1082 ddi_set_driver_private(dip, NULL); 1083 } 1084 1085 static boolean_t 1086 mlxcx_regs_map(mlxcx_t *mlxp) 1087 { 1088 off_t memsize; 1089 int ret; 1090 ddi_device_acc_attr_t da; 1091 1092 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1093 DDI_SUCCESS) { 1094 mlxcx_warn(mlxp, "failed to get register set size"); 1095 return (B_FALSE); 1096 } 1097 1098 /* 1099 * All data in the main BAR is kept in big-endian even though it's a PCI 1100 * device. 
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static boolean_t
mlxcx_check_issi(mlxcx_t *mlxp)
{
	uint32_t issi;

	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
		mlxcx_warn(mlxp, "failed to get ISSI");
		return (B_FALSE);
	}

	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
		mlxcx_warn(mlxp, "failed to set ISSI to %u",
		    MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	return (B_TRUE);
}

boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		return (B_TRUE);
	}

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, give them to the
	 * hardware in chunks.
1188 */ 1189 while (npages > 0) { 1190 mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES]; 1191 int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages); 1192 1193 for (i = 0; i < togive; i++) { 1194 pages[i] = list_remove_head(&plist); 1195 } 1196 1197 if (!mlxcx_cmd_give_pages(mlxp, 1198 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) { 1199 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1200 "pages!", togive); 1201 for (i = 0; i < togive; i++) { 1202 list_insert_tail(&plist, pages[i]); 1203 } 1204 goto cleanup_npages; 1205 } 1206 1207 mutex_enter(&mlxp->mlx_pagemtx); 1208 for (i = 0; i < togive; i++) { 1209 avl_add(&mlxp->mlx_pages, pages[i]); 1210 } 1211 mlxp->mlx_npages += togive; 1212 mutex_exit(&mlxp->mlx_pagemtx); 1213 npages -= togive; 1214 } 1215 1216 list_destroy(&plist); 1217 1218 return (B_TRUE); 1219 1220 cleanup_npages: 1221 while ((mdp = list_remove_head(&plist)) != NULL) { 1222 mlxcx_dma_free(&mdp->mxdp_dma); 1223 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1224 } 1225 list_destroy(&plist); 1226 return (B_FALSE); 1227 } 1228 1229 static boolean_t 1230 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1231 { 1232 int32_t npages; 1233 1234 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1235 mlxcx_warn(mlxp, "failed to determine boot pages"); 1236 return (B_FALSE); 1237 } 1238 1239 return (mlxcx_give_pages(mlxp, npages)); 1240 } 1241 1242 static int 1243 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1244 { 1245 mlxcx_t *mlxp = cookie; 1246 mlxcx_buffer_t *b = arg; 1247 1248 bzero(b, sizeof (mlxcx_buffer_t)); 1249 b->mlb_mlx = mlxp; 1250 b->mlb_state = MLXCX_BUFFER_INIT; 1251 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1252 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1253 1254 return (0); 1255 } 1256 1257 static void 1258 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1259 { 1260 mlxcx_t *mlxp = cookie; 1261 mlxcx_buffer_t *b = arg; 1262 VERIFY3P(b->mlb_mlx, ==, mlxp); 1263 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1264 list_destroy(&b->mlb_tx_chain); 1265 } 1266 1267 mlxcx_buf_shard_t * 1268 mlxcx_mlbs_create(mlxcx_t *mlxp) 1269 { 1270 mlxcx_buf_shard_t *s; 1271 1272 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1273 1274 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1275 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1276 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1277 offsetof(mlxcx_buffer_t, mlb_entry)); 1278 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1279 offsetof(mlxcx_buffer_t, mlb_entry)); 1280 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1281 1282 list_insert_tail(&mlxp->mlx_buf_shards, s); 1283 1284 return (s); 1285 } 1286 1287 static boolean_t 1288 mlxcx_setup_bufs(mlxcx_t *mlxp) 1289 { 1290 char namebuf[KSTAT_STRLEN]; 1291 1292 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1293 ddi_get_instance(mlxp->mlx_dip)); 1294 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1295 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1296 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1297 NULL, mlxp, NULL, 0); 1298 1299 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1300 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1301 1302 return (B_TRUE); 1303 } 1304 1305 static void 1306 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1307 const char *state, uint8_t statenum) 1308 { 1309 uint64_t ena; 1310 char buf[FM_MAX_CLASS]; 1311 1312 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1313 return; 1314 1315 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1316 
MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1317 ena = fm_ena_generate(0, FM_ENA_FMT1); 1318 1319 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1320 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1321 "state", DATA_TYPE_STRING, state, 1322 "state_num", DATA_TYPE_UINT8, statenum, 1323 "qtype", DATA_TYPE_STRING, qtype, 1324 "qnum", DATA_TYPE_UINT32, qnum, 1325 NULL); 1326 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1327 } 1328 1329 static void 1330 mlxcx_eq_check(void *arg) 1331 { 1332 mlxcx_t *mlxp = (mlxcx_t *)arg; 1333 mlxcx_event_queue_t *eq; 1334 mlxcx_eventq_ctx_t ctx; 1335 const char *str; 1336 1337 uint_t i; 1338 1339 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1340 eq = &mlxp->mlx_eqs[i]; 1341 if (!(eq->mleq_state & MLXCX_EQ_CREATED) || 1342 (eq->mleq_state & MLXCX_EQ_DESTROYED)) 1343 continue; 1344 mutex_enter(&eq->mleq_mtx); 1345 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) { 1346 mutex_exit(&eq->mleq_mtx); 1347 continue; 1348 } 1349 1350 str = "???"; 1351 switch (ctx.mleqc_status) { 1352 case MLXCX_EQ_STATUS_OK: 1353 break; 1354 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1355 str = "WRITE_FAILURE"; 1356 break; 1357 } 1358 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1359 mlxcx_fm_qstate_ereport(mlxp, "event", 1360 eq->mleq_num, str, ctx.mleqc_status); 1361 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1362 eq->mleq_intr_index, ctx.mleqc_status, str); 1363 } 1364 1365 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1366 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1367 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1368 ++eq->mleq_check_disarm_cnt >= 3) { 1369 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1370 mlxcx_warn(mlxp, "EQ %u isn't armed", 1371 eq->mleq_intr_index); 1372 } 1373 eq->mleq_check_disarm_cc = eq->mleq_cc; 1374 } else { 1375 eq->mleq_check_disarm_cc = 0; 1376 eq->mleq_check_disarm_cnt = 0; 1377 } 1378 1379 mutex_exit(&eq->mleq_mtx); 1380 } 1381 } 1382 1383 static void 1384 mlxcx_cq_check(void *arg) 1385 { 1386 mlxcx_t *mlxp = (mlxcx_t *)arg; 1387 mlxcx_completion_queue_t *cq; 1388 mlxcx_completionq_ctx_t ctx; 1389 const char *str, *type; 1390 uint_t v; 1391 1392 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1393 cq = list_next(&mlxp->mlx_cqs, cq)) { 1394 mutex_enter(&cq->mlcq_mtx); 1395 if (!(cq->mlcq_state & MLXCX_CQ_CREATED) || 1396 (cq->mlcq_state & MLXCX_CQ_DESTROYED) || 1397 (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) { 1398 mutex_exit(&cq->mlcq_mtx); 1399 continue; 1400 } 1401 if (cq->mlcq_fm_repd_qstate) { 1402 mutex_exit(&cq->mlcq_mtx); 1403 continue; 1404 } 1405 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) { 1406 mutex_exit(&cq->mlcq_mtx); 1407 continue; 1408 } 1409 if (cq->mlcq_wq != NULL) { 1410 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1411 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1412 type = "rx "; 1413 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1414 type = "tx "; 1415 else 1416 type = ""; 1417 } else { 1418 type = ""; 1419 } 1420 1421 str = "???"; 1422 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1423 switch (v) { 1424 case MLXCX_CQC_STATUS_OK: 1425 break; 1426 case MLXCX_CQC_STATUS_OVERFLOW: 1427 str = "OVERFLOW"; 1428 break; 1429 case MLXCX_CQC_STATUS_WRITE_FAIL: 1430 str = "WRITE_FAIL"; 1431 break; 1432 case MLXCX_CQC_STATUS_INVALID: 1433 str = "INVALID"; 1434 break; 1435 } 1436 if (v != MLXCX_CQC_STATUS_OK) { 1437 mlxcx_fm_qstate_ereport(mlxp, "completion", 1438 cq->mlcq_num, str, v); 1439 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1440 type, cq->mlcq_num, v, str); 1441 cq->mlcq_fm_repd_qstate = B_TRUE; 1442 } 1443 1444 v 
= get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1445 if (v != MLXCX_CQC_STATE_ARMED && 1446 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1447 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1448 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1449 ++cq->mlcq_check_disarm_cnt >= 3) { 1450 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1451 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1452 type, cq->mlcq_num, cq); 1453 } 1454 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1455 } else { 1456 cq->mlcq_check_disarm_cnt = 0; 1457 cq->mlcq_check_disarm_cc = 0; 1458 } 1459 mutex_exit(&cq->mlcq_mtx); 1460 } 1461 } 1462 1463 void 1464 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1465 { 1466 mlxcx_sq_ctx_t ctx; 1467 mlxcx_sq_state_t state; 1468 1469 ASSERT(mutex_owned(&sq->mlwq_mtx)); 1470 1471 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1472 return; 1473 1474 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1475 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1476 switch (state) { 1477 case MLXCX_SQ_STATE_RST: 1478 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1479 mlxcx_fm_qstate_ereport(mlxp, "send", 1480 sq->mlwq_num, "RST", state); 1481 sq->mlwq_fm_repd_qstate = B_TRUE; 1482 } 1483 break; 1484 case MLXCX_SQ_STATE_RDY: 1485 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1486 mlxcx_fm_qstate_ereport(mlxp, "send", 1487 sq->mlwq_num, "RDY", state); 1488 sq->mlwq_fm_repd_qstate = B_TRUE; 1489 } 1490 break; 1491 case MLXCX_SQ_STATE_ERR: 1492 mlxcx_fm_qstate_ereport(mlxp, "send", 1493 sq->mlwq_num, "ERR", state); 1494 sq->mlwq_fm_repd_qstate = B_TRUE; 1495 break; 1496 default: 1497 mlxcx_fm_qstate_ereport(mlxp, "send", 1498 sq->mlwq_num, "???", state); 1499 sq->mlwq_fm_repd_qstate = B_TRUE; 1500 break; 1501 } 1502 } 1503 1504 void 1505 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1506 { 1507 mlxcx_rq_ctx_t ctx; 1508 mlxcx_rq_state_t state; 1509 1510 ASSERT(mutex_owned(&rq->mlwq_mtx)); 1511 1512 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1513 return; 1514 1515 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1516 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1517 switch (state) { 1518 case MLXCX_RQ_STATE_RST: 1519 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1520 mlxcx_fm_qstate_ereport(mlxp, "receive", 1521 rq->mlwq_num, "RST", state); 1522 rq->mlwq_fm_repd_qstate = B_TRUE; 1523 } 1524 break; 1525 case MLXCX_RQ_STATE_RDY: 1526 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1527 mlxcx_fm_qstate_ereport(mlxp, "receive", 1528 rq->mlwq_num, "RDY", state); 1529 rq->mlwq_fm_repd_qstate = B_TRUE; 1530 } 1531 break; 1532 case MLXCX_RQ_STATE_ERR: 1533 mlxcx_fm_qstate_ereport(mlxp, "receive", 1534 rq->mlwq_num, "ERR", state); 1535 rq->mlwq_fm_repd_qstate = B_TRUE; 1536 break; 1537 default: 1538 mlxcx_fm_qstate_ereport(mlxp, "receive", 1539 rq->mlwq_num, "???", state); 1540 rq->mlwq_fm_repd_qstate = B_TRUE; 1541 break; 1542 } 1543 } 1544 1545 static void 1546 mlxcx_wq_check(void *arg) 1547 { 1548 mlxcx_t *mlxp = (mlxcx_t *)arg; 1549 mlxcx_work_queue_t *wq; 1550 1551 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1552 wq = list_next(&mlxp->mlx_wqs, wq)) { 1553 mutex_enter(&wq->mlwq_mtx); 1554 if (!(wq->mlwq_state & MLXCX_WQ_CREATED) || 1555 (wq->mlwq_state & MLXCX_WQ_DESTROYED) || 1556 (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) { 1557 mutex_exit(&wq->mlwq_mtx); 1558 continue; 1559 } 1560 if (wq->mlwq_fm_repd_qstate) { 1561 mutex_exit(&wq->mlwq_mtx); 1562 continue; 1563 } 1564 switch (wq->mlwq_type) { 1565 case MLXCX_WQ_TYPE_SENDQ: 1566 mlxcx_check_sq(mlxp, wq); 1567 break; 1568 case 
MLXCX_WQ_TYPE_RECVQ: 1569 mlxcx_check_rq(mlxp, wq); 1570 break; 1571 } 1572 mutex_exit(&wq->mlwq_mtx); 1573 } 1574 } 1575 1576 static boolean_t 1577 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1578 { 1579 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1580 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1581 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1582 DDI_IPL_0); 1583 } 1584 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1585 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1586 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1587 DDI_IPL_0); 1588 } 1589 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1590 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1591 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1592 DDI_IPL_0); 1593 } 1594 return (B_TRUE); 1595 } 1596 1597 int 1598 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1599 { 1600 const mlxcx_flow_entry_t *left = arg0; 1601 const mlxcx_flow_entry_t *right = arg1; 1602 int bcmpr; 1603 1604 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1605 sizeof (left->mlfe_dmac)); 1606 if (bcmpr < 0) 1607 return (-1); 1608 if (bcmpr > 0) 1609 return (1); 1610 if (left->mlfe_vid < right->mlfe_vid) 1611 return (-1); 1612 if (left->mlfe_vid > right->mlfe_vid) 1613 return (1); 1614 return (0); 1615 } 1616 1617 int 1618 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1619 { 1620 const mlxcx_group_mac_t *left = arg0; 1621 const mlxcx_group_mac_t *right = arg1; 1622 int bcmpr; 1623 1624 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1625 sizeof (left->mlgm_mac)); 1626 if (bcmpr < 0) 1627 return (-1); 1628 if (bcmpr > 0) 1629 return (1); 1630 return (0); 1631 } 1632 1633 int 1634 mlxcx_page_compare(const void *arg0, const void *arg1) 1635 { 1636 const mlxcx_dev_page_t *p0 = arg0; 1637 const mlxcx_dev_page_t *p1 = arg1; 1638 1639 if (p0->mxdp_pa < p1->mxdp_pa) 1640 return (-1); 1641 if (p0->mxdp_pa > p1->mxdp_pa) 1642 return (1); 1643 return (0); 1644 } 1645 1646 static boolean_t 1647 mlxcx_setup_ports(mlxcx_t *mlxp) 1648 { 1649 uint_t i, j; 1650 mlxcx_port_t *p; 1651 mlxcx_flow_table_t *ft; 1652 mlxcx_flow_group_t *fg; 1653 mlxcx_flow_entry_t *fe; 1654 1655 VERIFY3U(mlxp->mlx_nports, >, 0); 1656 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1657 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1658 1659 for (i = 0; i < mlxp->mlx_nports; ++i) { 1660 p = &mlxp->mlx_ports[i]; 1661 p->mlp_num = i; 1662 p->mlp_init |= MLXCX_PORT_INIT; 1663 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1664 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1665 mutex_enter(&p->mlp_mtx); 1666 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1667 mutex_exit(&p->mlp_mtx); 1668 goto err; 1669 } 1670 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1671 mutex_exit(&p->mlp_mtx); 1672 goto err; 1673 } 1674 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1675 mutex_exit(&p->mlp_mtx); 1676 goto err; 1677 } 1678 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1679 mutex_exit(&p->mlp_mtx); 1680 goto err; 1681 } 1682 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1683 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1684 mutex_exit(&p->mlp_mtx); 1685 goto err; 1686 } 1687 1688 mutex_exit(&p->mlp_mtx); 1689 } 1690 1691 for (i = 0; i < mlxp->mlx_nports; ++i) { 1692 p = &mlxp->mlx_ports[i]; 1693 mutex_enter(&p->mlp_mtx); 1694 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1695 KM_SLEEP)); 1696 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1697 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 
1698 1699 mutex_enter(&ft->mlft_mtx); 1700 1701 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1702 ft->mlft_port = p; 1703 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1704 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1705 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1706 ft->mlft_nents = (1 << ft->mlft_entshift); 1707 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1708 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1709 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1710 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1711 1712 for (j = 0; j < ft->mlft_nents; ++j) { 1713 ft->mlft_ent[j].mlfe_table = ft; 1714 ft->mlft_ent[j].mlfe_index = j; 1715 } 1716 1717 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1718 mutex_exit(&ft->mlft_mtx); 1719 mutex_exit(&p->mlp_mtx); 1720 goto err; 1721 } 1722 1723 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1724 mutex_exit(&ft->mlft_mtx); 1725 mutex_exit(&p->mlp_mtx); 1726 goto err; 1727 } 1728 1729 /* 1730 * We match broadcast at the top of the root flow table, then 1731 * all multicast/unicast MACs, then the promisc entry is down 1732 * the very bottom. 1733 * 1734 * This way when promisc is on, that entry simply catches any 1735 * remaining traffic that earlier flows haven't matched. 1736 */ 1737 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1738 list_insert_tail(&ft->mlft_groups, fg); 1739 fg->mlfg_table = ft; 1740 fg->mlfg_size = 1; 1741 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1742 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1743 mutex_exit(&ft->mlft_mtx); 1744 mutex_exit(&p->mlp_mtx); 1745 goto err; 1746 } 1747 p->mlp_bcast = fg; 1748 fe = list_head(&fg->mlfg_entries); 1749 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1750 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1751 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1752 1753 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1754 list_insert_tail(&ft->mlft_groups, fg); 1755 fg->mlfg_table = ft; 1756 fg->mlfg_size = ft->mlft_nents - 2; 1757 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1758 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1759 mutex_exit(&ft->mlft_mtx); 1760 mutex_exit(&p->mlp_mtx); 1761 goto err; 1762 } 1763 p->mlp_umcast = fg; 1764 1765 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1766 list_insert_tail(&ft->mlft_groups, fg); 1767 fg->mlfg_table = ft; 1768 fg->mlfg_size = 1; 1769 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1770 mutex_exit(&ft->mlft_mtx); 1771 mutex_exit(&p->mlp_mtx); 1772 goto err; 1773 } 1774 p->mlp_promisc = fg; 1775 fe = list_head(&fg->mlfg_entries); 1776 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1777 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1778 1779 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1780 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1781 mlfe_dmac_entry)); 1782 1783 mutex_exit(&ft->mlft_mtx); 1784 mutex_exit(&p->mlp_mtx); 1785 } 1786 1787 return (B_TRUE); 1788 1789 err: 1790 mlxcx_teardown_ports(mlxp); 1791 return (B_FALSE); 1792 } 1793 1794 void 1795 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1796 { 1797 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1798 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1799 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1800 mlxcx_flow_entry_t *fe; 1801 mlxcx_group_vlan_t *v; 1802 1803 ASSERT(mutex_owned(&g->mlg_mtx)); 1804 1805 mutex_enter(&ft->mlft_mtx); 1806 1807 if (!list_is_empty(&g->mlg_rx_vlans)) { 1808 fe = list_head(&dfg->mlfg_entries); 1809 (void) 
	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
		fe = v->mlgv_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);
		ASSERT3P(fe->mlfe_group, ==, fg);
		kmem_free(v, sizeof (mlxcx_group_vlan_t));

		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}

boolean_t
mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    boolean_t tagged, uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_remove(&g->mlg_rx_vlans, v);

	/*
	 * If this is the last VLAN entry, we have to go back to accepting
	 * any VLAN (which means re-enabling the default entry).
	 *
	 * Do this before we remove the flow entry for the last specific
	 * VLAN so that we don't lose any traffic in the transition.
	 */
	if (list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			list_insert_tail(&g->mlg_rx_vlans, v);
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
	}

	fe = v->mlgv_fe;
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT3P(fe->mlfe_group, ==, fg);

	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
		list_insert_tail(&g->mlg_rx_vlans, v);
		fe = list_head(&dfg->mlfg_entries);
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	kmem_free(v, sizeof (mlxcx_group_vlan_t));

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}

boolean_t
mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
    uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;
	boolean_t first = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			mutex_exit(&ft->mlft_mtx);
			return (B_TRUE);
		}
	}
	if (list_is_empty(&g->mlg_rx_vlans))
		first = B_TRUE;

	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
	    fe = list_next(&fg->mlfg_entries, fe)) {
		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
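
	/*
	 * Claim the free entry we found in the VLAN flow group and program
	 * it to match this VID (or untagged traffic when tagged is false).
	 */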
	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
	v->mlgv_fe = fe;
	v->mlgv_tagged = tagged;
	v->mlgv_vid = vid;

	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
	fe->mlfe_vid = vid;
	if (tagged) {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
	} else {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
	}

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		kmem_free(v, sizeof (mlxcx_group_vlan_t));
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_insert_tail(&g->mlg_rx_vlans, v);

	/*
	 * If the vlan list was empty for this group before adding this one,
	 * then we no longer want the "default" entry to allow all VLANs
	 * through.
	 */
	if (first) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	}

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}

void
mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, *ngm;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_first(&group->mlg_rx_macs);
	for (; gm != NULL; gm = ngm) {
		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);

		ASSERT3P(gm->mlgm_group, ==, group);
		fe = gm->mlgm_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);

		avl_remove(&group->mlg_rx_macs, gm);
		list_remove(&fe->mlfe_ring_groups, gm);
		kmem_free(gm, sizeof (mlxcx_group_mac_t));

		fe->mlfe_ndest = 0;
		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
			    gm->mlgm_group->mlg_rx_vlan_ft;
		}
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		if (fe->mlfe_ndest > 0) {
			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
			continue;
		}

		/*
		 * There are no more ring groups left for this MAC (it wasn't
		 * attached to any other groups since ndest == 0), so clean up
		 * its flow entry.
		 */
		avl_remove(&port->mlp_dmac_fe, fe);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		list_destroy(&fe->mlfe_ring_groups);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}

boolean_t
mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, probe;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
	if (gm == NULL) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
	ASSERT3P(gm->mlgm_group, ==, group);
	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));

	fe = gm->mlgm_fe;
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));

	list_remove(&fe->mlfe_ring_groups, gm);
	avl_remove(&group->mlg_rx_macs, gm);
	kmem_free(gm, sizeof (mlxcx_group_mac_t));

	fe->mlfe_ndest = 0;
	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
		    gm->mlgm_group->mlg_rx_vlan_ft;
	}
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (fe->mlfe_ndest > 0) {
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_TRUE);
	}

	/*
	 * There are no more ring groups left for this MAC (it wasn't attached
	 * to any other groups since ndest == 0), so clean up its flow entry.
	 */
	avl_remove(&port->mlp_dmac_fe, fe);
	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	list_destroy(&fe->mlfe_ring_groups);

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}

boolean_t
mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe, probe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));

	mutex_enter(&ft->mlft_mtx);

	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);

	if (fe == NULL) {
		fg = port->mlp_umcast;
		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
		    fe = list_next(&fg->mlfg_entries, fe)) {
			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));

		avl_add(&port->mlp_dmac_fe, fe);
	}

	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		if (--fe->mlfe_ndest == 0) {
			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
	gm->mlgm_group = group;
	gm->mlgm_fe = fe;
	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
	avl_add(&group->mlg_rx_macs, gm);
	list_insert_tail(&fe->mlfe_ring_groups, gm);

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}

boolean_t
mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
    mlxcx_flow_group_t *fg)
{
	mlxcx_flow_entry_t *fe;
	uint_t i, idx;

	ASSERT(mutex_owned(&ft->mlft_mtx));
	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
	ASSERT3P(fg->mlfg_table, ==, ft);

	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
		return (B_FALSE);
	fg->mlfg_start_idx = ft->mlft_next_ent;

	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
		return (B_FALSE);
	}

	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
	for (i = 0; i < fg->mlfg_size; ++i) {
		idx = fg->mlfg_start_idx + i;
		fe = &ft->mlft_ent[idx];
		fe->mlfe_group = fg;
		list_insert_tail(&fg->mlfg_entries, fe);
	}
	fg->mlfg_avail = fg->mlfg_size;
	ft->mlft_next_ent += fg->mlfg_size;

	return (B_TRUE);
}

static boolean_t
mlxcx_setup_eq0(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0];

	mutex_enter(&mleq->mleq_mtx);
	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
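	/*
	 * EQ 0 is subscribed to control and error events only (page
	 * requests, port and module state changes, queue errors and so
	 * on); completion events are handled by the EQs set up later in
	 * mlxcx_setup_eqs().
	 */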
	mleq->mleq_mlx = mlxp;
	mleq->mleq_uar = &mlxp->mlx_uar;
	mleq->mleq_events =
	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
	    (1ULL << MLXCX_EVENT_PORT_STATE) |
	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
	    (1ULL << MLXCX_EVENT_LAST_WQE) |
	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST);
	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) {
		/*
		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
		 * eq_rele_dma
		 */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);
	return (B_TRUE);
}

int
mlxcx_cq_compare(const void *arg0, const void *arg1)
{
	const mlxcx_completion_queue_t *left = arg0;
	const mlxcx_completion_queue_t *right = arg1;

	if (left->mlcq_num < right->mlcq_num) {
		return (-1);
	}
	if (left->mlcq_num > right->mlcq_num) {
		return (1);
	}
	return (0);
}

static boolean_t
mlxcx_setup_eqs(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_event_queue_t *mleq;

	ASSERT3S(mlxp->mlx_intr_count, >, 0);

	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_uar = &mlxp->mlx_uar;
		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
			/* mlxcx_teardown() will handle calling eq_rele_dma */
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
		    !mlxcx_cmd_set_int_mod(mlxp, i,
		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mlxcx_arm_eq(mlxp, mleq);
		mutex_exit(&mleq->mleq_mtx);
	}

	mlxp->mlx_next_eq = 1;

	return (B_TRUE);
}

/*
 * Snapshot all of the hardware capabilities that we care about and then modify
 * the HCA capabilities to get things moving.
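 *
 * Failure to query an individual capability set is only warned about
 * here; the specific checks further down decide whether the device is
 * actually usable.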
 */
static boolean_t
mlxcx_init_caps(mlxcx_t *mlxp)
{
	mlxcx_caps_t *c;

	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
	}

	/*
	 * Check the caps meet our requirements.
	 */
	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;

	if (gen->mlcap_general_log_pg_sz != 12) {
		mlxcx_warn(mlxp, "!hardware has page size != 4k "
		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
		goto err;
	}
	if (gen->mlcap_general_cqe_version != 1) {
		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
		goto err;
	}
	if (gen->mlcap_general_port_type !=
	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
		goto err;
	}
	mlxp->mlx_nports = gen->mlcap_general_num_ports;
	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));

	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);

	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
	    MLXCX_ETH_CAP_CSUM_CAP);
	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);

	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
	if (c->mlc_max_lso_size == 1) {
		c->mlc_max_lso_size = 0;
		c->mlc_lso = B_FALSE;
	} else {
		c->mlc_lso = B_TRUE;
	}

	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));

	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
		goto err;
	}
	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
		    "flow table entries");
		goto err;
	}

	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_log_max_ft_size;
	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);

	return (B_TRUE);

err:
	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
	return (B_FALSE);
}

static int
mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	mlxcx_t *mlxp;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	mlxp = ddi_get_driver_private(dip);
	if (mlxp == NULL) {
		mlxcx_warn(NULL, "asked to detach, but missing instance "
		    "private data");
		return (DDI_FAILURE);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
	}

	mlxcx_teardown(mlxp);
	return (DDI_SUCCESS);
}

static size_t
mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
{
	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
	    mlxp->mlx_props.mldp_rx_ngroups_small;
	size_t tirlim, flowlim, gflowlim;

	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
	if (tirlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on number of TIRs available", tirlim);
		ngroups = tirlim;
	}

	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max size of RX flow tables", flowlim);
		ngroups = flowlim;
	}

	do {
		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
		if (gflowlim < ngroups) {
			mlxcx_note(mlxp, "limiting number of rx groups to %u "
			    "based on max total RX flows", gflowlim);
			--ngroups;
		}
	} while (gflowlim < ngroups);

	return (ngroups);
}

static int
mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	mlxcx_t *mlxp;
	uint_t i;
	int inst, ret;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	inst = ddi_get_instance(dip);
	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
	if (ret != 0)
		return (ret);

	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
	if (mlxp == NULL)
		return (DDI_FAILURE);
	mlxp->mlx_dip = dip;
	mlxp->mlx_inst = inst;
	ddi_set_driver_private(dip, mlxp);

	mlxcx_load_props(mlxp);

	mlxcx_fm_init(mlxp);
	mlxp->mlx_attach |= MLXCX_ATTACH_FM;

	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to initialize PCI config space");
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;

	if (!mlxcx_regs_map(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;

	if (!mlxcx_cmd_queue_init(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;

	if (!mlxcx_cmd_enable_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;

	if (!mlxcx_check_issi(mlxp)) {
		goto err;
	}

	/*
	 * We have to get our interrupts now so we know what priority to
	 * create pagemtx with.
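	 *
	 * The other mutexes created during attach use this same priority.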
	 */
	if (!mlxcx_intr_setup(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;

	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;

	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
		goto err;
	}

	if (!mlxcx_init_caps(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;

	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
		goto err;
	}

	if (!mlxcx_cmd_init_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;

	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
		goto err;
	}

	/*
	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
	 * doorbells.
	 */
	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
		goto err;
	}
	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;

	/*
	 * Set up event queue #0 -- it's special and only handles control
	 * type events, like PAGE_REQUEST (which we will probably get during
	 * the commands below).
	 *
	 * This will enable and arm the interrupt on EQ 0, too.
	 */
	if (!mlxcx_setup_eq0(mlxp)) {
		goto err;
	}

	/*
	 * Allocate a protection and transport domain. These don't really do
	 * anything for us (they're IB concepts), but we need to give their
	 * ID numbers in other commands.
	 */
	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
		goto err;
	}
	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
		goto err;
	}
	/*
	 * Fetch the "reserved" lkey that lets us give linear addresses in
	 * work queue entries, rather than having to mess with the NIC's
	 * internal MMU.
	 */
	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
		goto err;
	}

	/*
	 * Query our port information and current state, populate the
	 * mlxcx_port_t structs.
	 *
	 * This also sets up the root flow tables and flow groups.
	 */
	if (!mlxcx_setup_ports(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;

	/*
	 * Set up, enable and arm the rest of the interrupt EQs which will
	 * service events from CQs.
	 *
	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
	 * cleaned up.
	 */
	if (!mlxcx_setup_eqs(mlxp)) {
		goto err;
	}

	/* Completion queues */
	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;

	/* Work queues (send queues, receive queues) */
	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
	    offsetof(mlxcx_work_queue_t, mlwq_entry));
	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;

	/* Set up periodic fault check timers which check the queue states */
	if (!mlxcx_setup_checktimers(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;

	/*
	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
	 * "groups" we advertise to MAC.
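	 *
	 * The RX group count is derived from the hardware limits captured
	 * in mlxcx_init_caps(); the TX group count comes straight from the
	 * mldp_tx_ngroups property.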
	 */
	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);

	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);

	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;

	/*
	 * Sets up the free/busy buffers list for keeping track of packet
	 * buffers.
	 */
	if (!mlxcx_setup_bufs(mlxp))
		goto err;
	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;

	/*
	 * Before we tell MAC about our rings/groups, we need to do enough
	 * setup on them to be sure about the numbers and configuration that
	 * we have. This will do basically everything short of allocating
	 * packet buffers and starting the rings up.
	 */
	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
			goto err;
	}
	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
			goto err;
	}

	/*
	 * Finally, tell MAC that we exist!
	 */
	if (!mlxcx_register_mac(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;

	return (DDI_SUCCESS);

err:
	mlxcx_teardown(mlxp);
	return (DDI_FAILURE);
}

static struct cb_ops mlxcx_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops mlxcx_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = mlxcx_attach,
	.devo_detach = mlxcx_detach,
	.devo_reset = nodev,
	.devo_power = ddi_power,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &mlxcx_cb_ops
};

static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};

static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
	if (ret != 0) {
		return (ret);
	}

	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);

	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
		mac_fini_ops(&mlxcx_dev_ops);
		ddi_soft_state_fini(&mlxcx_softstate);
		return (ret);
	}

	return (DDI_SUCCESS);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&mlxcx_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
		return (ret);
	}

	mac_fini_ops(&mlxcx_dev_ops);

	ddi_soft_state_fini(&mlxcx_softstate);

	return (DDI_SUCCESS);
}