1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2020, The University of Queensland 14 * Copyright (c) 2018, Joyent, Inc. 15 * Copyright 2020 RackTop Systems, Inc. 16 */ 17 18 /* 19 * Mellanox Connect-X 4/5/6 driver. 20 */ 21 22 /* 23 * The PRM for this family of parts is freely available, and can be found at: 24 * https://www.mellanox.com/related-docs/user_manuals/ \ 25 * Ethernet_Adapters_Programming_Manual.pdf 26 */ 27 /* 28 * ConnectX glossary 29 * ----------------- 30 * 31 * WR Work Request: something we've asked the hardware to do by 32 * creating a Work Queue Entry (WQE), e.g. send or recv a packet 33 * 34 * WQE Work Queue Entry: a descriptor on a work queue descriptor ring 35 * 36 * WQ Work Queue: a descriptor ring that we can place WQEs on, usually 37 * either a Send Queue (SQ) or Receive Queue (RQ). Different WQ 38 * types have different WQE structures, different commands for 39 * creating and destroying them, etc., but share a common context 40 * structure, counter setup and state graph. 41 * SQ Send Queue, a specific type of WQ that sends packets 42 * RQ Receive Queue, a specific type of WQ that receives packets 43 * 44 * CQ Completion Queue: completions of WRs from a WQ are reported to 45 * one of these, as a CQE on its entry ring. 46 * CQE Completion Queue Entry: an entry in a CQ ring. Contains error 47 * info, as well as packet size, the ID of the WQ, and the index 48 * of the WQE which completed. Does not contain any packet data. 49 * 50 * EQ Event Queue: a ring of event structs from the hardware informing 51 * us when particular events happen. Many events can point at 52 * a particular CQ which we should then go look at. 53 * EQE Event Queue Entry: an entry on the EQ ring 54 * 55 * UAR User Access Region, a page of the device's PCI BAR which is 56 * tied to particular EQ/CQ/WQ sets and contains doorbells to 57 * ring to arm them for interrupts or wake them up for new work 58 * 59 * RQT RQ Table, a collection of indexed RQs used to refer to the group 60 * as a single unit (e.g. for hashing/RSS). 61 * 62 * TIR Transport Interface Receive, a bucket of resources for the 63 * reception of packets. TIRs have to point at either a single RQ 64 * or a table of RQs (RQT). They then serve as a target for flow 65 * table entries (FEs). TIRs that point at an RQT also contain the 66 * settings for hashing for RSS. 67 * 68 * TIS Transport Interface Send, a bucket of resources associated with 69 * the transmission of packets. In particular, the temporary 70 * resources used for LSO internally in the card are accounted to 71 * a TIS. 72 * 73 * FT Flow Table, a collection of FEs and FGs that can be referred to 74 * as a single entity (e.g. used as a target from another flow 75 * entry or set as the "root" table to handle incoming or outgoing 76 * packets). Packets arriving at a FT are matched against the 77 * FEs in the table until either one matches with a terminating 78 * action or all FEs are exhausted (it's first-match-wins but with 79 * some actions that are non-terminal, like counting actions). 80 * 81 * FG Flow Group, a group of FEs which share a common "mask" (i.e.
82 * they match on the same attributes of packets coming into the 83 * flow). 84 * 85 * FE Flow Entry, an individual set of values to match against 86 * packets entering the flow table, combined with an action to 87 * take upon a successful match. The action we use most is 88 * "forward", which sends the packets to a TIR or another flow 89 * table and then stops further processing within the FE's FT. 90 * 91 * lkey/mkey A reference to something similar to a page table but in the 92 * device's internal onboard MMU. Since Connect-X parts double as 93 * IB cards (lots of RDMA) they have extensive onboard memory mgmt 94 * features which we try very hard not to use. For our WQEs we use 95 * the "reserved" lkey, which is a special value which indicates 96 * that addresses we give are linear addresses and should not be 97 * translated. 98 * 99 * PD Protection Domain, an IB concept. We have to allocate one to 100 * provide as a parameter for new WQs, but we don't do anything 101 * with it. 102 * 103 * TDOM/TD Transport Domain, an IB concept. We allocate one in order to 104 * provide it as a parameter to TIR/TIS creation, but we don't do 105 * anything with it. 106 */ 107 /* 108 * 109 * Data flow overview 110 * ------------------ 111 * 112 * This driver is a MAC ring-enabled driver which maps rings to send and recv 113 * queues in hardware on the device. 114 * 115 * Each SQ and RQ is set up to report to its own individual CQ, to ensure 116 * sufficient space, and simplify the logic needed to work out which buffer 117 * was completed. 118 * 119 * The CQs are then round-robin allocated onto EQs, of which we set up one per 120 * interrupt that the system gives us for the device. Normally this means we 121 * have 8 EQs. 122 * 123 * When we have >= 8 EQs available, we try to allocate only RX or only TX 124 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion. 125 * 126 * EQ #0 is reserved for all event types other than completion events, and has 127 * no CQs associated with it at any time. EQs #1 and upwards are only used for 128 * handling CQ completion events. 129 * 130 * +------+ +------+ +------+ +---------+ 131 * | SQ 0 |---->| CQ 0 |-----+ | EQ 0 |------> | MSI-X 0 | mlxcx_intr_0 132 * +------+ +------+ | +------+ +---------+ 133 * | 134 * +------+ +------+ | 135 * | SQ 1 |---->| CQ 1 |---+ | +------+ 136 * +------+ +------+ | +---> | | 137 * | | | 138 * +------+ +------+ | | EQ 1 | +---------+ 139 * | SQ 2 |---->| CQ 2 |---------> | |------> | MSI-X 1 | mlxcx_intr_n 140 * +------+ +------+ | +---> | | +---------+ 141 * | | +------+ 142 * | | 143 * ... | | 144 * | | +------+ 145 * +------+ +------+ +-----> | | 146 * | RQ 0 |---->| CQ 3 |---------> | | +---------+ 147 * +------+ +------+ | | EQ 2 |------> | MSI-X 2 | mlxcx_intr_n 148 * | | | +---------+ 149 * +------+ +------+ | +-> | | 150 * | RQ 1 |---->| CQ 4 |-----+ | +------+ 151 * +------+ +------+ | 152 * | .... 153 * +------+ +------+ | 154 * | RQ 2 |---->| CQ 5 |-------+ 155 * +------+ +------+ 156 * 157 * ... (note this diagram does not show RX-only or TX-only EQs) 158 * 159 * For TX, we advertise all of the SQs we create as plain rings to MAC with 160 * no TX groups. This puts MAC in "virtual group" mode where it will allocate 161 * and use the rings as it sees fit. 162 * 163 * For RX, we advertise actual groups in order to make use of hardware 164 * classification. 165 * 166 * The hardware classification we use is based around Flow Tables, and we 167 * currently ignore all of the eswitch features of the card. 
The NIC VPORT 168 * is always set to promisc mode so that the eswitch sends us all of the 169 * traffic that arrives on the NIC, and we use flow entries to manage 170 * everything. 171 * 172 * We use 2 layers of flow tables for classification: traffic arrives at the 173 * root RX flow table which contains MAC address filters. Those then send 174 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN 175 * presence and VID filters. 176 * 177 * Since these parts only support doing RSS hashing on a single protocol at a 178 * time, we have to use a third layer of flow tables as well to break traffic 179 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc) 180 * so that it can be sent to the appropriate TIR for hashing. 181 * 182 * Incoming packets 183 * + +---------+ +---------+ 184 * | +->| group 0 | | group 0 | 185 * | | | vlan ft | +-->| hash ft | 186 * v | | L1 | | | L2 | 187 * +----+----+ | +---------+ | +---------+ +-----+ +-----+------+ 188 * | eswitch | | | | | | TCPv6 |--->| TIR |--->| | RQ0 | 189 * +----+----+ | | | | +---------+ +-----+ | +------+ 190 * | | | | | | UDPv6 |--->| TIR |--->| | RQ1 | 191 * | | | | | +---------+ +-----+ | +------+ 192 * | | | | | | TCPv4 |--->| TIR |--->| | RQ2 | 193 * v | | | | +---------+ +-----+ | RQT +------+ 194 * +----+----+ | +---------+ | | UDPv4 |--->| TIR |--->| | ... | 195 * | root rx | | | default |--+ +---------+ +-----+ | | | 196 * | flow tb | | +---------+ | | IPv6 |--->| TIR |--->| | | 197 * | L0 | | | promisc |--+ +---------+ +-----+ | | | 198 * +---------+ | +---------+ ^ | IPv4 |--->| TIR |--->| | | 199 * | bcast |---|---------------+ +---------+ +-----+ +-----+------+ 200 * +---------+ | ^ | other |-+ 201 * | MAC 0 |---+ | +---------+ | +-----+ +-----+ 202 * +---------+ | +->| TIR |--->| RQ0 | 203 * | MAC 1 |-+ | +-----+ +-----+ 204 * +---------+ | +---------------+ 205 * | MAC 2 |-+ | ^ 206 * +---------+ | | | 207 * | MAC 3 |-+ | +---------+ | +---------+ 208 * +---------+ | | | group 1 | | | group 1 | 209 * | ..... | +--->| vlan ft | | +>| hash ft | 210 * | | | | L1 | | | | L2 | 211 * +---------+ | +---------+ | | +---------+ +-----+ +-----+------+ 212 * | promisc |---+ | VLAN 0 |----+ | TCPv6 |--->| TIR |--->| | RQ3 | 213 * +---------+ +---------+ | +---------+ +-----+ | +------+ 214 * | ..... | | | UDPv6 |--->| TIR |--->| | RQ4 | 215 * | | | +---------+ +-----+ | +------+ 216 * | | | | TCPv4 |--->| TIR |--->| | RQ5 | 217 * | | | +---------+ +-----+ | RQT +------+ 218 * +---------+ | | UDPv4 |--->| TIR |--->| | ... | 219 * | | | +---------+ +-----+ | | | 220 * +---------+ | | IPv6 |--->| TIR |--->| | | 221 * | promisc |--+ +---------+ +-----+ | | | 222 * +---------+ | IPv4 |--->| TIR |--->| | | 223 * +---------+ +-----+ +-----+------+ 224 * | other |-+ 225 * +---------+ | 226 * ....... | +-----+ +-----+ 227 * +->| TIR |--->| RQ3 | 228 * +-----+ +-----+ 229 * 230 * Note that the "promisc" flow entries are only set/enabled when promisc 231 * mode is enabled for the NIC. All promisc flow entries point directly at 232 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0, 233 * the "default group" in MAC). 234 * 235 * The "default" entry in the L1 VLAN filter flow tables is used when there 236 * are no VLANs set for the group, to accept any traffic regardless of tag. It 237 * is deleted as soon as a VLAN filter is added (and re-instated if the 238 * last VLAN filter is removed). 
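 *
 * As a worked example of the RX classification path above (the MAC, VLAN,
 * group and RQ numbers are illustrative, taken from the diagram): a unicast
 * TCPv4 frame for MAC 1, tagged with VLAN 5, where MAC 1 and a VLAN 5
 * filter belong to group 1, would
 *
 *  - match the "MAC 1" FE in the root (L0) RX flow table and be forwarded
 *    to group 1's L1 VLAN filter table;
 *  - match the VLAN 5 FE there (or the "default" FE, if the group has no
 *    VLAN filters set) and be forwarded to group 1's L2 hash table;
 *  - match the "TCPv4" FE and be forwarded to that protocol's TIR, which
 *    RSS-hashes it across the group's RQT onto one of RQ3/RQ4/RQ5/...
 *
 * Traffic which matches none of the MAC FEs in the root table is only
 * delivered if the promisc FE is enabled, in which case it lands on
 * group 0's hash table (the "default group" in MAC, as noted above).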
239 * 240 * The actual descriptor ring structures for RX on Connect-X4 don't contain any 241 * space for packet data (they're a collection of scatter pointers only). TX 242 * descriptors contain some space for "inline headers" (and the card requires 243 * us to put at least the L2 Ethernet headers there for the eswitch to look at) 244 * but all the rest of the data comes from the gather pointers. 245 * 246 * When we get completions back they simply contain the ring index number of 247 * the WR (work request) which completed. So, we manage the buffers for actual 248 * packet data completely independently of the descriptors in this driver. When 249 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer 250 * with the WQE index that we put it at, and therefore don't have to look at 251 * the original descriptor at all when handling completions. 252 * 253 * For RX, we create sufficient packet data buffers to fill 150% of the 254 * available descriptors for each ring. These are all pre-set-up for DMA and 255 * have an mblk_t associated with them (with desballoc()). 256 * 257 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is 258 * large enough), or we copy it into a pre-allocated buffer set up in the same 259 * way as for RX. 260 */ 261 262 /* 263 * Buffer lifecycle: RX 264 * -------------------- 265 * 266 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty 267 * straightforward. 268 * 269 * It is created (and has all its memory allocated) at the time of starting up 270 * the RX ring it belongs to. Then it is placed on the "free" list in the 271 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants 272 * more buffers to add to the RQ, it takes one off and marks it as "on WQ" 273 * before making a WQE for it. 274 * 275 * After a completion event occurs, the packet is either discarded (and the 276 * buffer_t returned to the free list), or it is readied for loaning to MAC 277 * and placed on the "loaned" list in the mlxcx_buffer_shard_t. 278 * 279 * Once MAC and the rest of the system have finished with the packet, they call 280 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point 281 * the fate of the buffer_t is determined by the state of the 282 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t 283 * will be returned to the free list, potentially to be recycled and used 284 * again. But if the shard is draining (e.g. after a ring stop) there will be 285 * no recycling and the buffer_t is immediately destroyed. 286 * 287 * At detach/teardown time, buffers are only ever destroyed from the free list. 288 * 289 * 290 * + 291 * | 292 * | mlxcx_buf_create 293 * | 294 * v 295 * +----+----+ 296 * | created | 297 * +----+----+ +------+ 298 * | | dead | 299 * | +------+ 300 * | mlxcx_buf_return ^ 301 * | | 302 * v | mlxcx_buf_destroy 303 * mlxcx_buf_destroy +----+----+ +-----------+ | 304 * +---------| free |<------no-| draining?
|-yes-+ 305 * | +----+----+ +-----------+ 306 * | | ^ 307 * | | | 308 * v | mlxcx_buf_take | mlxcx_buf_return 309 * +---+--+ v | 310 * | dead | +---+---+ | 311 * +------+ | on WQ |- - - - - - - - >O 312 * +---+---+ ^ 313 * | | 314 * | | 315 * | mlxcx_buf_loan | mlxcx_buf_mp_return 316 * v | 317 * +-------+--------+ | 318 * | on loan to MAC |----------->O 319 * +----------------+ freemsg() 320 * 321 */ 322 323 /* 324 * Buffer lifecycle: TX 325 * -------------------- 326 * 327 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and 328 * "foreign" buffers. 329 * 330 * The former have their memory allocated and DMA bound by this driver, while 331 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is 332 * not owned by us, though we do DMA bind it (and take responsibility for 333 * un-binding it when we're done with them). 334 * 335 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each 336 * SQ. Thus, there is a separate free list and mutex for each kind. 337 * 338 * Since a TX packet might consist of multiple mblks, we translate each mblk 339 * into exactly one buffer_t. The buffer_ts are chained together in the same 340 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t. 341 * 342 * Each chain of TX buffers may consist of foreign or driver buffers, in any 343 * mixture. 344 * 345 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes 346 * it from the rest of the chain buffers. 347 * 348 * TX buffer chains are always returned to the free list by 349 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and 350 * freeing all of the members. 351 * 352 * We only call freemsg() once, on the head of the TX buffer chain's original 353 * mblk. This is true whether we copied it or bound it in a foreign buffer. 354 */ 355 356 /* 357 * Startup and command interface 358 * ----------------------------- 359 * 360 * The command interface is the primary way in which we give control orders to 361 * the hardware (e.g. actions like "create this queue" or "delete this flow 362 * entry"). The command interface is never used to transmit or receive packets 363 * -- that takes place only on the queues that are set up through it. 364 * 365 * In mlxcx_cmd.c we implement our use of the command interface on top of a 366 * simple taskq. Since it's not performance critical, we busy-wait on command 367 * completions and only process a single command at a time. 368 * 369 * If this becomes a problem later we can wire command completions up to EQ 0 370 * once we have interrupts running. 371 * 372 * The startup/attach process for this card involves a bunch of different steps 373 * which are summarised pretty well in the PRM. We have to send a number of 374 * commands which do different things to start the card up, give it some pages 375 * of our own memory for it to use, then start creating all the entities that 376 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs 377 * and TDoms. 378 */ 379 380 /* 381 * UARs 382 * ---- 383 * 384 * The pages of the PCI BAR other than the first few are reserved for use as 385 * "UAR" sections in this device. Each UAR section can be used as a set of 386 * doorbells for our queues. 387 * 388 * Currently we just make one single UAR for all of our queues. It doesn't 389 * seem to be a major limitation yet. 
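 *
 * To make the doorbell idea concrete: ringing a doorbell is nothing more
 * than a 32- or 64-bit store at a fixed offset within the UAR page, which
 * this file wraps up as mlxcx_uar_put32()/mlxcx_uar_put64() further down.
 * A minimal sketch (doorbell_offset and doorbell_val are placeholders here,
 * not the real register layout; the EQ/CQ/WQ code does the real writes)
 * looks like:
 *
 *	mlxcx_uar_put32(mlxp, &mlxp->mlx_uar, doorbell_offset, doorbell_val);
 *
 * and is all that's needed to arm a queue for interrupts or to tell the
 * hardware that new work has been posted on it.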
390 * 391 * When we're sending packets through an SQ, the PRM is not awfully clear about 392 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers 393 * (it's clear on the pattern of alternation you're expected to use between 394 * even and odd for Blueflame sends, but not for regular doorbells). 395 * 396 * Currently we don't do the even-odd alternating pattern for ordinary 397 * doorbells, and we don't use Blueflame at all. This seems to work fine, at 398 * least on Connect-X4 Lx. 399 */ 400 401 /* 402 * Lock ordering 403 * ------------- 404 * 405 * Interrupt side: 406 * 407 * - mleq_mtx 408 * - mlcq_mtx 409 * - mlcq_bufbmtx 410 * - mlwq_mtx 411 * - mlbs_mtx 412 * - mlp_mtx 413 * 414 * GLD side: 415 * 416 * - mlp_mtx 417 * - mlg_mtx 418 * - mlg_*.mlft_mtx 419 * - mlp_*.mlft_mtx 420 * - mlwq_mtx 421 * - mlbs_mtx 422 * - mlcq_bufbmtx 423 * - mleq_mtx 424 * - mlcq_mtx 425 * 426 */ 427 428 #include <sys/modctl.h> 429 #include <sys/conf.h> 430 #include <sys/devops.h> 431 #include <sys/sysmacros.h> 432 #include <sys/time.h> 433 434 #include <sys/mac_provider.h> 435 436 #include <mlxcx.h> 437 438 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP); 439 440 #define MLXCX_MODULE_NAME "mlxcx" 441 /* 442 * We give this to the firmware, so it has to be in a fixed format that it 443 * understands. 444 */ 445 #define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000" 446 447 /* 448 * Firmware may take a while to reclaim pages. Try a set number of times. 449 */ 450 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */ 451 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */ 452 453 static void *mlxcx_softstate; 454 455 /* 456 * Fault detection thresholds. 457 */ 458 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; 459 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; 460 461 static void 462 mlxcx_load_prop_defaults(mlxcx_t *mlxp) 463 { 464 mlxcx_drv_props_t *p = &mlxp->mlx_props; 465 mlxcx_port_t *port = &mlxp->mlx_ports[0]; 466 467 VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); 468 VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); 469 470 /* 471 * Currently we have different queue size defaults for two 472 * categories of queues. One set for devices which support a 473 * maximum speed of 10Gb/s, and another for those above that. 474 */ 475 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 476 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { 477 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 478 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 479 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 480 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 481 MLXCX_PROTO_10G)) != 0) { 482 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 483 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 484 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 485 } else { 486 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 487 "recognize. Proto: 0x%x", port->mlp_max_proto); 488 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 489 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 490 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 491 } 492 } 493 494 /* 495 * Properties which may have different defaults based on hardware 496 * characteristics.
497 */ 498 static void 499 mlxcx_load_model_props(mlxcx_t *mlxp) 500 { 501 mlxcx_drv_props_t *p = &mlxp->mlx_props; 502 503 mlxcx_load_prop_defaults(mlxp); 504 505 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 506 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 507 p->mldp_cq_size_shift_default); 508 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 509 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 510 p->mldp_sq_size_shift_default); 511 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 512 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 513 p->mldp_rq_size_shift_default); 514 } 515 516 static void 517 mlxcx_load_props(mlxcx_t *mlxp) 518 { 519 mlxcx_drv_props_t *p = &mlxp->mlx_props; 520 521 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 522 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 523 MLXCX_EQ_SIZE_SHIFT_DFLT); 524 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 525 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 526 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 527 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 528 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 529 MLXCX_CQEMOD_COUNT_DFLT); 530 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 531 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 532 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 533 534 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 535 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 536 MLXCX_TX_NGROUPS_DFLT); 537 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 538 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 539 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 540 541 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 542 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 543 MLXCX_RX_NGROUPS_LARGE_DFLT); 544 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 545 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 546 MLXCX_RX_NGROUPS_SMALL_DFLT); 547 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 548 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 549 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 550 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 551 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 552 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 553 554 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 555 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 556 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 557 558 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 559 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 560 MLXCX_TX_BIND_THRESHOLD_DFLT); 561 562 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 563 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 564 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 565 566 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 567 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 568 "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 569 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 570 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 571 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 572 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 573 mlxp->mlx_dip, DDI_PROP_CANSLEEP | 
DDI_PROP_DONTPASS, 574 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 575 576 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 577 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 578 MLXCX_RX_PER_CQ_DEFAULT); 579 580 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 581 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 582 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 583 "out of range. Defaulting to: %d. Valid values are from " 584 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 585 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 586 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 587 } 588 } 589 590 void 591 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 592 { 593 va_list ap; 594 595 va_start(ap, fmt); 596 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 597 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 598 } else { 599 vcmn_err(CE_NOTE, fmt, ap); 600 } 601 va_end(ap); 602 } 603 604 void 605 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 606 { 607 va_list ap; 608 609 va_start(ap, fmt); 610 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 611 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 612 } else { 613 vcmn_err(CE_WARN, fmt, ap); 614 } 615 va_end(ap); 616 } 617 618 void 619 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 620 { 621 va_list ap; 622 623 va_start(ap, fmt); 624 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 625 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 626 } else { 627 vcmn_err(CE_PANIC, fmt, ap); 628 } 629 va_end(ap); 630 } 631 632 uint16_t 633 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 634 { 635 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 636 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 637 } 638 639 uint32_t 640 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 641 { 642 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 643 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 644 } 645 646 uint64_t 647 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 648 { 649 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 650 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 651 } 652 653 void 654 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 655 { 656 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 657 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 658 } 659 660 void 661 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 662 { 663 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 664 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 665 } 666 667 void 668 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 669 { 670 /* 671 * The UAR is always inside the first BAR, which we mapped as 672 * mlx_regs 673 */ 674 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 675 (uintptr_t)mlxp->mlx_regs_base; 676 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 677 } 678 679 void 680 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 681 { 682 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 683 (uintptr_t)mlxp->mlx_regs_base; 684 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 685 } 686 687 static void 688 mlxcx_fm_fini(mlxcx_t *mlxp) 689 { 690 if (mlxp->mlx_fm_caps == 0) 691 return; 692 693 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 694 ddi_fm_handler_unregister(mlxp->mlx_dip); 695 696 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 697 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 698 pci_ereport_teardown(mlxp->mlx_dip); 699 700 ddi_fm_fini(mlxp->mlx_dip); 701 702 mlxp->mlx_fm_caps = 0; 703 } 704 705 void 706 mlxcx_fm_ereport(mlxcx_t *mlxp, const char 
*detail) 707 { 708 uint64_t ena; 709 char buf[FM_MAX_CLASS]; 710 711 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 712 return; 713 714 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 715 ena = fm_ena_generate(0, FM_ENA_FMT1); 716 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 717 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 718 NULL); 719 } 720 721 static int 722 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 723 { 724 /* 725 * as the driver can always deal with an error in any dma or 726 * access handle, we can just return the fme_status value. 727 */ 728 pci_ereport_post(dip, err, NULL); 729 return (err->fme_status); 730 } 731 732 static void 733 mlxcx_fm_init(mlxcx_t *mlxp) 734 { 735 ddi_iblock_cookie_t iblk; 736 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 737 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 738 739 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 740 DDI_PROP_DONTPASS, "fm_capable", def); 741 742 if (mlxp->mlx_fm_caps < 0) { 743 mlxp->mlx_fm_caps = 0; 744 } 745 mlxp->mlx_fm_caps &= def; 746 747 if (mlxp->mlx_fm_caps == 0) 748 return; 749 750 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 751 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 752 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 753 pci_ereport_setup(mlxp->mlx_dip); 754 } 755 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 756 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 757 (void *)mlxp); 758 } 759 } 760 761 static void 762 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 763 { 764 mlxcx_buffer_t *buf; 765 766 mutex_enter(&s->mlbs_mtx); 767 768 while (!list_is_empty(&s->mlbs_busy)) 769 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 770 771 while (!list_is_empty(&s->mlbs_loaned)) 772 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 773 774 while ((buf = list_head(&s->mlbs_free)) != NULL) 775 mlxcx_buf_destroy(mlxp, buf); 776 777 list_destroy(&s->mlbs_free); 778 list_destroy(&s->mlbs_busy); 779 list_destroy(&s->mlbs_loaned); 780 mutex_exit(&s->mlbs_mtx); 781 782 cv_destroy(&s->mlbs_free_nonempty); 783 mutex_destroy(&s->mlbs_mtx); 784 } 785 786 static void 787 mlxcx_teardown_bufs(mlxcx_t *mlxp) 788 { 789 mlxcx_buf_shard_t *s; 790 791 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 792 mlxcx_mlbs_teardown(mlxp, s); 793 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 794 } 795 list_destroy(&mlxp->mlx_buf_shards); 796 797 kmem_cache_destroy(mlxp->mlx_bufs_cache); 798 } 799 800 static void 801 mlxcx_teardown_pages(mlxcx_t *mlxp) 802 { 803 uint_t nzeros = 0; 804 805 mutex_enter(&mlxp->mlx_pagemtx); 806 807 while (mlxp->mlx_npages > 0) { 808 int32_t req, ret; 809 uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES]; 810 811 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 812 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 813 814 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 815 mlxcx_warn(mlxp, "hardware refused to return pages, " 816 "leaking %u remaining pages", mlxp->mlx_npages); 817 goto out; 818 } 819 820 for (int32_t i = 0; i < ret; i++) { 821 mlxcx_dev_page_t *mdp, probe; 822 bzero(&probe, sizeof (probe)); 823 probe.mxdp_pa = pas[i]; 824 825 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 826 827 if (mdp != NULL) { 828 avl_remove(&mlxp->mlx_pages, mdp); 829 mlxp->mlx_npages--; 830 mlxcx_dma_free(&mdp->mxdp_dma); 831 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 832 } else { 833 mlxcx_panic(mlxp, "hardware returned a page " 834 "with PA 0x%" PRIx64 " but we have no " 835 "record of giving out such a page", 
pas[i]); 836 } 837 } 838 839 /* 840 * If no pages were returned, note that fact. 841 */ 842 if (ret == 0) { 843 nzeros++; 844 if (nzeros > mlxcx_reclaim_tries) { 845 mlxcx_warn(mlxp, "hardware refused to return " 846 "pages, leaking %u remaining pages", 847 mlxp->mlx_npages); 848 goto out; 849 } 850 delay(drv_usectohz(mlxcx_reclaim_delay)); 851 } 852 } 853 854 avl_destroy(&mlxp->mlx_pages); 855 856 out: 857 mutex_exit(&mlxp->mlx_pagemtx); 858 mutex_destroy(&mlxp->mlx_pagemtx); 859 } 860 861 static boolean_t 862 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 863 { 864 ddi_device_acc_attr_t acc; 865 ddi_dma_attr_t attr; 866 boolean_t ret; 867 size_t sz, i; 868 869 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 870 871 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 872 mleq->mleq_nents = (1 << mleq->mleq_entshift); 873 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 874 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 875 876 mlxcx_dma_acc_attr(mlxp, &acc); 877 mlxcx_dma_queue_attr(mlxp, &attr); 878 879 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 880 B_TRUE, sz, B_TRUE); 881 if (!ret) { 882 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 883 return (B_FALSE); 884 } 885 886 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 887 888 for (i = 0; i < mleq->mleq_nents; ++i) 889 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 890 891 mleq->mleq_state |= MLXCX_EQ_ALLOC; 892 893 return (B_TRUE); 894 } 895 896 static void 897 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 898 { 899 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 900 if (mleq->mleq_state & MLXCX_EQ_CREATED) 901 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 902 903 mlxcx_dma_free(&mleq->mleq_dma); 904 mleq->mleq_ent = NULL; 905 906 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 907 } 908 909 void 910 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 911 { 912 mlxcx_flow_group_t *fg; 913 mlxcx_flow_entry_t *fe; 914 int i; 915 916 ASSERT(mutex_owned(&ft->mlft_mtx)); 917 918 for (i = ft->mlft_nents - 1; i >= 0; --i) { 919 fe = &ft->mlft_ent[i]; 920 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 921 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 922 mlxcx_panic(mlxp, "failed to delete flow " 923 "entry %u on table %u", i, 924 ft->mlft_num); 925 } 926 } 927 } 928 929 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 930 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 931 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 932 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 933 mlxcx_panic(mlxp, "failed to destroy flow " 934 "group %u", fg->mlfg_num); 935 } 936 } 937 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 938 } 939 list_destroy(&ft->mlft_groups); 940 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 941 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 942 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 943 mlxcx_panic(mlxp, "failed to destroy flow table %u", 944 ft->mlft_num); 945 } 946 } 947 kmem_free(ft->mlft_ent, ft->mlft_entsize); 948 ft->mlft_ent = NULL; 949 mutex_exit(&ft->mlft_mtx); 950 mutex_destroy(&ft->mlft_mtx); 951 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 952 } 953 954 static void 955 mlxcx_teardown_ports(mlxcx_t *mlxp) 956 { 957 uint_t i; 958 mlxcx_port_t *p; 959 mlxcx_flow_table_t *ft; 960 961 for (i = 0; i < mlxp->mlx_nports; ++i) { 962 p = &mlxp->mlx_ports[i]; 963 if (!(p->mlp_init & MLXCX_PORT_INIT)) 964 continue; 965 mutex_enter(&p->mlp_mtx); 966 if ((ft = p->mlp_rx_flow) != NULL) { 967 mutex_enter(&ft->mlft_mtx); 968 /* 969 * 
teardown_flow_table() will destroy the mutex, so 970 * we don't release it here. 971 */ 972 mlxcx_teardown_flow_table(mlxp, ft); 973 } 974 mutex_exit(&p->mlp_mtx); 975 mutex_destroy(&p->mlp_mtx); 976 p->mlp_init &= ~MLXCX_PORT_INIT; 977 } 978 979 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 980 mlxp->mlx_ports = NULL; 981 } 982 983 static void 984 mlxcx_teardown_wqs(mlxcx_t *mlxp) 985 { 986 mlxcx_work_queue_t *mlwq; 987 988 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 989 mlxcx_wq_teardown(mlxp, mlwq); 990 } 991 list_destroy(&mlxp->mlx_wqs); 992 } 993 994 static void 995 mlxcx_teardown_cqs(mlxcx_t *mlxp) 996 { 997 mlxcx_completion_queue_t *mlcq; 998 999 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 1000 mlxcx_cq_teardown(mlxp, mlcq); 1001 } 1002 list_destroy(&mlxp->mlx_cqs); 1003 } 1004 1005 static void 1006 mlxcx_teardown_eqs(mlxcx_t *mlxp) 1007 { 1008 mlxcx_event_queue_t *mleq; 1009 uint_t i; 1010 1011 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1012 mleq = &mlxp->mlx_eqs[i]; 1013 mutex_enter(&mleq->mleq_mtx); 1014 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1015 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1016 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1017 mlxcx_warn(mlxp, "failed to destroy " 1018 "event queue idx %u eqn %u", 1019 i, mleq->mleq_num); 1020 } 1021 } 1022 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1023 mlxcx_eq_rele_dma(mlxp, mleq); 1024 } 1025 mutex_exit(&mleq->mleq_mtx); 1026 } 1027 } 1028 1029 static void 1030 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1031 { 1032 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1033 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1034 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1035 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1036 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1037 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1038 } 1039 1040 static void 1041 mlxcx_teardown(mlxcx_t *mlxp) 1042 { 1043 uint_t i; 1044 dev_info_t *dip = mlxp->mlx_dip; 1045 1046 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1047 mlxcx_teardown_groups(mlxp); 1048 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1049 } 1050 1051 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1052 mlxcx_teardown_checktimers(mlxp); 1053 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1054 } 1055 1056 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1057 mlxcx_teardown_wqs(mlxp); 1058 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1059 } 1060 1061 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1062 mlxcx_teardown_cqs(mlxp); 1063 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1064 } 1065 1066 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1067 mlxcx_teardown_bufs(mlxp); 1068 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1069 } 1070 1071 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1072 mlxcx_teardown_ports(mlxp); 1073 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1074 } 1075 1076 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1077 mlxcx_teardown_eqs(mlxp); 1078 mlxcx_intr_teardown(mlxp); 1079 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1080 } 1081 1082 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1083 if (mlxp->mlx_uar.mlu_allocated) { 1084 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1085 mlxcx_warn(mlxp, "failed to release UAR"); 1086 } 1087 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1088 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1089 } 1090 if (mlxp->mlx_pd.mlpd_allocated && 1091 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1092 mlxcx_warn(mlxp, "failed to release PD"); 1093 } 1094 if (mlxp->mlx_tdom.mltd_allocated && 1095 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1096 
mlxcx_warn(mlxp, "failed to release TDOM"); 1097 } 1098 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1099 } 1100 1101 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1102 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1103 mlxcx_warn(mlxp, "failed to send teardown HCA " 1104 "command during device detach"); 1105 } 1106 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1107 } 1108 1109 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1110 mlxcx_teardown_pages(mlxp); 1111 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1112 } 1113 1114 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1115 if (!mlxcx_cmd_disable_hca(mlxp)) { 1116 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1117 "during device detach"); 1118 } 1119 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1120 } 1121 1122 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1123 mlxcx_cmd_queue_fini(mlxp); 1124 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1125 } 1126 1127 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1128 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1129 mlxp->mlx_caps = NULL; 1130 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1131 } 1132 1133 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1134 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1135 mlxp->mlx_regs_handle = NULL; 1136 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1137 } 1138 1139 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1140 pci_config_teardown(&mlxp->mlx_cfg_handle); 1141 mlxp->mlx_cfg_handle = NULL; 1142 mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1143 } 1144 1145 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1146 mlxcx_fm_fini(mlxp); 1147 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1148 } 1149 1150 VERIFY3S(mlxp->mlx_attach, ==, 0); 1151 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1152 ddi_set_driver_private(dip, NULL); 1153 } 1154 1155 static boolean_t 1156 mlxcx_regs_map(mlxcx_t *mlxp) 1157 { 1158 off_t memsize; 1159 int ret; 1160 ddi_device_acc_attr_t da; 1161 1162 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1163 DDI_SUCCESS) { 1164 mlxcx_warn(mlxp, "failed to get register set size"); 1165 return (B_FALSE); 1166 } 1167 1168 /* 1169 * All data in the main BAR is kept in big-endian even though it's a PCI 1170 * device. 
1171 */ 1172 bzero(&da, sizeof (ddi_device_acc_attr_t)); 1173 da.devacc_attr_version = DDI_DEVICE_ATTR_V0; 1174 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; 1175 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 1176 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { 1177 da.devacc_attr_access = DDI_FLAGERR_ACC; 1178 } else { 1179 da.devacc_attr_access = DDI_DEFAULT_ACC; 1180 } 1181 1182 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER, 1183 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle); 1184 1185 if (ret != DDI_SUCCESS) { 1186 mlxcx_warn(mlxp, "failed to map device registers: %d", ret); 1187 return (B_FALSE); 1188 } 1189 1190 return (B_TRUE); 1191 } 1192 1193 static boolean_t 1194 mlxcx_check_issi(mlxcx_t *mlxp) 1195 { 1196 uint32_t issi; 1197 1198 if (!mlxcx_cmd_query_issi(mlxp, &issi)) { 1199 mlxcx_warn(mlxp, "failed to get ISSI"); 1200 return (B_FALSE); 1201 } 1202 1203 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) { 1204 mlxcx_warn(mlxp, "hardware does not support software ISSI, " 1205 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI); 1206 return (B_FALSE); 1207 } 1208 1209 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) { 1210 mlxcx_warn(mlxp, "failed to set ISSI to %u", 1211 MLXCX_CURRENT_ISSI); 1212 return (B_FALSE); 1213 } 1214 1215 return (B_TRUE); 1216 } 1217 1218 boolean_t 1219 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages) 1220 { 1221 ddi_device_acc_attr_t acc; 1222 ddi_dma_attr_t attr; 1223 int32_t i; 1224 list_t plist; 1225 mlxcx_dev_page_t *mdp; 1226 const ddi_dma_cookie_t *ck; 1227 1228 /* 1229 * If there are no pages required, then we're done here. 1230 */ 1231 if (npages <= 0) { 1232 return (B_TRUE); 1233 } 1234 1235 list_create(&plist, sizeof (mlxcx_dev_page_t), 1236 offsetof(mlxcx_dev_page_t, mxdp_list)); 1237 1238 for (i = 0; i < npages; i++) { 1239 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 1240 mlxcx_dma_acc_attr(mlxp, &acc); 1241 mlxcx_dma_page_attr(mlxp, &attr); 1242 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 1243 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 1244 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 1245 npages); 1246 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1247 goto cleanup_npages; 1248 } 1249 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 1250 mdp->mxdp_pa = ck->dmac_laddress; 1251 1252 list_insert_tail(&plist, mdp); 1253 } 1254 1255 /* 1256 * Now that all of the pages have been allocated, give them to hardware 1257 * in chunks.
1258 */ 1259 while (npages > 0) { 1260 mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES]; 1261 int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages); 1262 1263 for (i = 0; i < togive; i++) { 1264 pages[i] = list_remove_head(&plist); 1265 } 1266 1267 if (!mlxcx_cmd_give_pages(mlxp, 1268 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) { 1269 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1270 "pages!", togive); 1271 for (i = 0; i < togive; i++) { 1272 list_insert_tail(&plist, pages[i]); 1273 } 1274 goto cleanup_npages; 1275 } 1276 1277 mutex_enter(&mlxp->mlx_pagemtx); 1278 for (i = 0; i < togive; i++) { 1279 avl_add(&mlxp->mlx_pages, pages[i]); 1280 } 1281 mlxp->mlx_npages += togive; 1282 mutex_exit(&mlxp->mlx_pagemtx); 1283 npages -= togive; 1284 } 1285 1286 list_destroy(&plist); 1287 1288 return (B_TRUE); 1289 1290 cleanup_npages: 1291 while ((mdp = list_remove_head(&plist)) != NULL) { 1292 mlxcx_dma_free(&mdp->mxdp_dma); 1293 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1294 } 1295 list_destroy(&plist); 1296 return (B_FALSE); 1297 } 1298 1299 static boolean_t 1300 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1301 { 1302 int32_t npages; 1303 1304 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1305 mlxcx_warn(mlxp, "failed to determine boot pages"); 1306 return (B_FALSE); 1307 } 1308 1309 return (mlxcx_give_pages(mlxp, npages)); 1310 } 1311 1312 static int 1313 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1314 { 1315 mlxcx_t *mlxp = cookie; 1316 mlxcx_buffer_t *b = arg; 1317 1318 bzero(b, sizeof (mlxcx_buffer_t)); 1319 b->mlb_mlx = mlxp; 1320 b->mlb_state = MLXCX_BUFFER_INIT; 1321 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1322 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1323 1324 return (0); 1325 } 1326 1327 static void 1328 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1329 { 1330 mlxcx_t *mlxp = cookie; 1331 mlxcx_buffer_t *b = arg; 1332 VERIFY3P(b->mlb_mlx, ==, mlxp); 1333 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1334 list_destroy(&b->mlb_tx_chain); 1335 } 1336 1337 mlxcx_buf_shard_t * 1338 mlxcx_mlbs_create(mlxcx_t *mlxp) 1339 { 1340 mlxcx_buf_shard_t *s; 1341 1342 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1343 1344 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1345 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1346 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1347 offsetof(mlxcx_buffer_t, mlb_entry)); 1348 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1349 offsetof(mlxcx_buffer_t, mlb_entry)); 1350 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), 1351 offsetof(mlxcx_buffer_t, mlb_entry)); 1352 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1353 1354 list_insert_tail(&mlxp->mlx_buf_shards, s); 1355 1356 return (s); 1357 } 1358 1359 static boolean_t 1360 mlxcx_setup_bufs(mlxcx_t *mlxp) 1361 { 1362 char namebuf[KSTAT_STRLEN]; 1363 1364 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1365 ddi_get_instance(mlxp->mlx_dip)); 1366 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1367 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1368 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1369 NULL, mlxp, NULL, 0); 1370 1371 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1372 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1373 1374 return (B_TRUE); 1375 } 1376 1377 static void 1378 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1379 const char *state, uint8_t statenum) 1380 { 1381 uint64_t ena; 1382 char buf[FM_MAX_CLASS]; 1383 1384 if 
(!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1385 return; 1386 1387 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1388 MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1389 ena = fm_ena_generate(0, FM_ENA_FMT1); 1390 1391 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1392 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1393 "state", DATA_TYPE_STRING, state, 1394 "state_num", DATA_TYPE_UINT8, statenum, 1395 "qtype", DATA_TYPE_STRING, qtype, 1396 "qnum", DATA_TYPE_UINT32, qnum, 1397 NULL); 1398 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1399 } 1400 1401 static void 1402 mlxcx_eq_check(void *arg) 1403 { 1404 mlxcx_t *mlxp = (mlxcx_t *)arg; 1405 mlxcx_event_queue_t *eq; 1406 mlxcx_eventq_ctx_t ctx; 1407 const char *str; 1408 1409 uint_t i; 1410 1411 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1412 eq = &mlxp->mlx_eqs[i]; 1413 if (!(eq->mleq_state & MLXCX_EQ_CREATED) || 1414 (eq->mleq_state & MLXCX_EQ_DESTROYED)) 1415 continue; 1416 mutex_enter(&eq->mleq_mtx); 1417 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) { 1418 mutex_exit(&eq->mleq_mtx); 1419 continue; 1420 } 1421 1422 str = "???"; 1423 switch (ctx.mleqc_status) { 1424 case MLXCX_EQ_STATUS_OK: 1425 break; 1426 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1427 str = "WRITE_FAILURE"; 1428 break; 1429 } 1430 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1431 mlxcx_fm_qstate_ereport(mlxp, "event", 1432 eq->mleq_num, str, ctx.mleqc_status); 1433 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1434 eq->mleq_intr_index, ctx.mleqc_status, str); 1435 } 1436 1437 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1438 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1439 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1440 ++eq->mleq_check_disarm_cnt >= 3) { 1441 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1442 mlxcx_warn(mlxp, "EQ %u isn't armed", 1443 eq->mleq_intr_index); 1444 } 1445 eq->mleq_check_disarm_cc = eq->mleq_cc; 1446 } else { 1447 eq->mleq_check_disarm_cc = 0; 1448 eq->mleq_check_disarm_cnt = 0; 1449 } 1450 1451 mutex_exit(&eq->mleq_mtx); 1452 } 1453 } 1454 1455 static void 1456 mlxcx_cq_check(void *arg) 1457 { 1458 mlxcx_t *mlxp = (mlxcx_t *)arg; 1459 mlxcx_completion_queue_t *cq; 1460 mlxcx_completionq_ctx_t ctx; 1461 const char *str, *type; 1462 uint_t v; 1463 1464 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1465 cq = list_next(&mlxp->mlx_cqs, cq)) { 1466 mutex_enter(&cq->mlcq_mtx); 1467 if (!(cq->mlcq_state & MLXCX_CQ_CREATED) || 1468 (cq->mlcq_state & MLXCX_CQ_DESTROYED) || 1469 (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) { 1470 mutex_exit(&cq->mlcq_mtx); 1471 continue; 1472 } 1473 if (cq->mlcq_fm_repd_qstate) { 1474 mutex_exit(&cq->mlcq_mtx); 1475 continue; 1476 } 1477 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) { 1478 mutex_exit(&cq->mlcq_mtx); 1479 continue; 1480 } 1481 if (cq->mlcq_wq != NULL) { 1482 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1483 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1484 type = "rx "; 1485 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1486 type = "tx "; 1487 else 1488 type = ""; 1489 } else { 1490 type = ""; 1491 } 1492 1493 str = "???"; 1494 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1495 switch (v) { 1496 case MLXCX_CQC_STATUS_OK: 1497 break; 1498 case MLXCX_CQC_STATUS_OVERFLOW: 1499 str = "OVERFLOW"; 1500 break; 1501 case MLXCX_CQC_STATUS_WRITE_FAIL: 1502 str = "WRITE_FAIL"; 1503 break; 1504 case MLXCX_CQC_STATUS_INVALID: 1505 str = "INVALID"; 1506 break; 1507 } 1508 if (v != MLXCX_CQC_STATUS_OK) { 1509 mlxcx_fm_qstate_ereport(mlxp, "completion", 1510 cq->mlcq_num, str, v); 1511 mlxcx_warn(mlxp, "%sCQ 0x%x is in 
bad status: %x (%s)", 1512 type, cq->mlcq_num, v, str); 1513 cq->mlcq_fm_repd_qstate = B_TRUE; 1514 } 1515 1516 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1517 if (v != MLXCX_CQC_STATE_ARMED && 1518 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1519 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1520 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1521 ++cq->mlcq_check_disarm_cnt >= 3) { 1522 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1523 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1524 type, cq->mlcq_num, cq); 1525 } 1526 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1527 } else { 1528 cq->mlcq_check_disarm_cnt = 0; 1529 cq->mlcq_check_disarm_cc = 0; 1530 } 1531 mutex_exit(&cq->mlcq_mtx); 1532 } 1533 } 1534 1535 void 1536 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1537 { 1538 mlxcx_sq_ctx_t ctx; 1539 mlxcx_sq_state_t state; 1540 1541 ASSERT(mutex_owned(&sq->mlwq_mtx)); 1542 1543 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1544 return; 1545 1546 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1547 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1548 switch (state) { 1549 case MLXCX_SQ_STATE_RST: 1550 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1551 mlxcx_fm_qstate_ereport(mlxp, "send", 1552 sq->mlwq_num, "RST", state); 1553 sq->mlwq_fm_repd_qstate = B_TRUE; 1554 } 1555 break; 1556 case MLXCX_SQ_STATE_RDY: 1557 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1558 mlxcx_fm_qstate_ereport(mlxp, "send", 1559 sq->mlwq_num, "RDY", state); 1560 sq->mlwq_fm_repd_qstate = B_TRUE; 1561 } 1562 break; 1563 case MLXCX_SQ_STATE_ERR: 1564 mlxcx_fm_qstate_ereport(mlxp, "send", 1565 sq->mlwq_num, "ERR", state); 1566 sq->mlwq_fm_repd_qstate = B_TRUE; 1567 break; 1568 default: 1569 mlxcx_fm_qstate_ereport(mlxp, "send", 1570 sq->mlwq_num, "???", state); 1571 sq->mlwq_fm_repd_qstate = B_TRUE; 1572 break; 1573 } 1574 } 1575 1576 void 1577 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1578 { 1579 mlxcx_rq_ctx_t ctx; 1580 mlxcx_rq_state_t state; 1581 1582 ASSERT(mutex_owned(&rq->mlwq_mtx)); 1583 1584 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1585 return; 1586 1587 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1588 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1589 switch (state) { 1590 case MLXCX_RQ_STATE_RST: 1591 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1592 mlxcx_fm_qstate_ereport(mlxp, "receive", 1593 rq->mlwq_num, "RST", state); 1594 rq->mlwq_fm_repd_qstate = B_TRUE; 1595 } 1596 break; 1597 case MLXCX_RQ_STATE_RDY: 1598 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1599 mlxcx_fm_qstate_ereport(mlxp, "receive", 1600 rq->mlwq_num, "RDY", state); 1601 rq->mlwq_fm_repd_qstate = B_TRUE; 1602 } 1603 break; 1604 case MLXCX_RQ_STATE_ERR: 1605 mlxcx_fm_qstate_ereport(mlxp, "receive", 1606 rq->mlwq_num, "ERR", state); 1607 rq->mlwq_fm_repd_qstate = B_TRUE; 1608 break; 1609 default: 1610 mlxcx_fm_qstate_ereport(mlxp, "receive", 1611 rq->mlwq_num, "???", state); 1612 rq->mlwq_fm_repd_qstate = B_TRUE; 1613 break; 1614 } 1615 } 1616 1617 static void 1618 mlxcx_wq_check(void *arg) 1619 { 1620 mlxcx_t *mlxp = (mlxcx_t *)arg; 1621 mlxcx_work_queue_t *wq; 1622 1623 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1624 wq = list_next(&mlxp->mlx_wqs, wq)) { 1625 mutex_enter(&wq->mlwq_mtx); 1626 if (!(wq->mlwq_state & MLXCX_WQ_CREATED) || 1627 (wq->mlwq_state & MLXCX_WQ_DESTROYED) || 1628 (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) { 1629 mutex_exit(&wq->mlwq_mtx); 1630 continue; 1631 } 1632 if (wq->mlwq_fm_repd_qstate) { 1633 mutex_exit(&wq->mlwq_mtx); 1634 continue; 1635 } 1636 switch 
(wq->mlwq_type) { 1637 case MLXCX_WQ_TYPE_SENDQ: 1638 mlxcx_check_sq(mlxp, wq); 1639 break; 1640 case MLXCX_WQ_TYPE_RECVQ: 1641 mlxcx_check_rq(mlxp, wq); 1642 break; 1643 } 1644 mutex_exit(&wq->mlwq_mtx); 1645 } 1646 } 1647 1648 static boolean_t 1649 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1650 { 1651 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1652 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1653 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1654 DDI_IPL_0); 1655 } 1656 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1657 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1658 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1659 DDI_IPL_0); 1660 } 1661 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1662 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1663 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1664 DDI_IPL_0); 1665 } 1666 return (B_TRUE); 1667 } 1668 1669 int 1670 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1671 { 1672 const mlxcx_flow_entry_t *left = arg0; 1673 const mlxcx_flow_entry_t *right = arg1; 1674 int bcmpr; 1675 1676 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1677 sizeof (left->mlfe_dmac)); 1678 if (bcmpr < 0) 1679 return (-1); 1680 if (bcmpr > 0) 1681 return (1); 1682 if (left->mlfe_vid < right->mlfe_vid) 1683 return (-1); 1684 if (left->mlfe_vid > right->mlfe_vid) 1685 return (1); 1686 return (0); 1687 } 1688 1689 int 1690 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1691 { 1692 const mlxcx_group_mac_t *left = arg0; 1693 const mlxcx_group_mac_t *right = arg1; 1694 int bcmpr; 1695 1696 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1697 sizeof (left->mlgm_mac)); 1698 if (bcmpr < 0) 1699 return (-1); 1700 if (bcmpr > 0) 1701 return (1); 1702 return (0); 1703 } 1704 1705 int 1706 mlxcx_page_compare(const void *arg0, const void *arg1) 1707 { 1708 const mlxcx_dev_page_t *p0 = arg0; 1709 const mlxcx_dev_page_t *p1 = arg1; 1710 1711 if (p0->mxdp_pa < p1->mxdp_pa) 1712 return (-1); 1713 if (p0->mxdp_pa > p1->mxdp_pa) 1714 return (1); 1715 return (0); 1716 } 1717 1718 static boolean_t 1719 mlxcx_setup_ports(mlxcx_t *mlxp) 1720 { 1721 uint_t i, j; 1722 mlxcx_port_t *p; 1723 mlxcx_flow_table_t *ft; 1724 mlxcx_flow_group_t *fg; 1725 mlxcx_flow_entry_t *fe; 1726 1727 VERIFY3U(mlxp->mlx_nports, >, 0); 1728 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1729 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1730 1731 for (i = 0; i < mlxp->mlx_nports; ++i) { 1732 p = &mlxp->mlx_ports[i]; 1733 p->mlp_num = i; 1734 p->mlp_init |= MLXCX_PORT_INIT; 1735 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1736 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1737 mutex_enter(&p->mlp_mtx); 1738 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1739 mutex_exit(&p->mlp_mtx); 1740 goto err; 1741 } 1742 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1743 mutex_exit(&p->mlp_mtx); 1744 goto err; 1745 } 1746 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1747 mutex_exit(&p->mlp_mtx); 1748 goto err; 1749 } 1750 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1751 mutex_exit(&p->mlp_mtx); 1752 goto err; 1753 } 1754 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1755 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1756 mutex_exit(&p->mlp_mtx); 1757 goto err; 1758 } 1759 1760 mutex_exit(&p->mlp_mtx); 1761 } 1762 1763 for (i = 0; i < mlxp->mlx_nports; ++i) { 1764 p = &mlxp->mlx_ports[i]; 1765 mutex_enter(&p->mlp_mtx); 1766 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1767 
KM_SLEEP)); 1768 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1769 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1770 1771 mutex_enter(&ft->mlft_mtx); 1772 1773 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1774 ft->mlft_port = p; 1775 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1776 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1777 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1778 ft->mlft_nents = (1 << ft->mlft_entshift); 1779 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1780 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1781 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1782 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1783 1784 for (j = 0; j < ft->mlft_nents; ++j) { 1785 ft->mlft_ent[j].mlfe_table = ft; 1786 ft->mlft_ent[j].mlfe_index = j; 1787 } 1788 1789 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1790 mutex_exit(&ft->mlft_mtx); 1791 mutex_exit(&p->mlp_mtx); 1792 goto err; 1793 } 1794 1795 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1796 mutex_exit(&ft->mlft_mtx); 1797 mutex_exit(&p->mlp_mtx); 1798 goto err; 1799 } 1800 1801 /* 1802 * We match broadcast at the top of the root flow table, then 1803 * all multicast/unicast MACs, then the promisc entry is down 1804 * the very bottom. 1805 * 1806 * This way when promisc is on, that entry simply catches any 1807 * remaining traffic that earlier flows haven't matched. 1808 */ 1809 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1810 list_insert_tail(&ft->mlft_groups, fg); 1811 fg->mlfg_table = ft; 1812 fg->mlfg_size = 1; 1813 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1814 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1815 mutex_exit(&ft->mlft_mtx); 1816 mutex_exit(&p->mlp_mtx); 1817 goto err; 1818 } 1819 p->mlp_bcast = fg; 1820 fe = list_head(&fg->mlfg_entries); 1821 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1822 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1823 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1824 1825 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1826 list_insert_tail(&ft->mlft_groups, fg); 1827 fg->mlfg_table = ft; 1828 fg->mlfg_size = ft->mlft_nents - 2; 1829 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1830 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1831 mutex_exit(&ft->mlft_mtx); 1832 mutex_exit(&p->mlp_mtx); 1833 goto err; 1834 } 1835 p->mlp_umcast = fg; 1836 1837 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1838 list_insert_tail(&ft->mlft_groups, fg); 1839 fg->mlfg_table = ft; 1840 fg->mlfg_size = 1; 1841 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1842 mutex_exit(&ft->mlft_mtx); 1843 mutex_exit(&p->mlp_mtx); 1844 goto err; 1845 } 1846 p->mlp_promisc = fg; 1847 fe = list_head(&fg->mlfg_entries); 1848 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1849 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1850 1851 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1852 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1853 mlfe_dmac_entry)); 1854 1855 mutex_exit(&ft->mlft_mtx); 1856 mutex_exit(&p->mlp_mtx); 1857 } 1858 1859 return (B_TRUE); 1860 1861 err: 1862 mlxcx_teardown_ports(mlxp); 1863 return (B_FALSE); 1864 } 1865 1866 void 1867 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1868 { 1869 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1870 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1871 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1872 mlxcx_flow_entry_t *fe; 1873 mlxcx_group_vlan_t *v; 1874 1875 ASSERT(mutex_owned(&g->mlg_mtx)); 1876 1877 mutex_enter(&ft->mlft_mtx); 1878 1879 
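	/*
	 * Put the default (match-any VLAN) entry back in place before we
	 * tear down the individual VLAN entries below, so the group keeps
	 * receiving traffic during the transition; this is the same ordering
	 * described in mlxcx_remove_vlan_entry().
	 */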
if (!list_is_empty(&g->mlg_rx_vlans)) { 1880 fe = list_head(&dfg->mlfg_entries); 1881 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 1882 } 1883 1884 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 1885 fe = v->mlgv_fe; 1886 ASSERT3P(fe->mlfe_table, ==, ft); 1887 ASSERT3P(fe->mlfe_group, ==, fg); 1888 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1889 1890 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1891 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1892 } 1893 1894 mutex_exit(&ft->mlft_mtx); 1895 } 1896 1897 boolean_t 1898 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1899 boolean_t tagged, uint16_t vid) 1900 { 1901 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1902 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1903 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1904 mlxcx_flow_entry_t *fe; 1905 mlxcx_group_vlan_t *v; 1906 boolean_t found = B_FALSE; 1907 1908 ASSERT(mutex_owned(&g->mlg_mtx)); 1909 1910 mutex_enter(&ft->mlft_mtx); 1911 1912 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1913 v = list_next(&g->mlg_rx_vlans, v)) { 1914 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1915 found = B_TRUE; 1916 break; 1917 } 1918 } 1919 if (!found) { 1920 mutex_exit(&ft->mlft_mtx); 1921 return (B_FALSE); 1922 } 1923 1924 list_remove(&g->mlg_rx_vlans, v); 1925 1926 /* 1927 * If this is the last VLAN entry, we have to go back to accepting 1928 * any VLAN (which means re-enabling the default entry). 1929 * 1930 * Do this before we remove the flow entry for the last specific 1931 * VLAN so that we don't lose any traffic in the transition. 1932 */ 1933 if (list_is_empty(&g->mlg_rx_vlans)) { 1934 fe = list_head(&dfg->mlfg_entries); 1935 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1936 list_insert_tail(&g->mlg_rx_vlans, v); 1937 mutex_exit(&ft->mlft_mtx); 1938 return (B_FALSE); 1939 } 1940 } 1941 1942 fe = v->mlgv_fe; 1943 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 1944 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 1945 ASSERT3P(fe->mlfe_table, ==, ft); 1946 ASSERT3P(fe->mlfe_group, ==, fg); 1947 1948 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 1949 list_insert_tail(&g->mlg_rx_vlans, v); 1950 fe = list_head(&dfg->mlfg_entries); 1951 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 1952 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1953 } 1954 mutex_exit(&ft->mlft_mtx); 1955 return (B_FALSE); 1956 } 1957 1958 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1959 1960 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1961 1962 mutex_exit(&ft->mlft_mtx); 1963 return (B_TRUE); 1964 } 1965 1966 boolean_t 1967 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 1968 uint16_t vid) 1969 { 1970 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1971 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1972 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1973 mlxcx_flow_entry_t *fe; 1974 mlxcx_group_vlan_t *v; 1975 boolean_t found = B_FALSE; 1976 boolean_t first = B_FALSE; 1977 1978 ASSERT(mutex_owned(&g->mlg_mtx)); 1979 1980 mutex_enter(&ft->mlft_mtx); 1981 1982 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1983 v = list_next(&g->mlg_rx_vlans, v)) { 1984 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1985 mutex_exit(&ft->mlft_mtx); 1986 return (B_TRUE); 1987 } 1988 } 1989 if (list_is_empty(&g->mlg_rx_vlans)) 1990 first = B_TRUE; 1991 1992 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 1993 fe = list_next(&fg->mlfg_entries, fe)) { 1994 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 1995 found = B_TRUE; 1996 break; 1997 
} 1998 } 1999 if (!found) { 2000 mutex_exit(&ft->mlft_mtx); 2001 return (B_FALSE); 2002 } 2003 2004 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 2005 v->mlgv_fe = fe; 2006 v->mlgv_tagged = tagged; 2007 v->mlgv_vid = vid; 2008 2009 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2010 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2011 fe->mlfe_vid = vid; 2012 if (tagged) { 2013 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2014 } else { 2015 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2016 } 2017 2018 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2019 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2020 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2021 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2022 mutex_exit(&ft->mlft_mtx); 2023 return (B_FALSE); 2024 } 2025 2026 list_insert_tail(&g->mlg_rx_vlans, v); 2027 2028 /* 2029 * If the vlan list was empty for this group before adding this one, 2030 * then we no longer want the "default" entry to allow all VLANs 2031 * through. 2032 */ 2033 if (first) { 2034 fe = list_head(&dfg->mlfg_entries); 2035 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2036 } 2037 2038 mutex_exit(&ft->mlft_mtx); 2039 return (B_TRUE); 2040 } 2041 2042 void 2043 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2044 mlxcx_ring_group_t *group) 2045 { 2046 mlxcx_flow_entry_t *fe; 2047 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2048 mlxcx_group_mac_t *gm, *ngm; 2049 2050 ASSERT(mutex_owned(&port->mlp_mtx)); 2051 ASSERT(mutex_owned(&group->mlg_mtx)); 2052 2053 mutex_enter(&ft->mlft_mtx); 2054 2055 gm = avl_first(&group->mlg_rx_macs); 2056 for (; gm != NULL; gm = ngm) { 2057 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2058 2059 ASSERT3P(gm->mlgm_group, ==, group); 2060 fe = gm->mlgm_fe; 2061 ASSERT3P(fe->mlfe_table, ==, ft); 2062 2063 avl_remove(&group->mlg_rx_macs, gm); 2064 list_remove(&fe->mlfe_ring_groups, gm); 2065 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2066 2067 fe->mlfe_ndest = 0; 2068 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2069 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2070 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2071 gm->mlgm_group->mlg_rx_vlan_ft; 2072 } 2073 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2074 2075 if (fe->mlfe_ndest > 0) { 2076 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2077 continue; 2078 } 2079 2080 /* 2081 * There are no more ring groups left for this MAC (it wasn't 2082 * attached to any other groups since ndest == 0), so clean up 2083 * its flow entry. 
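		 * We also drop the entry from the port's DMAC AVL index and
		 * (below) clear its RESERVED flag, so that a later
		 * mlxcx_add_umcast_entry() can hand the slot out again.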
2084 */ 2085 avl_remove(&port->mlp_dmac_fe, fe); 2086 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2087 list_destroy(&fe->mlfe_ring_groups); 2088 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2089 } 2090 2091 mutex_exit(&ft->mlft_mtx); 2092 } 2093 2094 boolean_t 2095 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2096 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2097 { 2098 mlxcx_flow_entry_t *fe; 2099 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2100 mlxcx_group_mac_t *gm, probe; 2101 2102 ASSERT(mutex_owned(&port->mlp_mtx)); 2103 ASSERT(mutex_owned(&group->mlg_mtx)); 2104 2105 bzero(&probe, sizeof (probe)); 2106 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2107 2108 mutex_enter(&ft->mlft_mtx); 2109 2110 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2111 if (gm == NULL) { 2112 mutex_exit(&ft->mlft_mtx); 2113 return (B_FALSE); 2114 } 2115 ASSERT3P(gm->mlgm_group, ==, group); 2116 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2117 2118 fe = gm->mlgm_fe; 2119 ASSERT3P(fe->mlfe_table, ==, ft); 2120 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2121 2122 list_remove(&fe->mlfe_ring_groups, gm); 2123 avl_remove(&group->mlg_rx_macs, gm); 2124 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2125 2126 fe->mlfe_ndest = 0; 2127 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2128 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2129 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2130 gm->mlgm_group->mlg_rx_vlan_ft; 2131 } 2132 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2133 2134 if (fe->mlfe_ndest > 0) { 2135 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2136 mutex_exit(&ft->mlft_mtx); 2137 return (B_FALSE); 2138 } 2139 mutex_exit(&ft->mlft_mtx); 2140 return (B_TRUE); 2141 } 2142 2143 /* 2144 * There are no more ring groups left for this MAC (it wasn't attached 2145 * to any other groups since ndest == 0), so clean up its flow entry. 
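	 * The entry itself stays on its flow group's entry list (where
	 * mlxcx_setup_flow_group() put it); we only remove it from the
	 * hardware table and clear the RESERVED bit so it can be reused.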
2146 */ 2147 avl_remove(&port->mlp_dmac_fe, fe); 2148 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2149 list_destroy(&fe->mlfe_ring_groups); 2150 2151 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2152 2153 mutex_exit(&ft->mlft_mtx); 2154 2155 return (B_TRUE); 2156 } 2157 2158 boolean_t 2159 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2160 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2161 { 2162 mlxcx_flow_group_t *fg; 2163 mlxcx_flow_entry_t *fe, probe; 2164 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2165 mlxcx_group_mac_t *gm; 2166 boolean_t found = B_FALSE; 2167 2168 ASSERT(mutex_owned(&port->mlp_mtx)); 2169 ASSERT(mutex_owned(&group->mlg_mtx)); 2170 2171 bzero(&probe, sizeof (probe)); 2172 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2173 2174 mutex_enter(&ft->mlft_mtx); 2175 2176 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2177 2178 if (fe == NULL) { 2179 fg = port->mlp_umcast; 2180 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2181 fe = list_next(&fg->mlfg_entries, fe)) { 2182 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2183 found = B_TRUE; 2184 break; 2185 } 2186 } 2187 if (!found) { 2188 mutex_exit(&ft->mlft_mtx); 2189 return (B_FALSE); 2190 } 2191 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2192 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2193 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2194 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2195 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2196 2197 avl_add(&port->mlp_dmac_fe, fe); 2198 } 2199 2200 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2201 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2202 2203 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2204 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2205 if (--fe->mlfe_ndest == 0) { 2206 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2207 } 2208 mutex_exit(&ft->mlft_mtx); 2209 return (B_FALSE); 2210 } 2211 2212 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2213 gm->mlgm_group = group; 2214 gm->mlgm_fe = fe; 2215 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2216 avl_add(&group->mlg_rx_macs, gm); 2217 list_insert_tail(&fe->mlfe_ring_groups, gm); 2218 2219 mutex_exit(&ft->mlft_mtx); 2220 2221 return (B_TRUE); 2222 } 2223 2224 boolean_t 2225 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2226 mlxcx_flow_group_t *fg) 2227 { 2228 mlxcx_flow_entry_t *fe; 2229 uint_t i, idx; 2230 2231 ASSERT(mutex_owned(&ft->mlft_mtx)); 2232 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2233 ASSERT3P(fg->mlfg_table, ==, ft); 2234 2235 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2236 return (B_FALSE); 2237 fg->mlfg_start_idx = ft->mlft_next_ent; 2238 2239 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2240 return (B_FALSE); 2241 } 2242 2243 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2244 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2245 for (i = 0; i < fg->mlfg_size; ++i) { 2246 idx = fg->mlfg_start_idx + i; 2247 fe = &ft->mlft_ent[idx]; 2248 fe->mlfe_group = fg; 2249 list_insert_tail(&fg->mlfg_entries, fe); 2250 } 2251 fg->mlfg_avail = fg->mlfg_size; 2252 ft->mlft_next_ent += fg->mlfg_size; 2253 2254 return (B_TRUE); 2255 } 2256 2257 static boolean_t 2258 mlxcx_setup_eq0(mlxcx_t *mlxp) 2259 { 2260 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0]; 2261 2262 mutex_enter(&mleq->mleq_mtx); 2263 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2264 /* mlxcx_teardown_eqs() will clean this up */ 2265 mutex_exit(&mleq->mleq_mtx); 2266 return (B_FALSE); 
2267 } 2268 mleq->mleq_mlx = mlxp; 2269 mleq->mleq_uar = &mlxp->mlx_uar; 2270 mleq->mleq_events = 2271 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2272 (1ULL << MLXCX_EVENT_PORT_STATE) | 2273 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2274 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2275 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2276 (1ULL << MLXCX_EVENT_LAST_WQE) | 2277 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2278 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2279 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2280 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2281 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2282 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2283 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST); 2284 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2285 /* mlxcx_teardown_eqs() will clean this up */ 2286 mutex_exit(&mleq->mleq_mtx); 2287 return (B_FALSE); 2288 } 2289 if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) { 2290 /* 2291 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2292 * eq_rele_dma 2293 */ 2294 mutex_exit(&mleq->mleq_mtx); 2295 return (B_FALSE); 2296 } 2297 mlxcx_arm_eq(mlxp, mleq); 2298 mutex_exit(&mleq->mleq_mtx); 2299 return (B_TRUE); 2300 } 2301 2302 int 2303 mlxcx_cq_compare(const void *arg0, const void *arg1) 2304 { 2305 const mlxcx_completion_queue_t *left = arg0; 2306 const mlxcx_completion_queue_t *right = arg1; 2307 2308 if (left->mlcq_num < right->mlcq_num) { 2309 return (-1); 2310 } 2311 if (left->mlcq_num > right->mlcq_num) { 2312 return (1); 2313 } 2314 return (0); 2315 } 2316 2317 static boolean_t 2318 mlxcx_setup_eqs(mlxcx_t *mlxp) 2319 { 2320 uint_t i; 2321 mlxcx_event_queue_t *mleq; 2322 2323 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2324 2325 for (i = 1; i < mlxp->mlx_intr_count; ++i) { 2326 mleq = &mlxp->mlx_eqs[i]; 2327 mutex_enter(&mleq->mleq_mtx); 2328 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2329 mutex_exit(&mleq->mleq_mtx); 2330 return (B_FALSE); 2331 } 2332 mleq->mleq_uar = &mlxp->mlx_uar; 2333 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2334 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2335 mutex_exit(&mleq->mleq_mtx); 2336 return (B_FALSE); 2337 } 2338 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2339 !mlxcx_cmd_set_int_mod(mlxp, i, 2340 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2341 mutex_exit(&mleq->mleq_mtx); 2342 return (B_FALSE); 2343 } 2344 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2345 mutex_exit(&mleq->mleq_mtx); 2346 return (B_FALSE); 2347 } 2348 mlxcx_arm_eq(mlxp, mleq); 2349 mutex_exit(&mleq->mleq_mtx); 2350 } 2351 2352 mlxp->mlx_next_eq = 1; 2353 2354 return (B_TRUE); 2355 } 2356 2357 /* 2358 * Snapshot all of the hardware capabilities that we care about and then modify 2359 * the HCA capabilities to get things moving. 
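 * (At present this function only queries and sanity-checks the caps; it
 * does not write anything back to the device.)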
2360 */ 2361 static boolean_t 2362 mlxcx_init_caps(mlxcx_t *mlxp) 2363 { 2364 mlxcx_caps_t *c; 2365 2366 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP); 2367 2368 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2369 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) { 2370 mlxcx_warn(mlxp, "failed to obtain current HCA general caps"); 2371 } 2372 2373 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2374 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) { 2375 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps"); 2376 } 2377 2378 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2379 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) { 2380 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps"); 2381 } 2382 2383 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2384 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) { 2385 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps"); 2386 } 2387 2388 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2389 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) { 2390 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps"); 2391 } 2392 2393 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2394 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) { 2395 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps"); 2396 } 2397 2398 /* 2399 * Check the caps meet our requirements. 2400 */ 2401 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2402 2403 if (gen->mlcap_general_log_pg_sz != 12) { 2404 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2405 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2406 goto err; 2407 } 2408 if (gen->mlcap_general_cqe_version != 1) { 2409 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2410 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2411 goto err; 2412 } 2413 if (gen->mlcap_general_port_type != 2414 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2415 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2416 goto err; 2417 } 2418 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2419 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2420 2421 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2422 2423 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2424 MLXCX_ETH_CAP_CSUM_CAP); 2425 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2426 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2427 2428 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2429 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2430 if (c->mlc_max_lso_size == 1) { 2431 c->mlc_max_lso_size = 0; 2432 c->mlc_lso = B_FALSE; 2433 } else { 2434 c->mlc_lso = B_TRUE; 2435 } 2436 2437 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2438 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2439 2440 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2441 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2442 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2443 goto err; 2444 } 2445 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2446 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2447 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2448 "flow table entries"); 2449 goto err; 2450 } 2451 2452 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2453 mlcap_flow_prop_log_max_ft_size; 2454 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 
mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2456 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2457 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2458 
2459 	return (B_TRUE);
2460 
2461 err:
2462 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2463 	return (B_FALSE);
2464 }
2465 
2466 static int
2467 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2468 {
2469 	mlxcx_t *mlxp;
2470 
2471 	if (cmd != DDI_DETACH)
2472 		return (DDI_FAILURE);
2473 
2474 	mlxp = ddi_get_driver_private(dip);
2475 	if (mlxp == NULL) {
2476 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2477 		    "private data");
2478 		return (DDI_FAILURE);
2479 	}
2480 
2481 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2482 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2483 			return (DDI_FAILURE);
2484 		}
2485 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2486 	}
2487 
2488 	mlxcx_teardown(mlxp);
2489 	return (DDI_SUCCESS);
2490 }
2491 
2492 static size_t
2493 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2494 {
2495 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2496 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2497 	size_t tirlim, flowlim, gflowlim;
2498 
2499 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2500 	if (tirlim < ngroups) {
2501 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2502 		    "on number of TIRs available", tirlim);
2503 		ngroups = tirlim;
2504 	}
2505 
2506 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2507 	if (flowlim < ngroups) {
2508 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2509 		    "on max size of RX flow tables", flowlim);
2510 		ngroups = flowlim;
2511 	}
2512 
2513 	do {
2514 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2515 		if (gflowlim < ngroups) {
2516 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2517 			    "based on max total RX flows", gflowlim);
2518 			--ngroups;
2519 		}
2520 	} while (gflowlim < ngroups);
2521 
2522 	return (ngroups);
2523 }
2524 
2525 static int
2526 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2527 {
2528 	mlxcx_t *mlxp;
2529 	uint_t i;
2530 	int inst, ret;
2531 
2532 	if (cmd != DDI_ATTACH)
2533 		return (DDI_FAILURE);
2534 
2535 	inst = ddi_get_instance(dip);
2536 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2537 	if (ret != 0)
2538 		return (ret);
2539 
2540 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2541 	if (mlxp == NULL)
2542 		return (DDI_FAILURE);
2543 	mlxp->mlx_dip = dip;
2544 	mlxp->mlx_inst = inst;
2545 	ddi_set_driver_private(dip, mlxp);
2546 
2547 	mlxcx_load_props(mlxp);
2548 
2549 	mlxcx_fm_init(mlxp);
2550 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2551 
2552 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2553 	    DDI_SUCCESS) {
2554 		mlxcx_warn(mlxp, "failed to initialize PCI config space");
2555 		goto err;
2556 	}
2557 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2558 
2559 	if (!mlxcx_regs_map(mlxp)) {
2560 		goto err;
2561 	}
2562 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2563 
2564 	if (!mlxcx_cmd_queue_init(mlxp)) {
2565 		goto err;
2566 	}
2567 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2568 
2569 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2570 		goto err;
2571 	}
2572 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2573 
2574 	if (!mlxcx_check_issi(mlxp)) {
2575 		goto err;
2576 	}
2577 
2578 	/*
2579 	 * We have to get our interrupts now so we know what priority to
2580 	 * create pagemtx with.
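	 * Any mutex that an interrupt handler may take has to be initialised
	 * with that interrupt's priority (DDI_INTR_PRI()), which is why
	 * mlxcx_intr_setup() must run before the mutex_init() calls below.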
2581 */ 2582 if (!mlxcx_intr_setup(mlxp)) { 2583 goto err; 2584 } 2585 mlxp->mlx_attach |= MLXCX_ATTACH_INTRS; 2586 2587 mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER, 2588 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2589 avl_create(&mlxp->mlx_pages, mlxcx_page_compare, 2590 sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree)); 2591 mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST; 2592 2593 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2594 goto err; 2595 } 2596 2597 if (!mlxcx_init_caps(mlxp)) { 2598 goto err; 2599 } 2600 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2601 2602 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2603 goto err; 2604 } 2605 2606 if (!mlxcx_cmd_init_hca(mlxp)) { 2607 goto err; 2608 } 2609 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2610 2611 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2612 goto err; 2613 } 2614 2615 /* 2616 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2617 * doorbells. 2618 */ 2619 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2620 goto err; 2621 } 2622 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2623 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2624 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2625 } 2626 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2627 2628 /* 2629 * Set up event queue #0 -- it's special and only handles control 2630 * type events, like PAGE_REQUEST (which we will probably get during 2631 * the commands below). 2632 * 2633 * This will enable and arm the interrupt on EQ 0, too. 2634 */ 2635 if (!mlxcx_setup_eq0(mlxp)) { 2636 goto err; 2637 } 2638 2639 /* 2640 * Allocate a protection and transport domain. These don't really do 2641 * anything for us (they're IB concepts), but we need to give their 2642 * ID numbers in other commands. 2643 */ 2644 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2645 goto err; 2646 } 2647 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2648 goto err; 2649 } 2650 /* 2651 * Fetch the "reserved" lkey that lets us give linear addresses in 2652 * work queue entries, rather than having to mess with the NIC's 2653 * internal MMU. 2654 */ 2655 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2656 goto err; 2657 } 2658 2659 /* 2660 * Query our port information and current state, populate the 2661 * mlxcx_port_t structs. 2662 * 2663 * This also sets up the root flow tables and flow groups. 2664 */ 2665 if (!mlxcx_setup_ports(mlxp)) { 2666 goto err; 2667 } 2668 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2669 2670 mlxcx_load_model_props(mlxp); 2671 2672 /* 2673 * Set up, enable and arm the rest of the interrupt EQs which will 2674 * service events from CQs. 2675 * 2676 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2677 * cleaned up. 2678 */ 2679 if (!mlxcx_setup_eqs(mlxp)) { 2680 goto err; 2681 } 2682 2683 /* Completion queues */ 2684 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2685 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2686 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2687 2688 /* Work queues (send queues, receive queues) */ 2689 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2690 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2691 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2692 2693 /* Set up periodic fault check timers which check the queue states */ 2694 if (!mlxcx_setup_checktimers(mlxp)) { 2695 goto err; 2696 } 2697 mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS; 2698 2699 /* 2700 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2701 * "groups" we advertise to MAC. 
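	 * The number of RX groups we create is clamped to the hardware caps
	 * by mlxcx_calc_rx_ngroups(); the TX group count comes straight from
	 * the mldp_tx_ngroups property.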
2702 */ 2703 mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp); 2704 mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups * 2705 sizeof (mlxcx_ring_group_t); 2706 mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP); 2707 2708 mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups; 2709 mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups * 2710 sizeof (mlxcx_ring_group_t); 2711 mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP); 2712 2713 mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS; 2714 2715 /* 2716 * Sets up the free/busy buffers list for keeping track of packet 2717 * buffers. 2718 */ 2719 if (!mlxcx_setup_bufs(mlxp)) 2720 goto err; 2721 mlxp->mlx_attach |= MLXCX_ATTACH_BUFS; 2722 2723 /* 2724 * Before we tell MAC about our rings/groups, we need to do enough 2725 * setup on them to be sure about the numbers and configuration that 2726 * we have. This will do basically everything short of allocating 2727 * packet buffers and starting the rings up. 2728 */ 2729 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 2730 if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i])) 2731 goto err; 2732 } 2733 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 2734 if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i])) 2735 goto err; 2736 } 2737 2738 /* 2739 * Finally, tell MAC that we exist! 2740 */ 2741 if (!mlxcx_register_mac(mlxp)) { 2742 goto err; 2743 } 2744 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL; 2745 2746 return (DDI_SUCCESS); 2747 2748 err: 2749 mlxcx_teardown(mlxp); 2750 return (DDI_FAILURE); 2751 } 2752 2753 static struct cb_ops mlxcx_cb_ops = { 2754 .cb_open = nulldev, 2755 .cb_close = nulldev, 2756 .cb_strategy = nodev, 2757 .cb_print = nodev, 2758 .cb_dump = nodev, 2759 .cb_read = nodev, 2760 .cb_write = nodev, 2761 .cb_ioctl = nodev, 2762 .cb_devmap = nodev, 2763 .cb_mmap = nodev, 2764 .cb_segmap = nodev, 2765 .cb_chpoll = nochpoll, 2766 .cb_prop_op = ddi_prop_op, 2767 .cb_flag = D_MP, 2768 .cb_rev = CB_REV, 2769 .cb_aread = nodev, 2770 .cb_awrite = nodev 2771 }; 2772 2773 static struct dev_ops mlxcx_dev_ops = { 2774 .devo_rev = DEVO_REV, 2775 .devo_refcnt = 0, 2776 .devo_getinfo = NULL, 2777 .devo_identify = nulldev, 2778 .devo_probe = nulldev, 2779 .devo_attach = mlxcx_attach, 2780 .devo_detach = mlxcx_detach, 2781 .devo_reset = nodev, 2782 .devo_power = ddi_power, 2783 .devo_quiesce = ddi_quiesce_not_supported, 2784 .devo_cb_ops = &mlxcx_cb_ops 2785 }; 2786 2787 static struct modldrv mlxcx_modldrv = { 2788 .drv_modops = &mod_driverops, 2789 .drv_linkinfo = "Mellanox Connect-X 4/5/6", 2790 .drv_dev_ops = &mlxcx_dev_ops 2791 }; 2792 2793 static struct modlinkage mlxcx_modlinkage = { 2794 .ml_rev = MODREV_1, 2795 .ml_linkage = { &mlxcx_modldrv, NULL } 2796 }; 2797 2798 int 2799 _init(void) 2800 { 2801 int ret; 2802 2803 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0); 2804 if (ret != 0) { 2805 return (ret); 2806 } 2807 2808 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME); 2809 2810 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2811 mac_fini_ops(&mlxcx_dev_ops); 2812 ddi_soft_state_fini(&mlxcx_softstate); 2813 return (ret); 2814 } 2815 2816 return (DDI_SUCCESS); 2817 } 2818 2819 int 2820 _info(struct modinfo *modinfop) 2821 { 2822 return (mod_info(&mlxcx_modlinkage, modinfop)); 2823 } 2824 2825 int 2826 _fini(void) 2827 { 2828 int ret; 2829 2830 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2831 return (ret); 2832 } 2833 2834 mac_fini_ops(&mlxcx_dev_ops); 2835 2836 ddi_soft_state_fini(&mlxcx_softstate); 2837 2838 return 
(DDI_SUCCESS); 2839 } 2840