/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 * Copyright 2023 MNX Cloud, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

/*
 * The PRM for this family of parts was freely available at:
 *
 * https://www.mellanox.com/related-docs/user_manuals/ \
 *     Ethernet_Adapters_Programming_Manual.pdf
 *
 * but has since disappeared.
 */
/*
 * ConnectX glossary
 * -----------------
 *
 * WR		Work Request: something we've asked the hardware to do by
 *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
 *
 * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
 *
 * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
 *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
 *		types have different WQE structures, different commands for
 *		creating and destroying them, etc, but share a common context
 *		structure, counter setup and state graph.
 * SQ		Send Queue, a specific type of WQ that sends packets
 * RQ		Receive Queue, a specific type of WQ that receives packets
 *
 * CQ		Completion Queue: completion of WRs from a WQ are reported to
 *		one of these, as a CQE on its entry ring.
 * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
 *		info, as well as packet size, the ID of the WQ, and the index
 *		of the WQE which completed. Does not contain any packet data.
 *
 * EQ		Event Queue: a ring of event structs from the hardware informing
 *		us when particular events happen. Many events can point at a
 *		particular CQ which we should then go look at.
 * EQE		Event Queue Entry: an entry on the EQ ring
 *
 * UAR		User Access Region, a page of the device's PCI BAR which is
 *		tied to particular EQ/CQ/WQ sets and contains doorbells to
 *		ring to arm them for interrupts or wake them up for new work
 *
 * RQT		RQ Table, a collection of indexed RQs used to refer to the group
 *		as a single unit (for e.g. hashing/RSS).
 *
 * TIR		Transport Interface Receive, a bucket of resources for the
 *		reception of packets. TIRs have to point at either a single RQ
 *		or a table of RQs (RQT). They then serve as a target for flow
 *		table entries (FEs). TIRs that point at an RQT also contain the
 *		settings for hashing for RSS.
 *
 * TIS		Transport Interface Send, a bucket of resources associated with
 *		the transmission of packets. In particular, the temporary
 *		resources used for LSO internally in the card are accounted to
 *		a TIS.
 *
 * FT		Flow Table, a collection of FEs and FGs that can be referred to
 *		as a single entity (e.g. used as a target from another flow
 *		entry or set as the "root" table to handle incoming or outgoing
 *		packets). Packets arriving at a FT are matched against the
 *		FEs in the table until either one matches with a terminating
 *		action or all FEs are exhausted (it's first-match-wins but with
 *		some actions that are non-terminal, like counting actions).
 *
 * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
 *		they match on the same attributes of packets coming into the
 *		flow).
 *
 * FE		Flow Entry, an individual set of values to match against
 *		packets entering the flow table, combined with an action to
 *		take upon a successful match. The action we use most is
 *		"forward", which sends the packets to a TIR or another flow
 *		table and then stops further processing within the FE's FT.
 *
 * lkey/mkey	A reference to something similar to a page table but in the
 *		device's internal onboard MMU. Since Connect-X parts double as
 *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
 *		features which we try very hard not to use. For our WQEs we use
 *		the "reserved" lkey, which is a special value which indicates
 *		that addresses we give are linear addresses and should not be
 *		translated.
 *
 * PD		Protection Domain, an IB concept. We have to allocate one to
 *		provide as a parameter for new WQs, but we don't do anything
 *		with it.
 *
 * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
 *		provide it as a parameter to TIR/TIS creation, but we don't do
 *		anything with it.
 */
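
/*
 * A note on the reserved lkey mentioned above: every scatter/gather pointer
 * we place in a WQE carries an address, a byte count and an lkey, and we
 * always supply the reserved lkey so that the address is treated as a plain
 * (untranslated) DMA address. The sketch below is illustrative only -- the
 * structure and names are hypothetical, not the driver's actual definitions:
 *
 *	typedef struct sg_ptr {
 *		uint32_t	sg_byte_count;
 *		uint32_t	sg_lkey;
 *		uint64_t	sg_address;
 *	} sg_ptr_t;
 *
 *	static void
 *	fill_sg_ptr(sg_ptr_t *sg, uint64_t pa, uint32_t len, uint32_t rsvd_lkey)
 *	{
 *		sg->sg_address = pa;
 *		sg->sg_byte_count = len;
 *		sg->sg_lkey = rsvd_lkey;
 *	}
 */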
/*
 *
 * Data flow overview
 * ------------------
 *
 * This driver is a MAC ring-enabled driver which maps rings to send and recv
 * queues in hardware on the device.
 *
 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
 * sufficient space, and simplify the logic needed to work out which buffer
 * was completed.
 *
 * The CQs are then round-robin allocated onto EQs, of which we set up one per
 * interrupt that the system gives us for the device. Normally this means we
 * have 8 EQs.
 *
 * When we have >= 8 EQs available, we try to allocate only RX or only TX
 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
 *
 * EQ #0 is reserved for all event types other than completion events, and has
 * no CQs associated with it at any time. EQs #1 and upwards are only used for
 * handling CQ completion events.
 *
 *   +------+     +------+           +------+        +---------+
 *   | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |  mlxcx_intr_0
 *   +------+     +------+     |     +------+        +---------+
 *                             |
 *   +------+     +------+     |
 *   | SQ 1 |---->| CQ 1 |---+ |     +------+
 *   +------+     +------+   | +---> |      |
 *                           |       |      |
 *   +------+     +------+   |       | EQ 1 |        +---------+
 *   | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |  mlxcx_intr_n
 *   +------+     +------+   +-----> |      |        +---------+
 *                                   +------+
 *
 *     ...
 *                                   +------+
 *   +------+     +------+   +-----> |      |
 *   | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
 *   +------+     +------+   |   +-> | EQ 2 |------> | MSI-X 2 |  mlxcx_intr_n
 *                           |   |   |      |        +---------+
 *   +------+     +------+   |   |   +------+
 *   | RQ 1 |---->| CQ 4 |---+   |
 *   +------+     +------+       |        ....
 *   +------+     +------+       |
 *   | RQ 2 |---->| CQ 5 |-------+
 *   +------+     +------+
 *
 *   ... (note this diagram does not show RX-only or TX-only EQs)
 *
 * For TX, we advertise all of the SQs we create as plain rings to MAC with
 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
 * and use the rings as it sees fit.
 *
 * For RX, we advertise actual groups in order to make use of hardware
 * classification.
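 *
 * As a simplified sketch of the CQ-to-EQ spread described above (the names
 * here are hypothetical, and this ignores the RX/TX split, so it is not the
 * driver's actual assignment code), the round-robin amounts to walking the
 * completion EQs while never touching EQ 0:
 *
 *	uint_t eqn = 1;
 *	for (uint_t i = 0; i < ncqs; i++) {
 *		assign_cq_to_eq(&cqs[i], &eqs[eqn]);
 *		if (++eqn >= neqs)
 *			eqn = 1;
 *	}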
 *
 * The hardware classification we use is based around Flow Tables, and we
 * currently ignore all of the eswitch features of the card. The NIC VPORT
 * is always set to promisc mode so that the eswitch sends us all of the
 * traffic that arrives on the NIC, and we use flow entries to manage
 * everything.
 *
 * We use 2 layers of flow tables for classification: traffic arrives at the
 * root RX flow table which contains MAC address filters. Those then send
 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
 * presence and VID filters.
 *
 * Since these parts only support doing RSS hashing on a single protocol at a
 * time, we have to use a third layer of flow tables as well to break traffic
 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
 * so that it can be sent to the appropriate TIR for hashing.
 *
 * Incoming packets
 *        +           +---------+       +---------+
 *        |        +->| group 0 |       | group 0 |
 *        |        |  | vlan ft |   +-->| hash ft |
 *        v        |  |   L1    |   |   |   L2    |
 *   +----+----+   |  +---------+   |   +---------+   +-----+   +-----+------+
 *   | eswitch |   |  |         |   |   |  TCPv6  |--->| TIR |--->|    | RQ0 |
 *   +----+----+   |  |         |   |   +---------+   +-----+   |    +------+
 *        |        |  |         |   |   |  UDPv6  |--->| TIR |--->|    | RQ1 |
 *        |        |  |         |   |   +---------+   +-----+   |    +------+
 *        |        |  |         |   |   |  TCPv4  |--->| TIR |--->|    | RQ2 |
 *        v        |  |         |   |   +---------+   +-----+   | RQT +------+
 *   +----+----+   |  +---------+   |   |  UDPv4  |--->| TIR |--->|    | ... |
 *   | root rx |   |  | default |---+   +---------+   +-----+   |    |      |
 *   | flow tb |   |  +---------+   |   |  IPv6   |--->| TIR |--->|    |      |
 *   |   L0    |   |  | promisc |---+   +---------+   +-----+   |    |      |
 *   +---------+   |  +---------+   ^   |  IPv4   |--->| TIR |--->|    |      |
 *   |  bcast  |---|----------------+   +---------+   +-----+   +-----+------+
 *   +---------+   |                ^   |  other  |-+
 *   |  MAC 0  |---+                |   +---------+ |   +-----+    +-----+
 *   +---------+                    |               +-->| TIR |--->| RQ0 |
 *   |  MAC 1  |-+                  |                   +-----+    +-----+
 *   +---------+ |  +---------------+
 *   |  MAC 2  |-+  |               ^
 *   +---------+ |  |               |
 *   |  MAC 3  |-+  |  +---------+  |    +---------+
 *   +---------+ |  |  | group 1 |  |    | group 1 |
 *   |  .....  | +--->| vlan ft  |  |  +>| hash ft |
 *   |         |    |  |   L1    |  |  | |   L2    |
 *   +---------+    |  +---------+  |  | +---------+   +-----+   +-----+------+
 *   | promisc |---+   | VLAN 0  |-----+ |  TCPv6  |--->| TIR |--->|    | RQ3 |
 *   +---------+       +---------+  |    +---------+   +-----+   |    +------+
 *                     |  .....  |  |    |  UDPv6  |--->| TIR |--->|    | RQ4 |
 *                     |         |  |    +---------+   +-----+   |    +------+
 *                     |         |  |    |  TCPv4  |--->| TIR |--->|    | RQ5 |
 *                     |         |  |    +---------+   +-----+   | RQT +------+
 *                     +---------+  |    |  UDPv4  |--->| TIR |--->|    | ... |
 *                     |         |  |    +---------+   +-----+   |    |      |
 *                     +---------+  |    |  IPv6   |--->| TIR |--->|    |      |
 *                     | promisc |--+    +---------+   +-----+   |    |      |
 *                     +---------+       |  IPv4   |--->| TIR |--->|    |      |
 *                                       +---------+   +-----+   +-----+------+
 *                                       |  other  |-+
 *                                       +---------+ |
 *                        .......                    |   +-----+    +-----+
 *                                                    +-->| TIR |--->| RQ3 |
 *                                                        +-----+    +-----+
 *
 * Note that the "promisc" flow entries are only set/enabled when promisc
 * mode is enabled for the NIC. All promisc flow entries point directly at
 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
 * the "default group" in MAC).
 *
 * The "default" entry in the L1 VLAN filter flow tables is used when there
 * are no VLANs set for the group, to accept any traffic regardless of tag. It
 * is deleted as soon as a VLAN filter is added (and re-instated if the
 * last VLAN filter is removed).
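 *
 * Keeping that "default" entry in sync is conceptually just a toggle on
 * whether the group currently has any VLAN filters; a sketch with
 * hypothetical helper names (not the driver's real functions) would be:
 *
 *	if (list_is_empty(&group_rx_vlans))
 *		create_default_vlan_entry(group);
 *	else
 *		delete_default_vlan_entry(group);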
 *
 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
 * space for packet data (they're a collection of scatter pointers only). TX
 * descriptors contain some space for "inline headers" (and the card requires
 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
 * but all the rest of the data comes from the gather pointers.
 *
 * When we get completions back they simply contain the ring index number of
 * the WR (work request) which completed. So, we manage the buffers for actual
 * packet data completely independently of the descriptors in this driver. When
 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
 * with the WQE index that we put it at, and therefore don't have to look at
 * the original descriptor at all when handling completions.
 *
 * For RX, we create sufficient packet data buffers to fill 150% of the
 * available descriptors for each ring. These are all pre-set-up for DMA and
 * have an mblk_t associated with them (with desballoc()).
 *
 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
 * large enough), or we copy it into a pre-allocated buffer set up in the same
 * way as for RX.
 */

/*
 * Buffer lifecycle: RX
 * --------------------
 *
 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
 * straightforward.
 *
 * It is created (and has all its memory allocated) at the time of starting up
 * the RX ring it belongs to. Then it is placed on the "free" list in the
 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
 * before making a WQE for it.
 *
 * After a completion event occurs, the packet is either discarded (and the
 * buffer_t returned to the free list), or it is readied for loaning to MAC
 * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
 *
 * Once MAC and the rest of the system have finished with the packet, they call
 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
 * the fate of the buffer_t is determined by the state of the
 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
 * will be returned to the free list, potentially to be recycled and used
 * again. But if the shard is draining (e.g. after a ring stop) there will be
 * no recycling and the buffer_t is immediately destroyed.
 *
 * At detach/teardown time, buffers are only ever destroyed from the free list.
 *
 *
 *                         +
 *                         |
 *                         | mlxcx_buf_create
 *                         |
 *                         v
 *                    +----+----+
 *                    | created |
 *                    +----+----+                        +------+
 *                         |                             | dead |
 *                         |                             +------+
 *                         | mlxcx_buf_return                ^
 *                         |                                 |
 *                         v                                 | mlxcx_buf_destroy
 * mlxcx_buf_destroy  +----+----+          +-----------+     |
 *          +---------|  free   |<------no-| draining? |-yes-+
 *          |         +----+----+          +-----------+
 *          |              |                     ^
 *          |              |                     |
 *          v              | mlxcx_buf_take      | mlxcx_buf_return
 *      +---+--+           v                     |
 *      | dead |       +---+---+                 |
 *      +------+       | on WQ |- - - - - - - - >O
 *                     +---+---+                 ^
 *                         |                     |
 *                         |                     |
 *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
 *                         v                     |
 *              +-------+--------+               |
 *              | on loan to MAC |----------->O
 *              +----------------+          freemsg()
 *
 */
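
/*
 * To make the index-stamping scheme above concrete, the bookkeeping boils
 * down to remembering which buffer went with which WQE index when we post
 * it, so that a completion (which only carries the index) can find it again.
 * This is an illustrative sketch with hypothetical names, not the driver's
 * actual ring or buffer structures:
 *
 *	void
 *	post_rx_buffer(ring_t *ring, buf_t *buf)
 *	{
 *		uint_t idx = ring->r_next & (ring->r_nents - 1);
 *
 *		buf->b_wqe_index = idx;
 *		ring->r_bufs[idx] = buf;
 *		write_wqe(ring, idx, buf->b_dma_addr, buf->b_size);
 *		ring->r_next++;
 *	}
 *
 *	buf_t *
 *	buf_for_completion(ring_t *ring, uint_t wqe_index)
 *	{
 *		return (ring->r_bufs[wqe_index & (ring->r_nents - 1)]);
 *	}
 */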
/*
 * Buffer lifecycle: TX
 * --------------------
 *
 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
 * "foreign" buffers.
 *
 * The former have their memory allocated and DMA bound by this driver, while
 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
 * not owned by us, though we do DMA bind it (and take responsibility for
 * un-binding it when we're done with them).
 *
 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
 * SQ. Thus, there is a separate free list and mutex for each kind.
 *
 * Since a TX packet might consist of multiple mblks, we translate each mblk
 * into exactly one buffer_t. The buffer_ts are chained together in the same
 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
 *
 * Each chain of TX buffers may consist of foreign or driver buffers, in any
 * mixture.
 *
 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
 * it from the rest of the chain buffers.
 *
 * TX buffer chains are always returned to the free list by
 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
 * freeing all of the members.
 *
 * We only call freemsg() once, on the head of the TX buffer chain's original
 * mblk. This is true whether we copied it or bound it in a foreign buffer.
 */

/*
 * Startup and command interface
 * -----------------------------
 *
 * The command interface is the primary way in which we give control orders to
 * the hardware (e.g. actions like "create this queue" or "delete this flow
 * entry"). The command interface is never used to transmit or receive packets
 * -- that takes place only on the queues that are set up through it.
 *
 * In mlxcx_cmd.c we implement our use of the command interface on top of a
 * simple taskq. As commands are submitted from the taskq they choose a
 * "slot"; if there are no free slots then execution of the command will
 * be paused until one is free. The hardware permits up to 32 independent
 * slots for concurrent command execution.
 *
 * Before interrupts are enabled, command completion is polled; once
 * interrupts are up, command completions become asynchronous and are
 * wired to EQ 0. A caveat to this is that commands cannot be submitted
 * directly from EQ 0's completion handler, and any processing resulting from
 * an asynchronous event which requires further use of the command interface
 * is posted through a taskq.
 *
 * The startup/attach process for this card involves a bunch of different steps
 * which are summarised pretty well in the PRM. We have to send a number of
 * commands which do different things to start the card up, give it some pages
 * of our own memory for it to use, then start creating all the entities that
 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
 * and TDoms.
 */
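
/*
 * The slot behaviour described above is effectively a counting semaphore
 * over the 32 hardware command slots: a submitter takes a slot before
 * writing the command, and gives it back once completion is observed
 * (by polling early in attach, or via EQ 0 once interrupts are enabled).
 * An illustrative sketch only, with hypothetical names rather than the
 * actual contents of mlxcx_cmd.c:
 *
 *	ksema_t cmd_slots;
 *
 *	sema_init(&cmd_slots, 32, NULL, SEMA_DRIVER, NULL);
 *
 *	static void
 *	run_command(cmd_t *cmd)
 *	{
 *		sema_p(&cmd_slots);
 *		submit_to_hw(cmd);
 *		wait_for_completion(cmd);
 *		sema_v(&cmd_slots);
 *	}
 */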
/*
 * UARs
 * ----
 *
 * The pages of the PCI BAR other than the first few are reserved for use as
 * "UAR" sections in this device. Each UAR section can be used as a set of
 * doorbells for our queues.
 *
 * Currently we just make one single UAR for all of our queues. It doesn't
 * seem to be a major limitation yet.
 *
 * When we're sending packets through an SQ, the PRM is not awfully clear about
 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
 * (it's clear on the pattern of alternation you're expected to use between
 * even and odd for Blueflame sends, but not for regular doorbells).
 *
 * Currently we don't do the even-odd alternating pattern for ordinary
 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
 * least on Connect-X4 Lx.
 */

/*
 * Lock ordering
 * -------------
 *
 * Interrupt side:
 *
 * - mleq_mtx
 * - mlcq_arm_mtx
 * - mlcq_mtx
 * - mlcq_bufbmtx
 * - mlwq_mtx
 * - mlbs_mtx
 * - mlp_mtx
 *
 * GLD side:
 *
 * - mlp_mtx
 * - mlg_mtx
 * - mlg_*.mlft_mtx
 * - mlp_*.mlft_mtx
 * - mlwq_mtx
 * - mlbs_mtx
 * - mlcq_bufbmtx
 * - mleq_mtx
 * - mlcq_arm_mtx
 * - mlcq_mtx
 *
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/time.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);

#define	MLXCX_MODULE_NAME	"mlxcx"
/*
 * We give this to the firmware, so it has to be in a fixed format that it
 * understands.
 */
#define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"

/*
 * Firmware may take a while to reclaim pages. Try a set number of times.
 */
clock_t mlxcx_reclaim_delay = 1000 * 50;	/* 50 ms in us */
uint_t mlxcx_reclaim_tries = 100;		/* Wait at most 5000ms */

static void *mlxcx_softstate;

/*
 * Fault detection thresholds.
 */
uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;

static void
mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;
	mlxcx_port_t *port = &mlxp->mlx_ports[0];

	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);

	/*
	 * Currently we have different queue size defaults for two
	 * categories of queues. One set for devices which support a
	 * maximum speed of 10Gb/s, and another for those above that.
486 */ 487 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 488 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 || 489 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G | 490 MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G | 491 MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) { 492 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 493 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 494 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 495 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 496 MLXCX_PROTO_10G)) != 0 || 497 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M | 498 MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) { 499 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 500 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 501 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 502 } else { 503 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 504 "recognize. Proto: 0x%x", port->mlp_max_proto); 505 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 506 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 507 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 508 } 509 } 510 511 /* 512 * Properties which may have different defaults based on hardware 513 * characteristics. 514 */ 515 static void 516 mlxcx_load_model_props(mlxcx_t *mlxp) 517 { 518 mlxcx_drv_props_t *p = &mlxp->mlx_props; 519 520 mlxcx_load_prop_defaults(mlxp); 521 522 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 523 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 524 p->mldp_cq_size_shift_default); 525 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 526 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 527 p->mldp_sq_size_shift_default); 528 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 529 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 530 p->mldp_rq_size_shift_default); 531 } 532 533 static void 534 mlxcx_load_props(mlxcx_t *mlxp) 535 { 536 mlxcx_drv_props_t *p = &mlxp->mlx_props; 537 538 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 539 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 540 MLXCX_EQ_SIZE_SHIFT_DFLT); 541 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 542 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 543 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 544 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 545 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 546 MLXCX_CQEMOD_COUNT_DFLT); 547 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 548 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 549 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 550 551 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 552 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 553 MLXCX_TX_NGROUPS_DFLT); 554 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 555 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 556 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 557 558 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 559 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 560 MLXCX_RX_NGROUPS_LARGE_DFLT); 561 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 562 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 563 MLXCX_RX_NGROUPS_SMALL_DFLT); 564 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 565 mlxp->mlx_dip, 
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 566 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 567 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 568 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 569 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 570 571 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 572 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 573 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 574 575 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 576 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 577 MLXCX_TX_BIND_THRESHOLD_DFLT); 578 579 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 580 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 581 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 582 583 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 584 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 585 "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 586 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 587 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 588 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 589 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 590 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 591 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 592 593 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 594 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 595 MLXCX_RX_PER_CQ_DEFAULT); 596 597 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 598 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 599 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 600 "out of range. Defaulting to: %d. Valid values are from " 601 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 602 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 603 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 604 } 605 } 606 607 void 608 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 609 { 610 va_list ap; 611 612 va_start(ap, fmt); 613 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 614 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 615 } else { 616 vcmn_err(CE_NOTE, fmt, ap); 617 } 618 va_end(ap); 619 } 620 621 void 622 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 623 { 624 va_list ap; 625 626 va_start(ap, fmt); 627 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 628 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 629 } else { 630 vcmn_err(CE_WARN, fmt, ap); 631 } 632 va_end(ap); 633 } 634 635 void 636 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 
637 { 638 va_list ap; 639 640 va_start(ap, fmt); 641 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 642 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 643 } else { 644 vcmn_err(CE_PANIC, fmt, ap); 645 } 646 va_end(ap); 647 } 648 649 uint16_t 650 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 651 { 652 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 653 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 654 } 655 656 uint32_t 657 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 658 { 659 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 660 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 661 } 662 663 uint64_t 664 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 665 { 666 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 667 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 668 } 669 670 void 671 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 672 { 673 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 674 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 675 } 676 677 void 678 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 679 { 680 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 681 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 682 } 683 684 void 685 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 686 { 687 /* 688 * The UAR is always inside the first BAR, which we mapped as 689 * mlx_regs 690 */ 691 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 692 (uintptr_t)mlxp->mlx_regs_base; 693 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 694 } 695 696 void 697 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 698 { 699 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 700 (uintptr_t)mlxp->mlx_regs_base; 701 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 702 } 703 704 static void 705 mlxcx_fm_fini(mlxcx_t *mlxp) 706 { 707 if (mlxp->mlx_fm_caps == 0) 708 return; 709 710 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 711 ddi_fm_handler_unregister(mlxp->mlx_dip); 712 713 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 714 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 715 pci_ereport_teardown(mlxp->mlx_dip); 716 717 ddi_fm_fini(mlxp->mlx_dip); 718 719 mlxp->mlx_fm_caps = 0; 720 } 721 722 void 723 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail) 724 { 725 uint64_t ena; 726 char buf[FM_MAX_CLASS]; 727 728 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 729 return; 730 731 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 732 ena = fm_ena_generate(0, FM_ENA_FMT1); 733 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 734 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 735 NULL); 736 } 737 738 static int 739 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 740 { 741 /* 742 * as the driver can always deal with an error in any dma or 743 * access handle, we can just return the fme_status value. 
744 */ 745 pci_ereport_post(dip, err, NULL); 746 return (err->fme_status); 747 } 748 749 static void 750 mlxcx_fm_init(mlxcx_t *mlxp) 751 { 752 ddi_iblock_cookie_t iblk; 753 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 754 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 755 756 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 757 DDI_PROP_DONTPASS, "fm_capable", def); 758 759 if (mlxp->mlx_fm_caps < 0) { 760 mlxp->mlx_fm_caps = 0; 761 } 762 mlxp->mlx_fm_caps &= def; 763 764 if (mlxp->mlx_fm_caps == 0) 765 return; 766 767 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 768 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 769 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 770 pci_ereport_setup(mlxp->mlx_dip); 771 } 772 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 773 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 774 (void *)mlxp); 775 } 776 } 777 778 static void 779 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 780 { 781 mlxcx_buffer_t *buf; 782 783 mutex_enter(&s->mlbs_mtx); 784 785 while (!list_is_empty(&s->mlbs_busy)) 786 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 787 788 while (!list_is_empty(&s->mlbs_loaned)) 789 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 790 791 while ((buf = list_head(&s->mlbs_free)) != NULL) 792 mlxcx_buf_destroy(mlxp, buf); 793 794 list_destroy(&s->mlbs_free); 795 list_destroy(&s->mlbs_busy); 796 list_destroy(&s->mlbs_loaned); 797 mutex_exit(&s->mlbs_mtx); 798 799 cv_destroy(&s->mlbs_free_nonempty); 800 mutex_destroy(&s->mlbs_mtx); 801 } 802 803 static void 804 mlxcx_teardown_bufs(mlxcx_t *mlxp) 805 { 806 mlxcx_buf_shard_t *s; 807 808 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 809 mlxcx_mlbs_teardown(mlxp, s); 810 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 811 } 812 list_destroy(&mlxp->mlx_buf_shards); 813 814 kmem_cache_destroy(mlxp->mlx_bufs_cache); 815 } 816 817 static void 818 mlxcx_teardown_pages(mlxcx_t *mlxp) 819 { 820 uint_t nzeros = 0; 821 uint64_t *pas; 822 823 pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES, 824 KM_SLEEP); 825 826 mutex_enter(&mlxp->mlx_pagemtx); 827 828 while (mlxp->mlx_npages > 0) { 829 int32_t req, ret; 830 831 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 832 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 833 834 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 835 mlxcx_warn(mlxp, "hardware refused to return pages, " 836 "leaking %u remaining pages", mlxp->mlx_npages); 837 goto out; 838 } 839 840 for (int32_t i = 0; i < ret; i++) { 841 mlxcx_dev_page_t *mdp, probe; 842 bzero(&probe, sizeof (probe)); 843 probe.mxdp_pa = pas[i]; 844 845 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 846 847 if (mdp != NULL) { 848 avl_remove(&mlxp->mlx_pages, mdp); 849 mlxp->mlx_npages--; 850 mlxcx_dma_free(&mdp->mxdp_dma); 851 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 852 } else { 853 mlxcx_panic(mlxp, "hardware returned a page " 854 "with PA 0x%" PRIx64 " but we have no " 855 "record of giving out such a page", pas[i]); 856 } 857 } 858 859 /* 860 * If no pages were returned, note that fact. 
861 */ 862 if (ret == 0) { 863 nzeros++; 864 if (nzeros > mlxcx_reclaim_tries) { 865 mlxcx_warn(mlxp, "hardware refused to return " 866 "pages, leaking %u remaining pages", 867 mlxp->mlx_npages); 868 goto out; 869 } 870 delay(drv_usectohz(mlxcx_reclaim_delay)); 871 } 872 } 873 874 avl_destroy(&mlxp->mlx_pages); 875 876 out: 877 mutex_exit(&mlxp->mlx_pagemtx); 878 mutex_destroy(&mlxp->mlx_pagemtx); 879 880 kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES); 881 } 882 883 static boolean_t 884 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 885 { 886 ddi_device_acc_attr_t acc; 887 ddi_dma_attr_t attr; 888 boolean_t ret; 889 size_t sz, i; 890 891 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 892 893 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 894 mleq->mleq_nents = (1 << mleq->mleq_entshift); 895 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 896 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 897 898 mlxcx_dma_acc_attr(mlxp, &acc); 899 mlxcx_dma_queue_attr(mlxp, &attr); 900 901 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 902 B_TRUE, sz, B_TRUE); 903 if (!ret) { 904 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 905 return (B_FALSE); 906 } 907 908 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 909 910 for (i = 0; i < mleq->mleq_nents; ++i) 911 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 912 913 mleq->mleq_state |= MLXCX_EQ_ALLOC; 914 915 return (B_TRUE); 916 } 917 918 static void 919 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 920 { 921 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 922 if (mleq->mleq_state & MLXCX_EQ_CREATED) 923 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 924 925 mlxcx_dma_free(&mleq->mleq_dma); 926 mleq->mleq_ent = NULL; 927 928 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 929 } 930 931 void 932 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 933 { 934 mlxcx_flow_group_t *fg; 935 mlxcx_flow_entry_t *fe; 936 int i; 937 938 ASSERT(mutex_owned(&ft->mlft_mtx)); 939 940 for (i = ft->mlft_nents - 1; i >= 0; --i) { 941 fe = &ft->mlft_ent[i]; 942 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 943 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 944 mlxcx_panic(mlxp, "failed to delete flow " 945 "entry %u on table %u", i, 946 ft->mlft_num); 947 } 948 } 949 } 950 951 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 952 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 953 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 954 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 955 mlxcx_panic(mlxp, "failed to destroy flow " 956 "group %u", fg->mlfg_num); 957 } 958 } 959 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 960 } 961 list_destroy(&ft->mlft_groups); 962 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 963 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 964 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 965 mlxcx_panic(mlxp, "failed to destroy flow table %u", 966 ft->mlft_num); 967 } 968 } 969 kmem_free(ft->mlft_ent, ft->mlft_entsize); 970 ft->mlft_ent = NULL; 971 mutex_exit(&ft->mlft_mtx); 972 mutex_destroy(&ft->mlft_mtx); 973 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 974 } 975 976 static void 977 mlxcx_teardown_ports(mlxcx_t *mlxp) 978 { 979 uint_t i; 980 mlxcx_port_t *p; 981 mlxcx_flow_table_t *ft; 982 983 for (i = 0; i < mlxp->mlx_nports; ++i) { 984 p = &mlxp->mlx_ports[i]; 985 if (!(p->mlp_init & MLXCX_PORT_INIT)) 986 continue; 987 mutex_enter(&p->mlp_mtx); 988 if ((ft = p->mlp_rx_flow) != NULL) { 989 mutex_enter(&ft->mlft_mtx); 990 /* 991 * 
teardown_flow_table() will destroy the mutex, so 992 * we don't release it here. 993 */ 994 mlxcx_teardown_flow_table(mlxp, ft); 995 } 996 mutex_exit(&p->mlp_mtx); 997 mutex_destroy(&p->mlp_mtx); 998 mutex_destroy(&p->mlx_port_event.mla_mtx); 999 p->mlx_port_event.mla_mlx = NULL; 1000 p->mlx_port_event.mla_port = NULL; 1001 p->mlp_init &= ~MLXCX_PORT_INIT; 1002 } 1003 1004 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 1005 mlxp->mlx_ports = NULL; 1006 } 1007 1008 static void 1009 mlxcx_teardown_wqs(mlxcx_t *mlxp) 1010 { 1011 mlxcx_work_queue_t *mlwq; 1012 1013 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 1014 mlxcx_wq_teardown(mlxp, mlwq); 1015 } 1016 list_destroy(&mlxp->mlx_wqs); 1017 } 1018 1019 static void 1020 mlxcx_teardown_cqs(mlxcx_t *mlxp) 1021 { 1022 mlxcx_completion_queue_t *mlcq; 1023 1024 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 1025 mlxcx_cq_teardown(mlxp, mlcq); 1026 } 1027 list_destroy(&mlxp->mlx_cqs); 1028 } 1029 1030 static void 1031 mlxcx_teardown_eqs(mlxcx_t *mlxp) 1032 { 1033 mlxcx_event_queue_t *mleq; 1034 uint_t i; 1035 1036 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1037 mleq = &mlxp->mlx_eqs[i]; 1038 mutex_enter(&mleq->mleq_mtx); 1039 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1040 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1041 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1042 mlxcx_warn(mlxp, "failed to destroy " 1043 "event queue idx %u eqn %u", 1044 i, mleq->mleq_num); 1045 } 1046 } 1047 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1048 mlxcx_eq_rele_dma(mlxp, mleq); 1049 } 1050 mutex_exit(&mleq->mleq_mtx); 1051 } 1052 } 1053 1054 static void 1055 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1056 { 1057 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1058 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1059 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1060 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1061 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1062 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1063 } 1064 1065 static void 1066 mlxcx_teardown(mlxcx_t *mlxp) 1067 { 1068 uint_t i; 1069 dev_info_t *dip = mlxp->mlx_dip; 1070 1071 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1072 /* 1073 * Disable interrupts and let any active vectors quiesce. 
1074 */ 1075 mlxcx_intr_disable(mlxp); 1076 } 1077 1078 if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) { 1079 mlxcx_teardown_sensors(mlxp); 1080 mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS; 1081 } 1082 1083 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1084 mlxcx_teardown_checktimers(mlxp); 1085 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1086 } 1087 1088 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1089 mlxcx_teardown_groups(mlxp); 1090 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1091 } 1092 1093 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1094 mlxcx_teardown_wqs(mlxp); 1095 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1096 } 1097 1098 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1099 mlxcx_teardown_cqs(mlxp); 1100 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1101 } 1102 1103 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1104 mlxcx_teardown_bufs(mlxp); 1105 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1106 } 1107 1108 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1109 mlxcx_teardown_ports(mlxp); 1110 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1111 } 1112 1113 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1114 mlxcx_teardown_eqs(mlxp); 1115 mlxcx_intr_teardown(mlxp); 1116 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1117 } 1118 1119 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1120 if (mlxp->mlx_uar.mlu_allocated) { 1121 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1122 mlxcx_warn(mlxp, "failed to release UAR"); 1123 } 1124 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1125 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1126 } 1127 if (mlxp->mlx_pd.mlpd_allocated && 1128 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1129 mlxcx_warn(mlxp, "failed to release PD"); 1130 } 1131 if (mlxp->mlx_tdom.mltd_allocated && 1132 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1133 mlxcx_warn(mlxp, "failed to release TDOM"); 1134 } 1135 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1136 } 1137 1138 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1139 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1140 mlxcx_warn(mlxp, "failed to send teardown HCA " 1141 "command during device detach"); 1142 } 1143 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1144 } 1145 1146 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1147 mlxcx_teardown_pages(mlxp); 1148 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1149 } 1150 1151 if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) { 1152 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 1153 mlxp->mlx_npages_req[i].mla_mlx = NULL; 1154 mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx); 1155 } 1156 taskq_destroy(mlxp->mlx_async_tq); 1157 mlxp->mlx_async_tq = NULL; 1158 mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ; 1159 } 1160 1161 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1162 if (!mlxcx_cmd_disable_hca(mlxp)) { 1163 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1164 "during device detach"); 1165 } 1166 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1167 } 1168 1169 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1170 mlxcx_cmd_queue_fini(mlxp); 1171 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1172 } 1173 1174 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1175 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1176 mlxp->mlx_caps = NULL; 1177 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1178 } 1179 1180 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1181 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1182 mlxp->mlx_regs_handle = NULL; 1183 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1184 } 1185 1186 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1187 pci_config_teardown(&mlxp->mlx_cfg_handle); 1188 mlxp->mlx_cfg_handle = NULL; 1189 
mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1190 } 1191 1192 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1193 mlxcx_fm_fini(mlxp); 1194 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1195 } 1196 1197 VERIFY3S(mlxp->mlx_attach, ==, 0); 1198 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1199 ddi_set_driver_private(dip, NULL); 1200 } 1201 1202 static boolean_t 1203 mlxcx_regs_map(mlxcx_t *mlxp) 1204 { 1205 off_t memsize; 1206 int ret; 1207 ddi_device_acc_attr_t da; 1208 1209 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1210 DDI_SUCCESS) { 1211 mlxcx_warn(mlxp, "failed to get register set size"); 1212 return (B_FALSE); 1213 } 1214 1215 /* 1216 * All data in the main BAR is kept in big-endian even though it's a PCI 1217 * device. 1218 */ 1219 bzero(&da, sizeof (ddi_device_acc_attr_t)); 1220 da.devacc_attr_version = DDI_DEVICE_ATTR_V0; 1221 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; 1222 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 1223 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { 1224 da.devacc_attr_access = DDI_FLAGERR_ACC; 1225 } else { 1226 da.devacc_attr_access = DDI_DEFAULT_ACC; 1227 } 1228 1229 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER, 1230 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle); 1231 1232 if (ret != DDI_SUCCESS) { 1233 mlxcx_warn(mlxp, "failed to map device registers: %d", ret); 1234 return (B_FALSE); 1235 } 1236 1237 return (B_TRUE); 1238 } 1239 1240 static boolean_t 1241 mlxcx_check_issi(mlxcx_t *mlxp) 1242 { 1243 uint32_t issi; 1244 1245 if (!mlxcx_cmd_query_issi(mlxp, &issi)) { 1246 mlxcx_warn(mlxp, "failed to get ISSI"); 1247 return (B_FALSE); 1248 } 1249 1250 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) { 1251 mlxcx_warn(mlxp, "hardware does not support software ISSI, " 1252 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI); 1253 return (B_FALSE); 1254 } 1255 1256 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) { 1257 mlxcx_warn(mlxp, "failed to set ISSI to %u", 1258 MLXCX_CURRENT_ISSI); 1259 return (B_FALSE); 1260 } 1261 1262 return (B_TRUE); 1263 } 1264 1265 boolean_t 1266 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven) 1267 { 1268 ddi_device_acc_attr_t acc; 1269 ddi_dma_attr_t attr; 1270 int32_t i; 1271 list_t plist; 1272 mlxcx_dev_page_t *mdp; 1273 mlxcx_dev_page_t **pages; 1274 const ddi_dma_cookie_t *ck; 1275 1276 /* 1277 * If there are no pages required, then we're done here. 1278 */ 1279 if (npages <= 0) { 1280 *ngiven = 0; 1281 return (B_TRUE); 1282 } 1283 1284 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 1285 1286 pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP); 1287 1288 list_create(&plist, sizeof (mlxcx_dev_page_t), 1289 offsetof(mlxcx_dev_page_t, mxdp_list)); 1290 1291 for (i = 0; i < npages; i++) { 1292 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 1293 mlxcx_dma_acc_attr(mlxp, &acc); 1294 mlxcx_dma_page_attr(mlxp, &attr); 1295 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 1296 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 1297 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 1298 npages); 1299 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1300 goto cleanup_npages; 1301 } 1302 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 1303 mdp->mxdp_pa = ck->dmac_laddress; 1304 1305 list_insert_tail(&plist, mdp); 1306 } 1307 1308 /* 1309 * Now that all of the pages have been allocated, given them to hardware 1310 * in chunks. 
1311 */ 1312 for (i = 0; i < npages; i++) { 1313 pages[i] = list_remove_head(&plist); 1314 } 1315 1316 if (!mlxcx_cmd_give_pages(mlxp, 1317 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) { 1318 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1319 "pages!", npages); 1320 for (i = 0; i < npages; i++) { 1321 list_insert_tail(&plist, pages[i]); 1322 } 1323 goto cleanup_npages; 1324 } 1325 1326 mutex_enter(&mlxp->mlx_pagemtx); 1327 for (i = 0; i < npages; i++) { 1328 avl_add(&mlxp->mlx_pages, pages[i]); 1329 } 1330 mlxp->mlx_npages += npages; 1331 mutex_exit(&mlxp->mlx_pagemtx); 1332 1333 list_destroy(&plist); 1334 kmem_free(pages, sizeof (*pages) * npages); 1335 1336 *ngiven = npages; 1337 1338 return (B_TRUE); 1339 1340 cleanup_npages: 1341 kmem_free(pages, sizeof (*pages) * npages); 1342 while ((mdp = list_remove_head(&plist)) != NULL) { 1343 mlxcx_dma_free(&mdp->mxdp_dma); 1344 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1345 } 1346 list_destroy(&plist); 1347 return (B_FALSE); 1348 } 1349 1350 static boolean_t 1351 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1352 { 1353 int32_t npages, given; 1354 1355 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1356 mlxcx_warn(mlxp, "failed to determine boot pages"); 1357 return (B_FALSE); 1358 } 1359 1360 while (npages > 0) { 1361 if (!mlxcx_give_pages(mlxp, npages, &given)) 1362 return (B_FALSE); 1363 1364 npages -= given; 1365 } 1366 1367 return (B_TRUE); 1368 } 1369 1370 static int 1371 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1372 { 1373 mlxcx_t *mlxp = cookie; 1374 mlxcx_buffer_t *b = arg; 1375 1376 bzero(b, sizeof (mlxcx_buffer_t)); 1377 b->mlb_mlx = mlxp; 1378 b->mlb_state = MLXCX_BUFFER_INIT; 1379 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1380 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1381 1382 return (0); 1383 } 1384 1385 static void 1386 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1387 { 1388 mlxcx_t *mlxp = cookie; 1389 mlxcx_buffer_t *b = arg; 1390 VERIFY3P(b->mlb_mlx, ==, mlxp); 1391 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1392 list_destroy(&b->mlb_tx_chain); 1393 } 1394 1395 mlxcx_buf_shard_t * 1396 mlxcx_mlbs_create(mlxcx_t *mlxp) 1397 { 1398 mlxcx_buf_shard_t *s; 1399 1400 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1401 1402 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1403 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1404 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1405 offsetof(mlxcx_buffer_t, mlb_entry)); 1406 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1407 offsetof(mlxcx_buffer_t, mlb_entry)); 1408 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), 1409 offsetof(mlxcx_buffer_t, mlb_entry)); 1410 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1411 1412 list_insert_tail(&mlxp->mlx_buf_shards, s); 1413 1414 return (s); 1415 } 1416 1417 static boolean_t 1418 mlxcx_setup_bufs(mlxcx_t *mlxp) 1419 { 1420 char namebuf[KSTAT_STRLEN]; 1421 1422 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1423 ddi_get_instance(mlxp->mlx_dip)); 1424 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1425 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1426 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1427 NULL, mlxp, NULL, 0); 1428 1429 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1430 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1431 1432 return (B_TRUE); 1433 } 1434 1435 static void 1436 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1437 const char *state, uint8_t statenum) 1438 { 1439 uint64_t ena; 1440 char 
buf[FM_MAX_CLASS]; 1441 1442 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1443 return; 1444 1445 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1446 MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1447 ena = fm_ena_generate(0, FM_ENA_FMT1); 1448 1449 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1450 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1451 "state", DATA_TYPE_STRING, state, 1452 "state_num", DATA_TYPE_UINT8, statenum, 1453 "qtype", DATA_TYPE_STRING, qtype, 1454 "qnum", DATA_TYPE_UINT32, qnum, 1455 NULL); 1456 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1457 } 1458 1459 /* 1460 * The following set of routines are for monitoring the health of 1461 * event, completion and work queues. They run infrequently peeking at 1462 * the structs to catch stalls and inconsistent state. 1463 * 1464 * They peek at the structs *without* acquiring locks - we don't want 1465 * to impede flow of data. Driver start up and shutdown semantics 1466 * guarantee the structs are present and won't disappear underneath 1467 * these routines. 1468 * 1469 * As previously noted, the routines peek at active data in the structs and 1470 * they will store some values for comparison on next invocation. To 1471 * maintain integrity of the saved values, these values are only modified 1472 * within these routines. 1473 */ 1474 static void 1475 mlxcx_eq_check(void *arg) 1476 { 1477 mlxcx_t *mlxp = (mlxcx_t *)arg; 1478 mlxcx_event_queue_t *eq; 1479 mlxcx_eventq_ctx_t ctx; 1480 const char *str; 1481 1482 uint_t i; 1483 1484 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1485 eq = &mlxp->mlx_eqs[i]; 1486 1487 if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0) 1488 continue; 1489 1490 /* 1491 * If the event queue was successfully created in the HCA, 1492 * then initialization and shutdown sequences guarantee 1493 * the queue exists. 1494 */ 1495 ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED); 1496 1497 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) 1498 continue; 1499 1500 str = "???"; 1501 switch (ctx.mleqc_status) { 1502 case MLXCX_EQ_STATUS_OK: 1503 break; 1504 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1505 str = "WRITE_FAILURE"; 1506 break; 1507 } 1508 1509 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1510 mlxcx_fm_qstate_ereport(mlxp, "event", 1511 eq->mleq_num, str, ctx.mleqc_status); 1512 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1513 eq->mleq_intr_index, ctx.mleqc_status, str); 1514 } 1515 1516 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1517 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1518 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1519 ++eq->mleq_check_disarm_cnt >= 3) { 1520 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1521 mlxcx_warn(mlxp, "EQ %u isn't armed", 1522 eq->mleq_intr_index); 1523 } 1524 eq->mleq_check_disarm_cc = eq->mleq_cc; 1525 } else { 1526 eq->mleq_check_disarm_cc = 0; 1527 eq->mleq_check_disarm_cnt = 0; 1528 } 1529 } 1530 } 1531 1532 static void 1533 mlxcx_cq_check(void *arg) 1534 { 1535 mlxcx_t *mlxp = (mlxcx_t *)arg; 1536 mlxcx_completion_queue_t *cq; 1537 mlxcx_completionq_ctx_t ctx; 1538 const char *str, *type; 1539 uint_t v; 1540 1541 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1542 cq = list_next(&mlxp->mlx_cqs, cq)) { 1543 1544 if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0) 1545 continue; 1546 1547 /* 1548 * If the completion queue was successfully created in the HCA, 1549 * then initialization and shutdown sequences guarantee 1550 * the queue exists. 
1551 */ 1552 ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED); 1553 ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN); 1554 1555 if (cq->mlcq_fm_repd_qstate) 1556 continue; 1557 1558 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) 1559 continue; 1560 1561 if (cq->mlcq_wq != NULL) { 1562 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1563 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1564 type = "rx "; 1565 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1566 type = "tx "; 1567 else 1568 type = ""; 1569 } else { 1570 type = ""; 1571 } 1572 1573 str = "???"; 1574 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1575 switch (v) { 1576 case MLXCX_CQC_STATUS_OK: 1577 break; 1578 case MLXCX_CQC_STATUS_OVERFLOW: 1579 str = "OVERFLOW"; 1580 break; 1581 case MLXCX_CQC_STATUS_WRITE_FAIL: 1582 str = "WRITE_FAIL"; 1583 break; 1584 case MLXCX_CQC_STATUS_INVALID: 1585 str = "INVALID"; 1586 break; 1587 } 1588 1589 if (v != MLXCX_CQC_STATUS_OK) { 1590 mlxcx_fm_qstate_ereport(mlxp, "completion", 1591 cq->mlcq_num, str, v); 1592 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1593 type, cq->mlcq_num, v, str); 1594 cq->mlcq_fm_repd_qstate = B_TRUE; 1595 } 1596 1597 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1598 if (v != MLXCX_CQC_STATE_ARMED && 1599 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1600 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1601 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1602 ++cq->mlcq_check_disarm_cnt >= 3) { 1603 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1604 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1605 type, cq->mlcq_num, cq); 1606 } 1607 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1608 } else { 1609 cq->mlcq_check_disarm_cnt = 0; 1610 cq->mlcq_check_disarm_cc = 0; 1611 } 1612 } 1613 } 1614 1615 void 1616 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1617 { 1618 mlxcx_sq_ctx_t ctx; 1619 mlxcx_sq_state_t state; 1620 1621 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1622 return; 1623 1624 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1625 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1626 switch (state) { 1627 case MLXCX_SQ_STATE_RST: 1628 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1629 mlxcx_fm_qstate_ereport(mlxp, "send", 1630 sq->mlwq_num, "RST", state); 1631 sq->mlwq_fm_repd_qstate = B_TRUE; 1632 } 1633 break; 1634 case MLXCX_SQ_STATE_RDY: 1635 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1636 mlxcx_fm_qstate_ereport(mlxp, "send", 1637 sq->mlwq_num, "RDY", state); 1638 sq->mlwq_fm_repd_qstate = B_TRUE; 1639 } 1640 break; 1641 case MLXCX_SQ_STATE_ERR: 1642 mlxcx_fm_qstate_ereport(mlxp, "send", 1643 sq->mlwq_num, "ERR", state); 1644 sq->mlwq_fm_repd_qstate = B_TRUE; 1645 break; 1646 default: 1647 mlxcx_fm_qstate_ereport(mlxp, "send", 1648 sq->mlwq_num, "???", state); 1649 sq->mlwq_fm_repd_qstate = B_TRUE; 1650 break; 1651 } 1652 } 1653 1654 void 1655 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1656 { 1657 mlxcx_rq_ctx_t ctx; 1658 mlxcx_rq_state_t state; 1659 1660 1661 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1662 return; 1663 1664 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1665 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1666 switch (state) { 1667 case MLXCX_RQ_STATE_RST: 1668 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1669 mlxcx_fm_qstate_ereport(mlxp, "receive", 1670 rq->mlwq_num, "RST", state); 1671 rq->mlwq_fm_repd_qstate = B_TRUE; 1672 } 1673 break; 1674 case MLXCX_RQ_STATE_RDY: 1675 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1676 mlxcx_fm_qstate_ereport(mlxp, "receive", 1677 rq->mlwq_num, "RDY", state); 1678 
rq->mlwq_fm_repd_qstate = B_TRUE; 1679 } 1680 break; 1681 case MLXCX_RQ_STATE_ERR: 1682 mlxcx_fm_qstate_ereport(mlxp, "receive", 1683 rq->mlwq_num, "ERR", state); 1684 rq->mlwq_fm_repd_qstate = B_TRUE; 1685 break; 1686 default: 1687 mlxcx_fm_qstate_ereport(mlxp, "receive", 1688 rq->mlwq_num, "???", state); 1689 rq->mlwq_fm_repd_qstate = B_TRUE; 1690 break; 1691 } 1692 } 1693 1694 static void 1695 mlxcx_wq_check(void *arg) 1696 { 1697 mlxcx_t *mlxp = (mlxcx_t *)arg; 1698 mlxcx_work_queue_t *wq; 1699 1700 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1701 wq = list_next(&mlxp->mlx_wqs, wq)) { 1702 1703 if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0) 1704 continue; 1705 1706 /* 1707 * If the work queue was successfully created in the HCA, 1708 * then initialization and shutdown sequences guarantee 1709 * the queue exists. 1710 */ 1711 ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED); 1712 ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN); 1713 1714 if (wq->mlwq_fm_repd_qstate) 1715 continue; 1716 1717 switch (wq->mlwq_type) { 1718 case MLXCX_WQ_TYPE_SENDQ: 1719 mlxcx_check_sq(mlxp, wq); 1720 break; 1721 case MLXCX_WQ_TYPE_RECVQ: 1722 mlxcx_check_rq(mlxp, wq); 1723 break; 1724 } 1725 } 1726 } 1727 1728 static boolean_t 1729 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1730 { 1731 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1732 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1733 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1734 DDI_IPL_0); 1735 } 1736 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1737 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1738 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1739 DDI_IPL_0); 1740 } 1741 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1742 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1743 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1744 DDI_IPL_0); 1745 } 1746 return (B_TRUE); 1747 } 1748 1749 int 1750 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1751 { 1752 const mlxcx_flow_entry_t *left = arg0; 1753 const mlxcx_flow_entry_t *right = arg1; 1754 int bcmpr; 1755 1756 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1757 sizeof (left->mlfe_dmac)); 1758 if (bcmpr < 0) 1759 return (-1); 1760 if (bcmpr > 0) 1761 return (1); 1762 if (left->mlfe_vid < right->mlfe_vid) 1763 return (-1); 1764 if (left->mlfe_vid > right->mlfe_vid) 1765 return (1); 1766 return (0); 1767 } 1768 1769 int 1770 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1771 { 1772 const mlxcx_group_mac_t *left = arg0; 1773 const mlxcx_group_mac_t *right = arg1; 1774 int bcmpr; 1775 1776 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1777 sizeof (left->mlgm_mac)); 1778 if (bcmpr < 0) 1779 return (-1); 1780 if (bcmpr > 0) 1781 return (1); 1782 return (0); 1783 } 1784 1785 int 1786 mlxcx_page_compare(const void *arg0, const void *arg1) 1787 { 1788 const mlxcx_dev_page_t *p0 = arg0; 1789 const mlxcx_dev_page_t *p1 = arg1; 1790 1791 if (p0->mxdp_pa < p1->mxdp_pa) 1792 return (-1); 1793 if (p0->mxdp_pa > p1->mxdp_pa) 1794 return (1); 1795 return (0); 1796 } 1797 1798 static boolean_t 1799 mlxcx_setup_ports(mlxcx_t *mlxp) 1800 { 1801 uint_t i, j; 1802 mlxcx_port_t *p; 1803 mlxcx_flow_table_t *ft; 1804 mlxcx_flow_group_t *fg; 1805 mlxcx_flow_entry_t *fe; 1806 1807 VERIFY3U(mlxp->mlx_nports, >, 0); 1808 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1809 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1810 1811 for (i = 0; i < mlxp->mlx_nports; ++i) { 1812 p = 
&mlxp->mlx_ports[i]; 1813 p->mlp_num = i; 1814 p->mlx_port_event.mla_mlx = mlxp; 1815 p->mlx_port_event.mla_port = p; 1816 mutex_init(&p->mlx_port_event.mla_mtx, NULL, 1817 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 1818 p->mlp_init |= MLXCX_PORT_INIT; 1819 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1820 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1821 mutex_enter(&p->mlp_mtx); 1822 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1823 mutex_exit(&p->mlp_mtx); 1824 goto err; 1825 } 1826 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1827 mutex_exit(&p->mlp_mtx); 1828 goto err; 1829 } 1830 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1831 mutex_exit(&p->mlp_mtx); 1832 goto err; 1833 } 1834 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1835 mutex_exit(&p->mlp_mtx); 1836 goto err; 1837 } 1838 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1839 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1840 mutex_exit(&p->mlp_mtx); 1841 goto err; 1842 } 1843 if (!mlxcx_cmd_query_port_fec(mlxp, p)) { 1844 mutex_exit(&p->mlp_mtx); 1845 goto err; 1846 } 1847 p->mlp_fec_requested = LINK_FEC_AUTO; 1848 1849 mutex_exit(&p->mlp_mtx); 1850 } 1851 1852 for (i = 0; i < mlxp->mlx_nports; ++i) { 1853 p = &mlxp->mlx_ports[i]; 1854 mutex_enter(&p->mlp_mtx); 1855 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1856 KM_SLEEP)); 1857 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1858 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1859 1860 mutex_enter(&ft->mlft_mtx); 1861 1862 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1863 ft->mlft_port = p; 1864 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1865 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1866 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1867 ft->mlft_nents = (1 << ft->mlft_entshift); 1868 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1869 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1870 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1871 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1872 1873 for (j = 0; j < ft->mlft_nents; ++j) { 1874 ft->mlft_ent[j].mlfe_table = ft; 1875 ft->mlft_ent[j].mlfe_index = j; 1876 } 1877 1878 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1879 mutex_exit(&ft->mlft_mtx); 1880 mutex_exit(&p->mlp_mtx); 1881 goto err; 1882 } 1883 1884 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1885 mutex_exit(&ft->mlft_mtx); 1886 mutex_exit(&p->mlp_mtx); 1887 goto err; 1888 } 1889 1890 /* 1891 * We match broadcast at the top of the root flow table, then 1892 * all multicast/unicast MACs, then the promisc entry is down 1893 * the very bottom. 1894 * 1895 * This way when promisc is on, that entry simply catches any 1896 * remaining traffic that earlier flows haven't matched. 
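 *
 * With N entries in the root table (mlft_nents), the layout set up
 * below ends up as:
 *
 *   entry 0           broadcast group (DMAC ff:ff:ff:ff:ff:ff)
 *   entries 1..N-2    unicast/multicast DMAC group
 *   entry N-1         promisc group (empty match, catches everything)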
1897 */ 1898 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1899 list_insert_tail(&ft->mlft_groups, fg); 1900 fg->mlfg_table = ft; 1901 fg->mlfg_size = 1; 1902 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1903 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1904 mutex_exit(&ft->mlft_mtx); 1905 mutex_exit(&p->mlp_mtx); 1906 goto err; 1907 } 1908 p->mlp_bcast = fg; 1909 fe = list_head(&fg->mlfg_entries); 1910 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1911 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1912 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1913 1914 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1915 list_insert_tail(&ft->mlft_groups, fg); 1916 fg->mlfg_table = ft; 1917 fg->mlfg_size = ft->mlft_nents - 2; 1918 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1919 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1920 mutex_exit(&ft->mlft_mtx); 1921 mutex_exit(&p->mlp_mtx); 1922 goto err; 1923 } 1924 p->mlp_umcast = fg; 1925 1926 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1927 list_insert_tail(&ft->mlft_groups, fg); 1928 fg->mlfg_table = ft; 1929 fg->mlfg_size = 1; 1930 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1931 mutex_exit(&ft->mlft_mtx); 1932 mutex_exit(&p->mlp_mtx); 1933 goto err; 1934 } 1935 p->mlp_promisc = fg; 1936 fe = list_head(&fg->mlfg_entries); 1937 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1938 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1939 1940 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1941 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1942 mlfe_dmac_entry)); 1943 1944 mutex_exit(&ft->mlft_mtx); 1945 mutex_exit(&p->mlp_mtx); 1946 } 1947 1948 return (B_TRUE); 1949 1950 err: 1951 mlxcx_teardown_ports(mlxp); 1952 return (B_FALSE); 1953 } 1954 1955 void 1956 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1957 { 1958 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1959 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1960 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1961 mlxcx_flow_entry_t *fe; 1962 mlxcx_group_vlan_t *v; 1963 1964 ASSERT(mutex_owned(&g->mlg_mtx)); 1965 1966 mutex_enter(&ft->mlft_mtx); 1967 1968 if (!list_is_empty(&g->mlg_rx_vlans)) { 1969 fe = list_head(&dfg->mlfg_entries); 1970 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 1971 } 1972 1973 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 1974 fe = v->mlgv_fe; 1975 ASSERT3P(fe->mlfe_table, ==, ft); 1976 ASSERT3P(fe->mlfe_group, ==, fg); 1977 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1978 1979 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1980 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1981 } 1982 1983 mutex_exit(&ft->mlft_mtx); 1984 } 1985 1986 boolean_t 1987 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1988 boolean_t tagged, uint16_t vid) 1989 { 1990 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1991 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1992 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1993 mlxcx_flow_entry_t *fe; 1994 mlxcx_group_vlan_t *v; 1995 boolean_t found = B_FALSE; 1996 1997 ASSERT(mutex_owned(&g->mlg_mtx)); 1998 1999 mutex_enter(&ft->mlft_mtx); 2000 2001 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2002 v = list_next(&g->mlg_rx_vlans, v)) { 2003 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2004 found = B_TRUE; 2005 break; 2006 } 2007 } 2008 if (!found) { 2009 mutex_exit(&ft->mlft_mtx); 2010 return (B_FALSE); 2011 } 2012 2013 list_remove(&g->mlg_rx_vlans, v); 2014 2015 /* 2016 * If this is the last VLAN entry, we have to go back to accepting 2017 * any VLAN (which 
means re-enabling the default entry). 2018 * 2019 * Do this before we remove the flow entry for the last specific 2020 * VLAN so that we don't lose any traffic in the transition. 2021 */ 2022 if (list_is_empty(&g->mlg_rx_vlans)) { 2023 fe = list_head(&dfg->mlfg_entries); 2024 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2025 list_insert_tail(&g->mlg_rx_vlans, v); 2026 mutex_exit(&ft->mlft_mtx); 2027 return (B_FALSE); 2028 } 2029 } 2030 2031 fe = v->mlgv_fe; 2032 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 2033 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 2034 ASSERT3P(fe->mlfe_table, ==, ft); 2035 ASSERT3P(fe->mlfe_group, ==, fg); 2036 2037 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 2038 list_insert_tail(&g->mlg_rx_vlans, v); 2039 fe = list_head(&dfg->mlfg_entries); 2040 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 2041 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2042 } 2043 mutex_exit(&ft->mlft_mtx); 2044 return (B_FALSE); 2045 } 2046 2047 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2048 2049 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2050 2051 mutex_exit(&ft->mlft_mtx); 2052 return (B_TRUE); 2053 } 2054 2055 boolean_t 2056 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 2057 uint16_t vid) 2058 { 2059 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2060 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2061 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2062 mlxcx_flow_entry_t *fe; 2063 mlxcx_group_vlan_t *v; 2064 boolean_t found = B_FALSE; 2065 boolean_t first = B_FALSE; 2066 2067 ASSERT(mutex_owned(&g->mlg_mtx)); 2068 2069 mutex_enter(&ft->mlft_mtx); 2070 2071 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2072 v = list_next(&g->mlg_rx_vlans, v)) { 2073 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2074 mutex_exit(&ft->mlft_mtx); 2075 return (B_TRUE); 2076 } 2077 } 2078 if (list_is_empty(&g->mlg_rx_vlans)) 2079 first = B_TRUE; 2080 2081 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2082 fe = list_next(&fg->mlfg_entries, fe)) { 2083 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2084 found = B_TRUE; 2085 break; 2086 } 2087 } 2088 if (!found) { 2089 mutex_exit(&ft->mlft_mtx); 2090 return (B_FALSE); 2091 } 2092 2093 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 2094 v->mlgv_fe = fe; 2095 v->mlgv_tagged = tagged; 2096 v->mlgv_vid = vid; 2097 2098 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2099 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2100 fe->mlfe_vid = vid; 2101 if (tagged) { 2102 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2103 } else { 2104 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2105 } 2106 2107 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2108 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2109 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2110 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2111 mutex_exit(&ft->mlft_mtx); 2112 return (B_FALSE); 2113 } 2114 2115 list_insert_tail(&g->mlg_rx_vlans, v); 2116 2117 /* 2118 * If the vlan list was empty for this group before adding this one, 2119 * then we no longer want the "default" entry to allow all VLANs 2120 * through. 
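 *
 * We only delete it now, after the specific VLAN entry above has been
 * successfully installed, so that (as in the removal path) no traffic
 * is dropped during the transition.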
2121 */ 2122 if (first) { 2123 fe = list_head(&dfg->mlfg_entries); 2124 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2125 } 2126 2127 mutex_exit(&ft->mlft_mtx); 2128 return (B_TRUE); 2129 } 2130 2131 void 2132 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2133 mlxcx_ring_group_t *group) 2134 { 2135 mlxcx_flow_entry_t *fe; 2136 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2137 mlxcx_group_mac_t *gm, *ngm; 2138 2139 ASSERT(mutex_owned(&port->mlp_mtx)); 2140 ASSERT(mutex_owned(&group->mlg_mtx)); 2141 2142 mutex_enter(&ft->mlft_mtx); 2143 2144 gm = avl_first(&group->mlg_rx_macs); 2145 for (; gm != NULL; gm = ngm) { 2146 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2147 2148 ASSERT3P(gm->mlgm_group, ==, group); 2149 fe = gm->mlgm_fe; 2150 ASSERT3P(fe->mlfe_table, ==, ft); 2151 2152 avl_remove(&group->mlg_rx_macs, gm); 2153 list_remove(&fe->mlfe_ring_groups, gm); 2154 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2155 2156 fe->mlfe_ndest = 0; 2157 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2158 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2159 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2160 gm->mlgm_group->mlg_rx_vlan_ft; 2161 } 2162 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2163 2164 if (fe->mlfe_ndest > 0) { 2165 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2166 continue; 2167 } 2168 2169 /* 2170 * There are no more ring groups left for this MAC (it wasn't 2171 * attached to any other groups since ndest == 0), so clean up 2172 * its flow entry. 2173 */ 2174 avl_remove(&port->mlp_dmac_fe, fe); 2175 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2176 list_destroy(&fe->mlfe_ring_groups); 2177 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2178 } 2179 2180 mutex_exit(&ft->mlft_mtx); 2181 } 2182 2183 boolean_t 2184 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2185 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2186 { 2187 mlxcx_flow_entry_t *fe; 2188 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2189 mlxcx_group_mac_t *gm, probe; 2190 2191 ASSERT(mutex_owned(&port->mlp_mtx)); 2192 ASSERT(mutex_owned(&group->mlg_mtx)); 2193 2194 bzero(&probe, sizeof (probe)); 2195 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2196 2197 mutex_enter(&ft->mlft_mtx); 2198 2199 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2200 if (gm == NULL) { 2201 mutex_exit(&ft->mlft_mtx); 2202 return (B_FALSE); 2203 } 2204 ASSERT3P(gm->mlgm_group, ==, group); 2205 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2206 2207 fe = gm->mlgm_fe; 2208 ASSERT3P(fe->mlfe_table, ==, ft); 2209 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2210 2211 list_remove(&fe->mlfe_ring_groups, gm); 2212 avl_remove(&group->mlg_rx_macs, gm); 2213 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2214 2215 fe->mlfe_ndest = 0; 2216 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2217 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2218 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2219 gm->mlgm_group->mlg_rx_vlan_ft; 2220 } 2221 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2222 2223 if (fe->mlfe_ndest > 0) { 2224 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2225 mutex_exit(&ft->mlft_mtx); 2226 return (B_FALSE); 2227 } 2228 mutex_exit(&ft->mlft_mtx); 2229 return (B_TRUE); 2230 } 2231 2232 /* 2233 * There are no more ring groups left for this MAC (it wasn't attached 2234 * to any other groups since ndest == 0), so clean up its flow entry. 
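 *
 * Clearing MLXCX_FLOW_ENTRY_RESERVED below returns the entry to the
 * port's unicast/multicast flow group, where a later
 * mlxcx_add_umcast_entry() can reserve it again.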
2235 */ 2236 avl_remove(&port->mlp_dmac_fe, fe); 2237 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2238 list_destroy(&fe->mlfe_ring_groups); 2239 2240 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2241 2242 mutex_exit(&ft->mlft_mtx); 2243 2244 return (B_TRUE); 2245 } 2246 2247 boolean_t 2248 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2249 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2250 { 2251 mlxcx_flow_group_t *fg; 2252 mlxcx_flow_entry_t *fe, probe; 2253 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2254 mlxcx_group_mac_t *gm; 2255 boolean_t found = B_FALSE; 2256 2257 ASSERT(mutex_owned(&port->mlp_mtx)); 2258 ASSERT(mutex_owned(&group->mlg_mtx)); 2259 2260 bzero(&probe, sizeof (probe)); 2261 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2262 2263 mutex_enter(&ft->mlft_mtx); 2264 2265 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2266 2267 if (fe == NULL) { 2268 fg = port->mlp_umcast; 2269 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2270 fe = list_next(&fg->mlfg_entries, fe)) { 2271 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2272 found = B_TRUE; 2273 break; 2274 } 2275 } 2276 if (!found) { 2277 mutex_exit(&ft->mlft_mtx); 2278 return (B_FALSE); 2279 } 2280 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2281 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2282 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2283 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2284 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2285 2286 avl_add(&port->mlp_dmac_fe, fe); 2287 } 2288 2289 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2290 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2291 2292 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2293 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2294 if (--fe->mlfe_ndest == 0) { 2295 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2296 } 2297 mutex_exit(&ft->mlft_mtx); 2298 return (B_FALSE); 2299 } 2300 2301 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2302 gm->mlgm_group = group; 2303 gm->mlgm_fe = fe; 2304 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2305 avl_add(&group->mlg_rx_macs, gm); 2306 list_insert_tail(&fe->mlfe_ring_groups, gm); 2307 2308 mutex_exit(&ft->mlft_mtx); 2309 2310 return (B_TRUE); 2311 } 2312 2313 boolean_t 2314 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2315 mlxcx_flow_group_t *fg) 2316 { 2317 mlxcx_flow_entry_t *fe; 2318 uint_t i, idx; 2319 2320 ASSERT(mutex_owned(&ft->mlft_mtx)); 2321 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2322 ASSERT3P(fg->mlfg_table, ==, ft); 2323 2324 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2325 return (B_FALSE); 2326 fg->mlfg_start_idx = ft->mlft_next_ent; 2327 2328 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2329 return (B_FALSE); 2330 } 2331 2332 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2333 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2334 for (i = 0; i < fg->mlfg_size; ++i) { 2335 idx = fg->mlfg_start_idx + i; 2336 fe = &ft->mlft_ent[idx]; 2337 fe->mlfe_group = fg; 2338 list_insert_tail(&fg->mlfg_entries, fe); 2339 } 2340 fg->mlfg_avail = fg->mlfg_size; 2341 ft->mlft_next_ent += fg->mlfg_size; 2342 2343 return (B_TRUE); 2344 } 2345 2346 static boolean_t 2347 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events) 2348 { 2349 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec]; 2350 2351 mutex_enter(&mleq->mleq_mtx); 2352 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2353 /* mlxcx_teardown_eqs() will clean this up */ 2354 
mutex_exit(&mleq->mleq_mtx); 2355 return (B_FALSE); 2356 } 2357 mleq->mleq_mlx = mlxp; 2358 mleq->mleq_uar = &mlxp->mlx_uar; 2359 mleq->mleq_events = events; 2360 mleq->mleq_intr_index = vec; 2361 2362 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2363 /* mlxcx_teardown_eqs() will clean this up */ 2364 mutex_exit(&mleq->mleq_mtx); 2365 return (B_FALSE); 2366 } 2367 2368 if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) { 2369 /* 2370 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2371 * eq_rele_dma 2372 */ 2373 mutex_exit(&mleq->mleq_mtx); 2374 return (B_FALSE); 2375 } 2376 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2377 mleq->mleq_state |= MLXCX_EQ_ATTACHING; 2378 mlxcx_arm_eq(mlxp, mleq); 2379 mutex_exit(&mleq->mleq_mtx); 2380 2381 return (B_TRUE); 2382 } 2383 2384 static void 2385 mlxcx_eq_set_attached(mlxcx_t *mlxp) 2386 { 2387 uint_t vec; 2388 mlxcx_event_queue_t *mleq; 2389 2390 for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) { 2391 mleq = &mlxp->mlx_eqs[vec]; 2392 2393 mutex_enter(&mleq->mleq_mtx); 2394 mleq->mleq_state &= ~MLXCX_EQ_ATTACHING; 2395 mutex_exit(&mleq->mleq_mtx); 2396 } 2397 } 2398 2399 static boolean_t 2400 mlxcx_setup_async_eqs(mlxcx_t *mlxp) 2401 { 2402 boolean_t ret; 2403 2404 ret = mlxcx_setup_eq(mlxp, 0, 2405 (1ULL << MLXCX_EVENT_CMD_COMPLETION) | 2406 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2407 (1ULL << MLXCX_EVENT_PORT_STATE) | 2408 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2409 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2410 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2411 (1ULL << MLXCX_EVENT_LAST_WQE) | 2412 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2413 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2414 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2415 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2416 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2417 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2418 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST)); 2419 2420 if (ret) 2421 mlxcx_cmd_eq_enable(mlxp); 2422 2423 return (ret); 2424 } 2425 2426 int 2427 mlxcx_cq_compare(const void *arg0, const void *arg1) 2428 { 2429 const mlxcx_completion_queue_t *left = arg0; 2430 const mlxcx_completion_queue_t *right = arg1; 2431 2432 if (left->mlcq_num < right->mlcq_num) { 2433 return (-1); 2434 } 2435 if (left->mlcq_num > right->mlcq_num) { 2436 return (1); 2437 } 2438 return (0); 2439 } 2440 2441 static boolean_t 2442 mlxcx_setup_eqs(mlxcx_t *mlxp) 2443 { 2444 uint_t i; 2445 mlxcx_event_queue_t *mleq; 2446 2447 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2448 2449 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) { 2450 mleq = &mlxp->mlx_eqs[i]; 2451 mutex_enter(&mleq->mleq_mtx); 2452 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2453 mutex_exit(&mleq->mleq_mtx); 2454 return (B_FALSE); 2455 } 2456 mleq->mleq_uar = &mlxp->mlx_uar; 2457 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2458 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2459 mutex_exit(&mleq->mleq_mtx); 2460 return (B_FALSE); 2461 } 2462 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2463 !mlxcx_cmd_set_int_mod(mlxp, i, 2464 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2465 mutex_exit(&mleq->mleq_mtx); 2466 return (B_FALSE); 2467 } 2468 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2469 mutex_exit(&mleq->mleq_mtx); 2470 return (B_FALSE); 2471 } 2472 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2473 mlxcx_arm_eq(mlxp, mleq); 2474 mutex_exit(&mleq->mleq_mtx); 2475 } 2476 2477 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 2478 2479 return (B_TRUE); 2480 } 2481 2482 /* 2483 * A more recent ConnectX part will have the Port CApability Mask register. 
2484 * Explore it and note things here.
2485 */
2486 static void
2487 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
2488 {
2489 mlxcx_register_data_t data;
2490 mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;
2491
2492 ASSERT(c->mlc_pcam);
2493 bzero(&data, sizeof (data));
2494
2495 /*
2496 * Okay, so we have access to the Ports Capability Mask (PCAM).
2497 * There are various things we need to check about it.
2498 */
2499
2500 VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
2501 MLXCX_REG_PCAM, &data));
2502
2503 /*
2504 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts.
2505 * As of now, only 0 is valid, and 1-255 are reserved. A future part
2506 * may return non-zero in these fields.
2507 */
2508 ASSERT0(pcam->mlrd_pcam_feature_group);
2509 ASSERT0(pcam->mlrd_pcam_access_reg_group);
2510
2511 c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
2512 MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
2513 }
2514
2515 /*
2516 * Snapshot all of the hardware capabilities that we care about and then modify
2517 * the HCA capabilities to get things moving.
2518 */
2519 static boolean_t
2520 mlxcx_init_caps(mlxcx_t *mlxp)
2521 {
2522 mlxcx_caps_t *c;
2523
2524 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2525
2526 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2527 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2528 mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2529 }
2530
2531 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2532 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2533 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2534 }
2535
2536 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2537 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2538 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2539 }
2540
2541 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2542 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2543 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2544 }
2545
2546 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2547 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2548 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2549 }
2550
2551 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2552 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2553 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2554 }
2555
2556 /*
2557 * Check that the caps meet our requirements.
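 *
 * In particular: a 4KB native page size, CQE version 1, Ethernet-only
 * ports, and NIC RX flow tables whose entries we can modify.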
2558 */ 2559 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2560 2561 if (gen->mlcap_general_log_pg_sz != 12) { 2562 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2563 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2564 goto err; 2565 } 2566 if (gen->mlcap_general_cqe_version != 1) { 2567 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2568 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2569 goto err; 2570 } 2571 if (gen->mlcap_general_port_type != 2572 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2573 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2574 goto err; 2575 } 2576 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2577 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2578 2579 if (get_bit16(gen->mlcap_general_flags_c, 2580 MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) { 2581 c->mlc_pcam = B_TRUE; 2582 mlxcx_explore_pcam(mlxp, c); 2583 } 2584 2585 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2586 2587 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2588 MLXCX_ETH_CAP_CSUM_CAP); 2589 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2590 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2591 2592 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2593 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2594 if (c->mlc_max_lso_size == 1) { 2595 c->mlc_max_lso_size = 0; 2596 c->mlc_lso = B_FALSE; 2597 } else { 2598 c->mlc_lso = B_TRUE; 2599 } 2600 2601 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2602 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2603 2604 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2605 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2606 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2607 goto err; 2608 } 2609 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2610 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2611 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2612 "flow table entries"); 2613 goto err; 2614 } 2615 2616 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2617 mlcap_flow_prop_log_max_ft_size; 2618 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 2619 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow); 2620 c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow. 2621 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num); 2622 c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow. 
2623 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination); 2624 2625 return (B_TRUE); 2626 2627 err: 2628 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 2629 return (B_FALSE); 2630 } 2631 2632 static int 2633 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2634 { 2635 mlxcx_t *mlxp; 2636 2637 if (cmd != DDI_DETACH) 2638 return (DDI_FAILURE); 2639 2640 mlxp = ddi_get_driver_private(dip); 2641 if (mlxp == NULL) { 2642 mlxcx_warn(NULL, "asked to detach, but missing instance " 2643 "private data"); 2644 return (DDI_FAILURE); 2645 } 2646 2647 if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) { 2648 if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) { 2649 return (DDI_FAILURE); 2650 } 2651 mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL; 2652 } 2653 2654 mlxcx_teardown(mlxp); 2655 return (DDI_SUCCESS); 2656 } 2657 2658 static size_t 2659 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp) 2660 { 2661 size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large + 2662 mlxp->mlx_props.mldp_rx_ngroups_small; 2663 size_t tirlim, flowlim, gflowlim; 2664 2665 tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP; 2666 if (tirlim < ngroups) { 2667 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2668 "on number of TIRs available", tirlim); 2669 ngroups = tirlim; 2670 } 2671 2672 flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2; 2673 if (flowlim < ngroups) { 2674 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2675 "on max size of RX flow tables", flowlim); 2676 ngroups = flowlim; 2677 } 2678 2679 /* 2680 * Restrict the number of groups not to exceed the max flow 2681 * table number from the devices capabilities. 2682 * There is one root table entry per port and 2 entries per 2683 * group. 2684 */ 2685 flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2; 2686 if (flowlim < ngroups) { 2687 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2688 "on max number of RX flow tables", 2689 flowlim); 2690 ngroups = flowlim; 2691 } 2692 2693 do { 2694 gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2; 2695 if (gflowlim < ngroups) { 2696 mlxcx_note(mlxp, "limiting number of rx groups to %u " 2697 "based on max total RX flows", gflowlim); 2698 --ngroups; 2699 } 2700 } while (gflowlim < ngroups); 2701 2702 return (ngroups); 2703 } 2704 2705 static int 2706 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2707 { 2708 mlxcx_t *mlxp; 2709 char tq_name[TASKQ_NAMELEN]; 2710 uint_t i; 2711 int inst, ret; 2712 2713 if (cmd != DDI_ATTACH) 2714 return (DDI_FAILURE); 2715 2716 inst = ddi_get_instance(dip); 2717 ret = ddi_soft_state_zalloc(mlxcx_softstate, inst); 2718 if (ret != 0) 2719 return (ret); 2720 2721 mlxp = ddi_get_soft_state(mlxcx_softstate, inst); 2722 if (mlxp == NULL) 2723 return (DDI_FAILURE); 2724 mlxp->mlx_dip = dip; 2725 mlxp->mlx_inst = inst; 2726 ddi_set_driver_private(dip, mlxp); 2727 2728 mlxcx_load_props(mlxp); 2729 2730 mlxcx_fm_init(mlxp); 2731 mlxp->mlx_attach |= MLXCX_ATTACH_FM; 2732 2733 if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) != 2734 DDI_SUCCESS) { 2735 mlxcx_warn(mlxp, "failed to initial PCI config space"); 2736 goto err; 2737 } 2738 mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG; 2739 2740 if (!mlxcx_regs_map(mlxp)) { 2741 goto err; 2742 } 2743 mlxp->mlx_attach |= MLXCX_ATTACH_REGS; 2744 2745 if (!mlxcx_cmd_queue_init(mlxp)) { 2746 goto err; 2747 } 2748 mlxp->mlx_attach |= MLXCX_ATTACH_CMD; 2749 2750 if (!mlxcx_cmd_enable_hca(mlxp)) { 2751 goto err; 2752 } 2753 mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA; 2754 2755 if 
(!mlxcx_check_issi(mlxp)) { 2756 goto err; 2757 } 2758 2759 /* 2760 * We have to get our interrupts now so we know what priority to 2761 * create pagemtx with. 2762 */ 2763 if (!mlxcx_intr_setup(mlxp)) { 2764 goto err; 2765 } 2766 mlxp->mlx_attach |= MLXCX_ATTACH_INTRS; 2767 2768 mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER, 2769 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2770 avl_create(&mlxp->mlx_pages, mlxcx_page_compare, 2771 sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree)); 2772 mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST; 2773 2774 /* 2775 * Taskq for asynchronous events which may interact with the HCA 2776 * via the command interface. Single threaded FIFO. 2777 */ 2778 (void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d", 2779 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst); 2780 mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX, 2781 TASKQ_PREPOPULATE); 2782 /* 2783 * Initialize any pre-allocated taskq param structs. 2784 */ 2785 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 2786 mlxp->mlx_npages_req[i].mla_mlx = mlxp; 2787 mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL, 2788 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 2789 } 2790 mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ; 2791 2792 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2793 goto err; 2794 } 2795 2796 if (!mlxcx_init_caps(mlxp)) { 2797 goto err; 2798 } 2799 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2800 2801 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2802 goto err; 2803 } 2804 2805 if (!mlxcx_cmd_init_hca(mlxp)) { 2806 goto err; 2807 } 2808 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2809 2810 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2811 goto err; 2812 } 2813 2814 /* 2815 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2816 * doorbells. 2817 */ 2818 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2819 goto err; 2820 } 2821 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2822 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2823 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2824 } 2825 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2826 2827 /* 2828 * Set up asynchronous event queue which handles control type events 2829 * like PAGE_REQUEST and CMD completion events. 2830 * 2831 * This will enable and arm the interrupt on EQ 0. Note that only page 2832 * reqs and cmd completions will be handled until we call 2833 * mlxcx_eq_set_attached further down (this way we don't need an extra 2834 * set of locks over the mlxcx_t sub-structs not allocated yet) 2835 */ 2836 if (!mlxcx_setup_async_eqs(mlxp)) { 2837 goto err; 2838 } 2839 2840 /* 2841 * Allocate a protection and transport domain. These don't really do 2842 * anything for us (they're IB concepts), but we need to give their 2843 * ID numbers in other commands. 2844 */ 2845 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2846 goto err; 2847 } 2848 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2849 goto err; 2850 } 2851 /* 2852 * Fetch the "reserved" lkey that lets us give linear addresses in 2853 * work queue entries, rather than having to mess with the NIC's 2854 * internal MMU. 2855 */ 2856 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2857 goto err; 2858 } 2859 2860 /* 2861 * Query our port information and current state, populate the 2862 * mlxcx_port_t structs. 2863 * 2864 * This also sets up the root flow tables and flow groups. 
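 *
 * If any of this fails, mlxcx_setup_ports() tears down whatever ports
 * were partially set up before returning, and we abort the attach.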
2865 */ 2866 if (!mlxcx_setup_ports(mlxp)) { 2867 goto err; 2868 } 2869 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2870 2871 mlxcx_load_model_props(mlxp); 2872 2873 /* 2874 * Set up, enable and arm the rest of the interrupt EQs which will 2875 * service events from CQs. 2876 * 2877 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2878 * cleaned up. 2879 */ 2880 if (!mlxcx_setup_eqs(mlxp)) { 2881 goto err; 2882 } 2883 2884 /* Completion queues */ 2885 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2886 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2887 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2888 2889 /* Work queues (send queues, receive queues) */ 2890 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2891 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2892 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2893 2894 /* 2895 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2896 * "groups" we advertise to MAC. 2897 */ 2898 mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp); 2899 mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups * 2900 sizeof (mlxcx_ring_group_t); 2901 mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP); 2902 2903 mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups; 2904 mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups * 2905 sizeof (mlxcx_ring_group_t); 2906 mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP); 2907 2908 mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS; 2909 2910 /* 2911 * Sets up the free/busy buffers list for keeping track of packet 2912 * buffers. 2913 */ 2914 if (!mlxcx_setup_bufs(mlxp)) 2915 goto err; 2916 mlxp->mlx_attach |= MLXCX_ATTACH_BUFS; 2917 2918 /* 2919 * Before we tell MAC about our rings/groups, we need to do enough 2920 * setup on them to be sure about the numbers and configuration that 2921 * we have. This will do basically everything short of allocating 2922 * packet buffers and starting the rings up. 2923 */ 2924 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 2925 if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i])) 2926 goto err; 2927 } 2928 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 2929 if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i])) 2930 goto err; 2931 } 2932 2933 /* 2934 * Set up periodic fault check timers which check the queue states, 2935 * set up should be after all the queues have been initialized and 2936 * consequently the teardown of timers must happen before 2937 * queue teardown. 2938 */ 2939 if (!mlxcx_setup_checktimers(mlxp)) { 2940 goto err; 2941 } 2942 mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS; 2943 2944 /* 2945 * Some devices may not have a working temperature sensor; however, 2946 * there isn't a great way for us to know. We shouldn't fail attach if 2947 * this doesn't work. 2948 */ 2949 if (mlxcx_setup_sensors(mlxp)) { 2950 mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS; 2951 } 2952 2953 /* 2954 * Finally, tell MAC that we exist! 2955 */ 2956 if (!mlxcx_register_mac(mlxp)) { 2957 goto err; 2958 } 2959 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL; 2960 2961 /* 2962 * This tells the interrupt handlers they can start processing events 2963 * other than cmd completions and page requests. 
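 *
 * (mlxcx_eq_set_attached() does this by clearing MLXCX_EQ_ATTACHING on
 * every EQ.)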
2964 */ 2965 mlxcx_eq_set_attached(mlxp); 2966 2967 return (DDI_SUCCESS); 2968 2969 err: 2970 mlxcx_teardown(mlxp); 2971 return (DDI_FAILURE); 2972 } 2973 2974 static struct cb_ops mlxcx_cb_ops = { 2975 .cb_open = nulldev, 2976 .cb_close = nulldev, 2977 .cb_strategy = nodev, 2978 .cb_print = nodev, 2979 .cb_dump = nodev, 2980 .cb_read = nodev, 2981 .cb_write = nodev, 2982 .cb_ioctl = nodev, 2983 .cb_devmap = nodev, 2984 .cb_mmap = nodev, 2985 .cb_segmap = nodev, 2986 .cb_chpoll = nochpoll, 2987 .cb_prop_op = ddi_prop_op, 2988 .cb_flag = D_MP, 2989 .cb_rev = CB_REV, 2990 .cb_aread = nodev, 2991 .cb_awrite = nodev 2992 }; 2993 2994 static struct dev_ops mlxcx_dev_ops = { 2995 .devo_rev = DEVO_REV, 2996 .devo_refcnt = 0, 2997 .devo_getinfo = NULL, 2998 .devo_identify = nulldev, 2999 .devo_probe = nulldev, 3000 .devo_attach = mlxcx_attach, 3001 .devo_detach = mlxcx_detach, 3002 .devo_reset = nodev, 3003 .devo_quiesce = ddi_quiesce_not_supported, 3004 .devo_cb_ops = &mlxcx_cb_ops 3005 }; 3006 3007 static struct modldrv mlxcx_modldrv = { 3008 .drv_modops = &mod_driverops, 3009 .drv_linkinfo = "Mellanox Connect-X 4/5/6", 3010 .drv_dev_ops = &mlxcx_dev_ops 3011 }; 3012 3013 static struct modlinkage mlxcx_modlinkage = { 3014 .ml_rev = MODREV_1, 3015 .ml_linkage = { &mlxcx_modldrv, NULL } 3016 }; 3017 3018 int 3019 _init(void) 3020 { 3021 int ret; 3022 3023 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0); 3024 if (ret != 0) { 3025 return (ret); 3026 } 3027 3028 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME); 3029 3030 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) { 3031 mac_fini_ops(&mlxcx_dev_ops); 3032 ddi_soft_state_fini(&mlxcx_softstate); 3033 return (ret); 3034 } 3035 3036 return (DDI_SUCCESS); 3037 } 3038 3039 int 3040 _info(struct modinfo *modinfop) 3041 { 3042 return (mod_info(&mlxcx_modlinkage, modinfop)); 3043 } 3044 3045 int 3046 _fini(void) 3047 { 3048 int ret; 3049 3050 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 3051 return (ret); 3052 } 3053 3054 mac_fini_ops(&mlxcx_dev_ops); 3055 3056 ddi_soft_state_fini(&mlxcx_softstate); 3057 3058 return (DDI_SUCCESS); 3059 } 3060