/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2023 RackTop Systems, Inc.
 * Copyright 2023 MNX Cloud, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

/*
 * The PRM for this family of parts was freely available at:
 *
 * https://www.mellanox.com/related-docs/user_manuals/ \
 *   Ethernet_Adapters_Programming_Manual.pdf
 *
 * but has since disappeared.
 */
/*
 * ConnectX glossary
 * -----------------
 *
 * WR         Work Request: something we've asked the hardware to do by
 *            creating a Work Queue Entry (WQE), e.g. send or recv a packet
 *
 * WQE        Work Queue Entry: a descriptor on a work queue descriptor ring
 *
 * WQ         Work Queue: a descriptor ring that we can place WQEs on, usually
 *            either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
 *            types have different WQE structures, different commands for
 *            creating and destroying them, etc, but share a common context
 *            structure, counter setup and state graph.
 * SQ         Send Queue, a specific type of WQ that sends packets
 * RQ         Receive Queue, a specific type of WQ that receives packets
 *
 * CQ         Completion Queue: completions of WRs from a WQ are reported to
 *            one of these, as a CQE on its entry ring.
 * CQE        Completion Queue Entry: an entry in a CQ ring. Contains error
 *            info, as well as packet size, the ID of the WQ, and the index
 *            of the WQE which completed. Does not contain any packet data.
 *
 * EQ         Event Queue: a ring of event structs from the hardware informing
 *            us when particular events happen. Many events can point at a
 *            particular CQ which we should then go look at.
 * EQE        Event Queue Entry: an entry on the EQ ring
 *
 * UAR        User Access Region, a page of the device's PCI BAR which is
 *            tied to particular EQ/CQ/WQ sets and contains doorbells to
 *            ring to arm them for interrupts or wake them up for new work
 *
 * RQT        RQ Table, a collection of indexed RQs used to refer to the group
 *            as a single unit (e.g. for hashing/RSS).
 *
 * TIR        Transport Interface Receive, a bucket of resources for the
 *            reception of packets. TIRs have to point at either a single RQ
 *            or a table of RQs (RQT). They then serve as a target for flow
 *            table entries (FEs). TIRs that point at an RQT also contain the
 *            settings for hashing for RSS.
 *
 * TIS        Transport Interface Send, a bucket of resources associated with
 *            the transmission of packets. In particular, the temporary
 *            resources used for LSO internally in the card are accounted to
 *            a TIS.
 *
 * FT         Flow Table, a collection of FEs and FGs that can be referred to
 *            as a single entity (e.g. used as a target from another flow
 *            entry or set as the "root" table to handle incoming or outgoing
 *            packets). Packets arriving at a FT are matched against the
 *            FEs in the table until either one matches with a terminating
 *            action or all FEs are exhausted (it's first-match-wins but with
 *            some actions that are non-terminal, like counting actions).
 *
 * FG         Flow Group, a group of FEs which share a common "mask" (i.e.
 *            they match on the same attributes of packets coming into the
 *            flow).
 *
 * FE         Flow Entry, an individual set of values to match against
 *            packets entering the flow table, combined with an action to
 *            take upon a successful match. The action we use most is
 *            "forward", which sends the packets to a TIR or another flow
 *            table and then stops further processing within the FE's FT.
 *
 * lkey/mkey  A reference to something similar to a page table but in the
 *            device's internal onboard MMU. Since Connect-X parts double as
 *            IB cards (lots of RDMA) they have extensive onboard memory mgmt
 *            features which we try very hard not to use. For our WQEs we use
 *            the "reserved" lkey, which is a special value which indicates
 *            that addresses we give are linear addresses and should not be
 *            translated.
 *
 * PD         Protection Domain, an IB concept. We have to allocate one to
 *            provide as a parameter for new WQs, but we don't do anything
 *            with it.
 *
 * TDOM/TD    Transport Domain, an IB concept. We allocate one in order to
 *            provide it as a parameter to TIR/TIS creation, but we don't do
 *            anything with it.
 */
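/*
 * As a rough sketch of how those pieces reference one another, one could
 * picture them with the following purely hypothetical C types (the driver's
 * real structures live in mlxcx.h and carry far more state than this):
 *
 *        typedef struct example_wq {
 *                example_wqe_t   *exwq_ring;
 *                uint32_t        exwq_cqn;
 *        } example_wq_t;
 *
 *        typedef struct example_cq {
 *                example_cqe_t   *excq_ring;
 *                uint32_t        excq_eqn;
 *        } example_cq_t;
 *
 *        typedef struct example_eq {
 *                example_eqe_t   *exeq_ring;
 *                uint_t          exeq_intr_vector;
 *        } example_eq_t;
 *
 * That is: each WQ (an SQ or an RQ) names the CQ its completions are reported
 * to, each CQ names the EQ its completion events are raised on, and each EQ
 * is tied to one interrupt vector. TIRs, TISs, RQTs and flow tables sit in
 * front of the queues and steer packets into (or account resources for) them.
 */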
/*
 *
 * Data flow overview
 * ------------------
 *
 * This driver is a MAC ring-enabled driver which maps rings to send and recv
 * queues in hardware on the device.
 *
 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
 * sufficient space, and simplify the logic needed to work out which buffer
 * was completed.
 *
 * The CQs are then round-robin allocated onto EQs, of which we set up one per
 * interrupt that the system gives us for the device. Normally this means we
 * have 8 EQs.
 *
 * When we have >= 8 EQs available, we try to allocate only RX or only TX
 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
 *
 * EQ #0 is reserved for all event types other than completion events, and has
 * no CQs associated with it at any time. EQs #1 and upwards are only used for
 * handling CQ completion events.
 *
 *   +------+     +------+           +------+        +---------+
 *   | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |  mlxcx_intr_0
 *   +------+     +------+     |     +------+        +---------+
 *                             |
 *   +------+     +------+     |
 *   | SQ 1 |---->| CQ 1 |---+ |     +------+
 *   +------+     +------+   | +---> |      |
 *                           |       |      |
 *   +------+     +------+   |       | EQ 1 |        +---------+
 *   | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |  mlxcx_intr_n
 *   +------+     +------+   | +---> |      |        +---------+
 *                           | |     +------+
 *                           | |
 *     ...                   | |
 *                           | |     +------+
 *   +------+     +------+   +-----> |      |
 *   | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
 *   +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |  mlxcx_intr_n
 *                             |     |      |        +---------+
 *   +------+     +------+     | +-> |      |
 *   | RQ 1 |---->| CQ 4 |-----+ |   +------+
 *   +------+     +------+       |
 *                               |      ....
 *   +------+     +------+       |
 *   | RQ 2 |---->| CQ 5 |-------+
 *   +------+     +------+
 *
 *   ... (note this diagram does not show RX-only or TX-only EQs)
 *
 * For TX, we advertise all of the SQs we create as plain rings to MAC with
 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
 * and use the rings as it sees fit.
 *
 * For RX, we advertise actual groups in order to make use of hardware
 * classification.
 */
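/*
 * A minimal sketch of that round-robin spread, using a hypothetical helper
 * (the real assignment happens as CQs are created during setup, and it
 * additionally tries to keep RX and TX CQs on separate EQs when enough
 * vectors are available):
 *
 *        uint_t
 *        example_cq_to_eq(uint_t n_eqs, uint_t cq_seq)
 *        {
 *                return (1 + (cq_seq % (n_eqs - 1)));
 *        }
 *
 * EQ 0 is skipped because it is reserved for non-completion events.
 */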
/*
 * The hardware classification we use is based around Flow Tables, and we
 * currently ignore all of the eswitch features of the card. The NIC VPORT
 * is always set to promisc mode so that the eswitch sends us all of the
 * traffic that arrives on the NIC, and we use flow entries to manage
 * everything.
 *
 * We use 2 layers of flow tables for classification: traffic arrives at the
 * root RX flow table which contains MAC address filters. Those then send
 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
 * presence and VID filters.
 *
 * Since these parts only support doing RSS hashing on a single protocol at a
 * time, we have to use a third layer of flow tables as well to break traffic
 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
 * so that it can be sent to the appropriate TIR for hashing.
 *
 *   Incoming packets
 *          +           +---------+      +---------+
 *          |        +->| group 0 |      | group 0 |
 *          |        |  | vlan ft |  +-->| hash ft |
 *          v        |  |    L1   |  |   |    L2   |
 *     +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
 *     | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
 *     +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
 *          |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
 *          |        |  |         |  |   +---------+    +-----+    |     +------+
 *          |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
 *          v        |  |         |  |   +---------+    +-----+    | RQT +------+
 *     +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
 *     | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
 *     | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
 *     |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
 *     +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
 *     |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
 *     +---------+   |               ^   |  other  |-+
 *     |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
 *     +---------+   |               |               +->| TIR |--->| RQ0 |
 *     |  MAC 1  |-+ |               |                   +-----+    +-----+
 *     +---------+ | |               +---------------+
 *     |  MAC 2  |-+ |                               ^
 *     +---------+ | |                               |
 *     |  MAC 3  |-+ |   +---------+                 |   +---------+
 *     +---------+ | |   | group 1 |                 |   | group 1 |
 *     |  .....  | +--->| vlan ft |                  | +>| hash ft |
 *     |         | |    |    L1   |                  | | |    L2   |
 *     +---------+ |    +---------+                  | | +---------+    +-----+    +-----+------+
 *     | promisc |---+  |  VLAN 0 |----+               | |  TCPv6  |--->| TIR |--->|     |  RQ3 |
 *     +---------+      +---------+                    | +---------+    +-----+    |     +------+
 *                      |  .....  |                    | |  UDPv6  |--->| TIR |--->|     |  RQ4 |
 *                      |         |                    | +---------+    +-----+    |     +------+
 *                      |         |                    | |  TCPv4  |--->| TIR |--->|     |  RQ5 |
 *                      |         |                    | +---------+    +-----+    | RQT +------+
 *                      +---------+                    | |  UDPv4  |--->| TIR |--->|     |  ... |
 *                      |         |                    | +---------+    +-----+    |     |      |
 *                      +---------+                    | |  IPv6   |--->| TIR |--->|     |      |
 *                      | promisc |--+                   +---------+    +-----+    |     |      |
 *                      +---------+                      |  IPv4   |--->| TIR |--->|     |      |
 *                                                       +---------+    +-----+    +-----+------+
 *                                                       |  other  |-+
 *                                                       +---------+ |
 *       .......                                                     |  +-----+    +-----+
 *                                                                   +->| TIR |--->| RQ3 |
 *                                                                      +-----+    +-----+
 *
 * Note that the "promisc" flow entries are only set/enabled when promisc
 * mode is enabled for the NIC. All promisc flow entries point directly at
 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
 * the "default group" in MAC).
 *
 * The "default" entry in the L1 VLAN filter flow tables is used when there
 * are no VLANs set for the group, to accept any traffic regardless of tag. It
 * is deleted as soon as a VLAN filter is added (and re-instated if the
 * last VLAN filter is removed).
 */
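/*
 * Conceptually, each flow entry pairs a set of match values with an action,
 * and lookup within a table is first-match-wins. A rough sketch of that
 * model, using hypothetical example_* types and values rather than the
 * hardware's or the driver's real structures:
 *
 *        typedef struct example_flow_entry {
 *                boolean_t       exfe_valid;
 *                uint8_t         exfe_dmac[6];
 *                uint16_t        exfe_vid;
 *                uint32_t        exfe_dest;
 *        } example_flow_entry_t;
 *
 *        uint32_t
 *        example_flow_lookup(example_flow_entry_t *ents, uint_t nents,
 *            const uint8_t *dmac, uint16_t vid)
 *        {
 *                uint_t i;
 *
 *                for (i = 0; i < nents; i++) {
 *                        if (ents[i].exfe_valid &&
 *                            bcmp(ents[i].exfe_dmac, dmac, 6) == 0 &&
 *                            ents[i].exfe_vid == vid)
 *                                return (ents[i].exfe_dest);
 *                }
 *                return (EXAMPLE_DEST_DROP);
 *        }
 *
 * In the real hardware the mask applied to each entry comes from its flow
 * group, and the destination of a terminating "forward" action is a TIR or
 * another flow table, as shown in the diagram above.
 */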
/*
 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
 * space for packet data (they're a collection of scatter pointers only). TX
 * descriptors contain some space for "inline headers" (and the card requires
 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
 * but all the rest of the data comes from the gather pointers.
 *
 * When we get completions back they simply contain the ring index number of
 * the WR (work request) which completed. So, we manage the buffers for actual
 * packet data completely independently of the descriptors in this driver. When
 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
 * with the WQE index that we put it at, and therefore don't have to look at
 * the original descriptor at all when handling completions.
 *
 * For RX, we create sufficient packet data buffers to fill 150% of the
 * available descriptors for each ring. These are all pre-set-up for DMA and
 * have an mblk_t associated with them (with desballoc()).
 *
 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
 * large enough), or we copy it into a pre-allocated buffer set up in the same
 * way as for RX.
 */

/*
 * Buffer lifecycle: RX
 * --------------------
 *
 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
 * straightforward.
 *
 * It is created (and has all its memory allocated) at the time of starting up
 * the RX ring it belongs to. Then it is placed on the "free" list in the
 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
 * before making a WQE for it.
 *
 * After a completion event occurs, the packet is either discarded (and the
 * buffer_t returned to the free list), or it is readied for loaning to MAC
 * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
 *
 * Once MAC and the rest of the system have finished with the packet, they call
 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
 * the fate of the buffer_t is determined by the state of the
 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
 * will be returned to the free list, potentially to be recycled and used
 * again. But if the shard is draining (e.g. after a ring stop) there will be
 * no recycling and the buffer_t is immediately destroyed.
 *
 * At detach/teardown time, buffers are only ever destroyed from the free list.
 *
 *
 *                           +
 *                           |
 *                           | mlxcx_buf_create
 *                           |
 *                           v
 *                      +----+----+
 *                      | created |
 *                      +----+----+                        +------+
 *                           |                             | dead |
 *                           |                             +------+
 *                           | mlxcx_buf_return                 ^
 *                           |                                  |
 *                           v                                  | mlxcx_buf_destroy
 *    mlxcx_buf_destroy +----+----+          +-----------+      |
 *            +---------|   free  |<------no-| draining? |-yes-+
 *            |         +----+----+          +-----------+
 *            |              |                     ^
 *            |              |                     |
 *            v              | mlxcx_buf_take      | mlxcx_buf_return
 *        +---+--+           v                     |
 *        | dead |      +---+---+                  |
 *        +------+      | on WQ |- - - - - - - - >O
 *                      +---+---+                  ^
 *                           |                     |
 *                           |                     |
 *                           | mlxcx_buf_loan      | mlxcx_buf_mp_return
 *                           v                     |
 *               +-------+--------+                |
 *               | on loan to MAC |----------->O
 *               +----------------+     freemsg()
 *
 */
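/*
 * The "draining" decision above is the interesting part of the return path.
 * A minimal sketch of that decision, assuming hypothetical example_* types
 * and using the standard illumos kernel list/mutex/cv primitives (the real
 * logic is in mlxcx_buf_return() and mlxcx_buf_mp_return()):
 *
 *        void
 *        example_buf_return(example_shard_t *s, example_buf_t *b)
 *        {
 *                mutex_enter(&s->exs_mtx);
 *                if (s->exs_draining) {
 *                        mutex_exit(&s->exs_mtx);
 *                        example_buf_destroy(b);
 *                        return;
 *                }
 *                list_insert_tail(&s->exs_free, b);
 *                cv_broadcast(&s->exs_free_nonempty);
 *                mutex_exit(&s->exs_mtx);
 *        }
 */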
/*
 * Buffer lifecycle: TX
 * --------------------
 *
 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
 * "foreign" buffers.
 *
 * The former have their memory allocated and DMA bound by this driver, while
 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
 * not owned by us, though we do DMA bind it (and take responsibility for
 * un-binding it when we're done with them).
 *
 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
 * SQ. Thus, there is a separate free list and mutex for each kind.
 *
 * Since a TX packet might consist of multiple mblks, we translate each mblk
 * into exactly one buffer_t. The buffer_ts are chained together in the same
 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
 *
 * Each chain of TX buffers may consist of foreign or driver buffers, in any
 * mixture.
 *
 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
 * it from the rest of the chain buffers.
 *
 * TX buffer chains are always returned to the free list by
 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
 * freeing all of the members.
 *
 * We only call freemsg() once, on the head of the TX buffer chain's original
 * mblk. This is true whether we copied it or bound it in a foreign buffer.
 */

/*
 * Startup and command interface
 * -----------------------------
 *
 * The command interface is the primary way in which we give control orders to
 * the hardware (e.g. actions like "create this queue" or "delete this flow
 * entry"). The command interface is never used to transmit or receive packets
 * -- that takes place only on the queues that are set up through it.
 *
 * In mlxcx_cmd.c we implement our use of the command interface on top of a
 * simple taskq. As commands are submitted from the taskq they choose a
 * "slot"; if there are no free slots then execution of the command will
 * be paused until one is free. The hardware permits up to 32 independent
 * slots for concurrent command execution.
 *
 * Before interrupts are enabled, command completion is polled; once
 * interrupts are up, command completions become asynchronous and are
 * wired to EQ 0. A caveat to this is that commands cannot be submitted
 * directly from EQ 0's completion handler, and any processing resulting from
 * an asynchronous event which requires further use of the command interface
 * is posted through a taskq.
 *
 * The startup/attach process for this card involves a bunch of different steps
 * which are summarised pretty well in the PRM. We have to send a number of
 * commands which do different things to start the card up, give it some pages
 * of our own memory for it to use, then start creating all the entities that
 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
 * and TDoms.
 */
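/*
 * One way to picture the slot throttling described above is as a counting
 * semaphore initialised to the number of slots. This is only an illustrative
 * sketch with hypothetical example_* names, not the driver's actual
 * implementation in mlxcx_cmd.c:
 *
 *        #include <sys/ksynch.h>
 *
 *        ksema_t example_slot_sema;
 *
 *        void
 *        example_cmd_iface_init(void)
 *        {
 *                sema_init(&example_slot_sema, 32, NULL, SEMA_DRIVER, NULL);
 *        }
 *
 *        void
 *        example_cmd_exec(example_cmd_t *cmd)
 *        {
 *                sema_p(&example_slot_sema);
 *                example_cmd_post_to_slot(cmd);
 *                example_cmd_wait_for_completion(cmd);
 *                sema_v(&example_slot_sema);
 *        }
 */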

/*
 * UARs
 * ----
 *
 * The pages of the PCI BAR other than the first few are reserved for use as
 * "UAR" sections in this device. Each UAR section can be used as a set of
 * doorbells for our queues.
 *
 * Currently we just make one single UAR for all of our queues. It doesn't
 * seem to be a major limitation yet.
 *
 * When we're sending packets through an SQ, the PRM is not awfully clear about
 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
 * (it's clear on the pattern of alternation you're expected to use between
 * even and odd for Blueflame sends, but not for regular doorbells).
 *
 * Currently we don't do the even-odd alternating pattern for ordinary
 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
 * least on Connect-X4 Lx.
 */

/*
 * Lock ordering
 * -------------
 *
 * Interrupt side:
 *
 *  - mleq_mtx
 *  - mlcq_arm_mtx
 *  - mlcq_mtx
 *  - mlcq_bufbmtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlp_mtx
 *
 * GLD side:
 *
 *  - mlp_mtx
 *  - mlg_mtx
 *  - mlg_*.mlft_mtx
 *  - mlp_*.mlft_mtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlcq_bufbmtx
 *  - mleq_mtx
 *  - mlcq_arm_mtx
 *  - mlcq_mtx
 *
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/pci.h>
#include <sys/mac_provider.h>

#include <mlxcx.h>

CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);

#define MLXCX_MODULE_NAME "mlxcx"
/*
 * We give this to the firmware, so it has to be in a fixed format that it
 * understands.
 */
#define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000"

/*
 * Firmware may take a while to reclaim pages. Try a set number of times.
 */
clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */

static void *mlxcx_softstate;

/*
 * Fault detection thresholds.
 */
uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;

static void
mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
        mlxcx_drv_props_t *p = &mlxp->mlx_props;
        mlxcx_port_t *port = &mlxp->mlx_ports[0];

        VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
        VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);

        /*
         * Currently we have different queue size defaults for two
         * categories of queues. One set for devices which support a
         * maximum speed of 10Gb/s, and another for those above that.
486 */ 487 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 488 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 || 489 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G | 490 MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G | 491 MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) { 492 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 493 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 494 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 495 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 496 MLXCX_PROTO_10G)) != 0 || 497 (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M | 498 MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) { 499 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 500 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 501 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 502 } else { 503 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 504 "recognize. Proto: 0x%x", port->mlp_max_proto); 505 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 506 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 507 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 508 } 509 } 510 511 /* 512 * Properties which may have different defaults based on hardware 513 * characteristics. 514 */ 515 static void 516 mlxcx_load_model_props(mlxcx_t *mlxp) 517 { 518 mlxcx_drv_props_t *p = &mlxp->mlx_props; 519 520 mlxcx_load_prop_defaults(mlxp); 521 522 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 523 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 524 p->mldp_cq_size_shift_default); 525 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 526 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 527 p->mldp_sq_size_shift_default); 528 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 529 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 530 p->mldp_rq_size_shift_default); 531 } 532 533 static void 534 mlxcx_load_props(mlxcx_t *mlxp) 535 { 536 mlxcx_drv_props_t *p = &mlxp->mlx_props; 537 538 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 539 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 540 MLXCX_EQ_SIZE_SHIFT_DFLT); 541 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 542 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 543 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 544 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 545 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 546 MLXCX_CQEMOD_COUNT_DFLT); 547 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 548 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 549 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 550 551 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 552 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 553 MLXCX_TX_NGROUPS_DFLT); 554 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 555 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 556 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 557 558 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 559 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 560 MLXCX_RX_NGROUPS_LARGE_DFLT); 561 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 562 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 563 MLXCX_RX_NGROUPS_SMALL_DFLT); 564 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 565 mlxp->mlx_dip, 
DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 566 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 567 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 568 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 569 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 570 571 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 572 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 573 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 574 575 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 576 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 577 MLXCX_TX_BIND_THRESHOLD_DFLT); 578 579 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 580 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 581 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 582 583 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 584 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 585 "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 586 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 587 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 588 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 589 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 590 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 591 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 592 593 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 594 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 595 MLXCX_RX_PER_CQ_DEFAULT); 596 597 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 598 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 599 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 600 "out of range. Defaulting to: %d. Valid values are from " 601 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 602 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 603 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 604 } 605 606 p->mldp_rx_p50_loan_min_size = ddi_getprop(DDI_DEV_T_ANY, 607 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 608 "rx_p50_loan_min_size", MLXCX_P50_LOAN_MIN_SIZE_DFLT); 609 } 610 611 void 612 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 613 { 614 va_list ap; 615 616 va_start(ap, fmt); 617 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 618 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 619 } else { 620 vcmn_err(CE_NOTE, fmt, ap); 621 } 622 va_end(ap); 623 } 624 625 void 626 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 627 { 628 va_list ap; 629 630 va_start(ap, fmt); 631 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 632 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 633 } else { 634 vcmn_err(CE_WARN, fmt, ap); 635 } 636 va_end(ap); 637 } 638 639 void 640 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 
641 { 642 va_list ap; 643 644 va_start(ap, fmt); 645 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 646 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 647 } else { 648 vcmn_err(CE_PANIC, fmt, ap); 649 } 650 va_end(ap); 651 } 652 653 uint16_t 654 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 655 { 656 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 657 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 658 } 659 660 uint32_t 661 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 662 { 663 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 664 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 665 } 666 667 uint64_t 668 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 669 { 670 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 671 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 672 } 673 674 void 675 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 676 { 677 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 678 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 679 } 680 681 void 682 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 683 { 684 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 685 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 686 } 687 688 void 689 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 690 { 691 /* 692 * The UAR is always inside the first BAR, which we mapped as 693 * mlx_regs 694 */ 695 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 696 (uintptr_t)mlxp->mlx_regs_base; 697 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 698 } 699 700 void 701 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 702 { 703 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 704 (uintptr_t)mlxp->mlx_regs_base; 705 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 706 } 707 708 static void 709 mlxcx_fm_fini(mlxcx_t *mlxp) 710 { 711 if (mlxp->mlx_fm_caps == 0) 712 return; 713 714 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 715 ddi_fm_handler_unregister(mlxp->mlx_dip); 716 717 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 718 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 719 pci_ereport_teardown(mlxp->mlx_dip); 720 721 ddi_fm_fini(mlxp->mlx_dip); 722 723 mlxp->mlx_fm_caps = 0; 724 } 725 726 void 727 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail) 728 { 729 uint64_t ena; 730 char buf[FM_MAX_CLASS]; 731 732 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 733 return; 734 735 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 736 ena = fm_ena_generate(0, FM_ENA_FMT1); 737 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 738 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 739 NULL); 740 } 741 742 static int 743 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 744 { 745 /* 746 * as the driver can always deal with an error in any dma or 747 * access handle, we can just return the fme_status value. 
748 */ 749 pci_ereport_post(dip, err, NULL); 750 return (err->fme_status); 751 } 752 753 static void 754 mlxcx_fm_init(mlxcx_t *mlxp) 755 { 756 ddi_iblock_cookie_t iblk; 757 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 758 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 759 760 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 761 DDI_PROP_DONTPASS, "fm_capable", def); 762 763 if (mlxp->mlx_fm_caps < 0) { 764 mlxp->mlx_fm_caps = 0; 765 } 766 mlxp->mlx_fm_caps &= def; 767 768 if (mlxp->mlx_fm_caps == 0) 769 return; 770 771 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 772 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 773 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 774 pci_ereport_setup(mlxp->mlx_dip); 775 } 776 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 777 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 778 (void *)mlxp); 779 } 780 } 781 782 static void 783 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 784 { 785 mlxcx_buffer_t *buf; 786 787 mutex_enter(&s->mlbs_mtx); 788 789 while (!list_is_empty(&s->mlbs_busy)) 790 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 791 792 while (!list_is_empty(&s->mlbs_loaned)) 793 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 794 795 while ((buf = list_head(&s->mlbs_free)) != NULL) 796 mlxcx_buf_destroy(mlxp, buf); 797 798 list_destroy(&s->mlbs_free); 799 list_destroy(&s->mlbs_busy); 800 list_destroy(&s->mlbs_loaned); 801 mutex_exit(&s->mlbs_mtx); 802 803 cv_destroy(&s->mlbs_free_nonempty); 804 mutex_destroy(&s->mlbs_mtx); 805 } 806 807 static void 808 mlxcx_teardown_bufs(mlxcx_t *mlxp) 809 { 810 mlxcx_buf_shard_t *s; 811 812 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 813 mlxcx_mlbs_teardown(mlxp, s); 814 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 815 } 816 list_destroy(&mlxp->mlx_buf_shards); 817 818 kmem_cache_destroy(mlxp->mlx_bufs_cache); 819 } 820 821 static void 822 mlxcx_teardown_pages(mlxcx_t *mlxp) 823 { 824 uint_t nzeros = 0; 825 uint64_t *pas; 826 827 pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES, 828 KM_SLEEP); 829 830 mutex_enter(&mlxp->mlx_pagemtx); 831 832 while (mlxp->mlx_npages > 0) { 833 int32_t req, ret; 834 835 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 836 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 837 838 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 839 mlxcx_warn(mlxp, "hardware refused to return pages, " 840 "leaking %u remaining pages", mlxp->mlx_npages); 841 goto out; 842 } 843 844 for (int32_t i = 0; i < ret; i++) { 845 mlxcx_dev_page_t *mdp, probe; 846 bzero(&probe, sizeof (probe)); 847 probe.mxdp_pa = pas[i]; 848 849 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 850 851 if (mdp != NULL) { 852 avl_remove(&mlxp->mlx_pages, mdp); 853 mlxp->mlx_npages--; 854 mlxcx_dma_free(&mdp->mxdp_dma); 855 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 856 } else { 857 mlxcx_panic(mlxp, "hardware returned a page " 858 "with PA 0x%" PRIx64 " but we have no " 859 "record of giving out such a page", pas[i]); 860 } 861 } 862 863 /* 864 * If no pages were returned, note that fact. 
865 */ 866 if (ret == 0) { 867 nzeros++; 868 if (nzeros > mlxcx_reclaim_tries) { 869 mlxcx_warn(mlxp, "hardware refused to return " 870 "pages, leaking %u remaining pages", 871 mlxp->mlx_npages); 872 goto out; 873 } 874 delay(drv_usectohz(mlxcx_reclaim_delay)); 875 } 876 } 877 878 avl_destroy(&mlxp->mlx_pages); 879 880 out: 881 mutex_exit(&mlxp->mlx_pagemtx); 882 mutex_destroy(&mlxp->mlx_pagemtx); 883 884 kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES); 885 } 886 887 static boolean_t 888 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 889 { 890 ddi_device_acc_attr_t acc; 891 ddi_dma_attr_t attr; 892 boolean_t ret; 893 size_t sz, i; 894 895 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 896 897 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 898 mleq->mleq_nents = (1 << mleq->mleq_entshift); 899 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 900 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 901 902 mlxcx_dma_acc_attr(mlxp, &acc); 903 mlxcx_dma_queue_attr(mlxp, &attr); 904 905 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 906 B_TRUE, sz, B_TRUE); 907 if (!ret) { 908 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 909 return (B_FALSE); 910 } 911 912 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 913 914 for (i = 0; i < mleq->mleq_nents; ++i) 915 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 916 917 mleq->mleq_state |= MLXCX_EQ_ALLOC; 918 919 return (B_TRUE); 920 } 921 922 static void 923 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 924 { 925 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 926 if (mleq->mleq_state & MLXCX_EQ_CREATED) 927 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 928 929 mlxcx_dma_free(&mleq->mleq_dma); 930 mleq->mleq_ent = NULL; 931 932 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 933 } 934 935 void 936 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 937 { 938 mlxcx_flow_group_t *fg; 939 mlxcx_flow_entry_t *fe; 940 int i; 941 942 ASSERT(mutex_owned(&ft->mlft_mtx)); 943 944 for (i = ft->mlft_nents - 1; i >= 0; --i) { 945 fe = &ft->mlft_ent[i]; 946 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 947 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 948 mlxcx_panic(mlxp, "failed to delete flow " 949 "entry %u on table %u", i, 950 ft->mlft_num); 951 } 952 } 953 } 954 955 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 956 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 957 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 958 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 959 mlxcx_panic(mlxp, "failed to destroy flow " 960 "group %u", fg->mlfg_num); 961 } 962 } 963 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 964 } 965 list_destroy(&ft->mlft_groups); 966 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 967 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 968 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 969 mlxcx_panic(mlxp, "failed to destroy flow table %u", 970 ft->mlft_num); 971 } 972 } 973 kmem_free(ft->mlft_ent, ft->mlft_entsize); 974 ft->mlft_ent = NULL; 975 mutex_exit(&ft->mlft_mtx); 976 mutex_destroy(&ft->mlft_mtx); 977 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 978 } 979 980 static void 981 mlxcx_teardown_ports(mlxcx_t *mlxp) 982 { 983 uint_t i; 984 mlxcx_port_t *p; 985 mlxcx_flow_table_t *ft; 986 987 for (i = 0; i < mlxp->mlx_nports; ++i) { 988 p = &mlxp->mlx_ports[i]; 989 if (!(p->mlp_init & MLXCX_PORT_INIT)) 990 continue; 991 mutex_enter(&p->mlp_mtx); 992 if ((ft = p->mlp_rx_flow) != NULL) { 993 mutex_enter(&ft->mlft_mtx); 994 /* 995 * 
teardown_flow_table() will destroy the mutex, so 996 * we don't release it here. 997 */ 998 mlxcx_teardown_flow_table(mlxp, ft); 999 } 1000 mutex_exit(&p->mlp_mtx); 1001 mutex_destroy(&p->mlp_mtx); 1002 mutex_destroy(&p->mlx_port_event.mla_mtx); 1003 p->mlx_port_event.mla_mlx = NULL; 1004 p->mlx_port_event.mla_port = NULL; 1005 p->mlp_init &= ~MLXCX_PORT_INIT; 1006 } 1007 1008 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 1009 mlxp->mlx_ports = NULL; 1010 } 1011 1012 static void 1013 mlxcx_teardown_wqs(mlxcx_t *mlxp) 1014 { 1015 mlxcx_work_queue_t *mlwq; 1016 1017 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 1018 mlxcx_wq_teardown(mlxp, mlwq); 1019 } 1020 list_destroy(&mlxp->mlx_wqs); 1021 } 1022 1023 static void 1024 mlxcx_teardown_cqs(mlxcx_t *mlxp) 1025 { 1026 mlxcx_completion_queue_t *mlcq; 1027 1028 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 1029 mlxcx_cq_teardown(mlxp, mlcq); 1030 } 1031 list_destroy(&mlxp->mlx_cqs); 1032 } 1033 1034 static void 1035 mlxcx_teardown_eqs(mlxcx_t *mlxp) 1036 { 1037 mlxcx_event_queue_t *mleq; 1038 uint_t i; 1039 1040 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1041 mleq = &mlxp->mlx_eqs[i]; 1042 mutex_enter(&mleq->mleq_mtx); 1043 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1044 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1045 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1046 mlxcx_warn(mlxp, "failed to destroy " 1047 "event queue idx %u eqn %u", 1048 i, mleq->mleq_num); 1049 } 1050 } 1051 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1052 mlxcx_eq_rele_dma(mlxp, mleq); 1053 } 1054 mutex_exit(&mleq->mleq_mtx); 1055 } 1056 } 1057 1058 static void 1059 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1060 { 1061 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1062 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1063 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1064 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1065 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1066 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1067 } 1068 1069 static void 1070 mlxcx_teardown(mlxcx_t *mlxp) 1071 { 1072 uint_t i; 1073 dev_info_t *dip = mlxp->mlx_dip; 1074 1075 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1076 /* 1077 * Disable interrupts and let any active vectors quiesce. 
1078 */ 1079 mlxcx_intr_disable(mlxp); 1080 } 1081 1082 if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) { 1083 mlxcx_teardown_sensors(mlxp); 1084 mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS; 1085 } 1086 1087 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1088 mlxcx_teardown_checktimers(mlxp); 1089 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1090 } 1091 1092 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1093 mlxcx_teardown_groups(mlxp); 1094 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1095 } 1096 1097 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1098 mlxcx_teardown_wqs(mlxp); 1099 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1100 } 1101 1102 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1103 mlxcx_teardown_cqs(mlxp); 1104 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1105 } 1106 1107 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1108 mlxcx_teardown_bufs(mlxp); 1109 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1110 } 1111 1112 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1113 mlxcx_teardown_ports(mlxp); 1114 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1115 } 1116 1117 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1118 mlxcx_teardown_eqs(mlxp); 1119 mlxcx_intr_teardown(mlxp); 1120 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1121 } 1122 1123 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1124 if (mlxp->mlx_uar.mlu_allocated) { 1125 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1126 mlxcx_warn(mlxp, "failed to release UAR"); 1127 } 1128 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1129 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1130 } 1131 if (mlxp->mlx_pd.mlpd_allocated && 1132 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1133 mlxcx_warn(mlxp, "failed to release PD"); 1134 } 1135 if (mlxp->mlx_tdom.mltd_allocated && 1136 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1137 mlxcx_warn(mlxp, "failed to release TDOM"); 1138 } 1139 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1140 } 1141 1142 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1143 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1144 mlxcx_warn(mlxp, "failed to send teardown HCA " 1145 "command during device detach"); 1146 } 1147 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1148 } 1149 1150 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1151 mlxcx_teardown_pages(mlxp); 1152 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1153 } 1154 1155 if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) { 1156 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 1157 mlxp->mlx_npages_req[i].mla_mlx = NULL; 1158 mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx); 1159 } 1160 taskq_destroy(mlxp->mlx_async_tq); 1161 mlxp->mlx_async_tq = NULL; 1162 mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ; 1163 } 1164 1165 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1166 if (!mlxcx_cmd_disable_hca(mlxp)) { 1167 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1168 "during device detach"); 1169 } 1170 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1171 } 1172 1173 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1174 mlxcx_cmd_queue_fini(mlxp); 1175 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1176 } 1177 1178 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1179 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1180 mlxp->mlx_caps = NULL; 1181 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1182 } 1183 1184 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1185 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1186 mlxp->mlx_regs_handle = NULL; 1187 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1188 } 1189 1190 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1191 pci_config_teardown(&mlxp->mlx_cfg_handle); 1192 mlxp->mlx_cfg_handle = NULL; 1193 
mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1194 } 1195 1196 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1197 mlxcx_fm_fini(mlxp); 1198 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1199 } 1200 1201 VERIFY3S(mlxp->mlx_attach, ==, 0); 1202 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1203 ddi_set_driver_private(dip, NULL); 1204 } 1205 1206 static void 1207 mlxcx_get_model(mlxcx_t *mlxp) 1208 { 1209 uint16_t venid; 1210 uint16_t devid; 1211 1212 venid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_VENID); 1213 if (venid != MLXCX_VENDOR_ID) { 1214 /* Currently, all supported cards have a Mellanox vendor id. */ 1215 mlxp->mlx_type = MLXCX_DEV_UNKNOWN; 1216 return; 1217 } 1218 1219 devid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_DEVID); 1220 switch (devid) { 1221 case MLXCX_CX4_DEVID: 1222 case MLXCX_CX4_VF_DEVID: 1223 case MLXCX_CX4_LX_VF_DEVID: 1224 mlxp->mlx_type = MLXCX_DEV_CX4; 1225 break; 1226 case MLXCX_CX5_DEVID: 1227 case MLXCX_CX5_VF_DEVID: 1228 case MLXCX_CX5_EX_DEVID: 1229 case MLXCX_CX5_EX_VF_DEVID: 1230 case MLXCX_CX5_GEN_VF_DEVID: 1231 mlxp->mlx_type = MLXCX_DEV_CX5; 1232 break; 1233 case MLXCX_CX6_DEVID: 1234 case MLXCX_CX6_VF_DEVID: 1235 case MLXCX_CX6_DF_DEVID: 1236 case MLXCX_CX6_LX_DEVID: 1237 mlxp->mlx_type = MLXCX_DEV_CX6; 1238 break; 1239 default: 1240 mlxp->mlx_type = MLXCX_DEV_UNKNOWN; 1241 } 1242 } 1243 1244 static boolean_t 1245 mlxcx_regs_map(mlxcx_t *mlxp) 1246 { 1247 off_t memsize; 1248 int ret; 1249 ddi_device_acc_attr_t da; 1250 1251 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1252 DDI_SUCCESS) { 1253 mlxcx_warn(mlxp, "failed to get register set size"); 1254 return (B_FALSE); 1255 } 1256 1257 /* 1258 * All data in the main BAR is kept in big-endian even though it's a PCI 1259 * device. 1260 */ 1261 bzero(&da, sizeof (ddi_device_acc_attr_t)); 1262 da.devacc_attr_version = DDI_DEVICE_ATTR_V0; 1263 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; 1264 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 1265 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { 1266 da.devacc_attr_access = DDI_FLAGERR_ACC; 1267 } else { 1268 da.devacc_attr_access = DDI_DEFAULT_ACC; 1269 } 1270 1271 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER, 1272 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle); 1273 1274 if (ret != DDI_SUCCESS) { 1275 mlxcx_warn(mlxp, "failed to map device registers: %d", ret); 1276 return (B_FALSE); 1277 } 1278 1279 return (B_TRUE); 1280 } 1281 1282 static boolean_t 1283 mlxcx_check_issi(mlxcx_t *mlxp) 1284 { 1285 uint32_t issi; 1286 1287 if (!mlxcx_cmd_query_issi(mlxp, &issi)) { 1288 mlxcx_warn(mlxp, "failed to get ISSI"); 1289 return (B_FALSE); 1290 } 1291 1292 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) { 1293 mlxcx_warn(mlxp, "hardware does not support software ISSI, " 1294 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI); 1295 return (B_FALSE); 1296 } 1297 1298 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) { 1299 mlxcx_warn(mlxp, "failed to set ISSI to %u", 1300 MLXCX_CURRENT_ISSI); 1301 return (B_FALSE); 1302 } 1303 1304 return (B_TRUE); 1305 } 1306 1307 boolean_t 1308 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven) 1309 { 1310 ddi_device_acc_attr_t acc; 1311 ddi_dma_attr_t attr; 1312 int32_t i; 1313 list_t plist; 1314 mlxcx_dev_page_t *mdp; 1315 mlxcx_dev_page_t **pages; 1316 const ddi_dma_cookie_t *ck; 1317 1318 /* 1319 * If there are no pages required, then we're done here. 
1320 */ 1321 if (npages <= 0) { 1322 *ngiven = 0; 1323 return (B_TRUE); 1324 } 1325 1326 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 1327 1328 pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP); 1329 1330 list_create(&plist, sizeof (mlxcx_dev_page_t), 1331 offsetof(mlxcx_dev_page_t, mxdp_list)); 1332 1333 for (i = 0; i < npages; i++) { 1334 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 1335 mlxcx_dma_acc_attr(mlxp, &acc); 1336 mlxcx_dma_page_attr(mlxp, &attr); 1337 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 1338 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 1339 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 1340 npages); 1341 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1342 goto cleanup_npages; 1343 } 1344 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 1345 mdp->mxdp_pa = ck->dmac_laddress; 1346 1347 list_insert_tail(&plist, mdp); 1348 } 1349 1350 /* 1351 * Now that all of the pages have been allocated, given them to hardware 1352 * in chunks. 1353 */ 1354 for (i = 0; i < npages; i++) { 1355 pages[i] = list_remove_head(&plist); 1356 } 1357 1358 if (!mlxcx_cmd_give_pages(mlxp, 1359 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) { 1360 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1361 "pages!", npages); 1362 for (i = 0; i < npages; i++) { 1363 list_insert_tail(&plist, pages[i]); 1364 } 1365 goto cleanup_npages; 1366 } 1367 1368 mutex_enter(&mlxp->mlx_pagemtx); 1369 for (i = 0; i < npages; i++) { 1370 avl_add(&mlxp->mlx_pages, pages[i]); 1371 } 1372 mlxp->mlx_npages += npages; 1373 mutex_exit(&mlxp->mlx_pagemtx); 1374 1375 list_destroy(&plist); 1376 kmem_free(pages, sizeof (*pages) * npages); 1377 1378 *ngiven = npages; 1379 1380 return (B_TRUE); 1381 1382 cleanup_npages: 1383 kmem_free(pages, sizeof (*pages) * npages); 1384 while ((mdp = list_remove_head(&plist)) != NULL) { 1385 mlxcx_dma_free(&mdp->mxdp_dma); 1386 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1387 } 1388 list_destroy(&plist); 1389 return (B_FALSE); 1390 } 1391 1392 static boolean_t 1393 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1394 { 1395 int32_t npages, given; 1396 1397 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1398 mlxcx_warn(mlxp, "failed to determine boot pages"); 1399 return (B_FALSE); 1400 } 1401 1402 while (npages > 0) { 1403 if (!mlxcx_give_pages(mlxp, npages, &given)) 1404 return (B_FALSE); 1405 1406 npages -= given; 1407 } 1408 1409 return (B_TRUE); 1410 } 1411 1412 static int 1413 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1414 { 1415 mlxcx_t *mlxp = cookie; 1416 mlxcx_buffer_t *b = arg; 1417 1418 bzero(b, sizeof (mlxcx_buffer_t)); 1419 b->mlb_mlx = mlxp; 1420 b->mlb_state = MLXCX_BUFFER_INIT; 1421 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1422 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1423 1424 return (0); 1425 } 1426 1427 static void 1428 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1429 { 1430 mlxcx_t *mlxp = cookie; 1431 mlxcx_buffer_t *b = arg; 1432 VERIFY3P(b->mlb_mlx, ==, mlxp); 1433 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1434 list_destroy(&b->mlb_tx_chain); 1435 } 1436 1437 mlxcx_buf_shard_t * 1438 mlxcx_mlbs_create(mlxcx_t *mlxp) 1439 { 1440 mlxcx_buf_shard_t *s; 1441 1442 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1443 1444 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1445 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1446 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1447 offsetof(mlxcx_buffer_t, mlb_entry)); 1448 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1449 offsetof(mlxcx_buffer_t, 
mlb_entry)); 1450 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), 1451 offsetof(mlxcx_buffer_t, mlb_entry)); 1452 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1453 1454 list_insert_tail(&mlxp->mlx_buf_shards, s); 1455 1456 return (s); 1457 } 1458 1459 static boolean_t 1460 mlxcx_setup_bufs(mlxcx_t *mlxp) 1461 { 1462 char namebuf[KSTAT_STRLEN]; 1463 1464 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1465 ddi_get_instance(mlxp->mlx_dip)); 1466 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1467 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1468 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1469 NULL, mlxp, NULL, 0); 1470 1471 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1472 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1473 1474 return (B_TRUE); 1475 } 1476 1477 static void 1478 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1479 const char *state, uint8_t statenum) 1480 { 1481 uint64_t ena; 1482 char buf[FM_MAX_CLASS]; 1483 1484 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1485 return; 1486 1487 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1488 MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1489 ena = fm_ena_generate(0, FM_ENA_FMT1); 1490 1491 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1492 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1493 "state", DATA_TYPE_STRING, state, 1494 "state_num", DATA_TYPE_UINT8, statenum, 1495 "qtype", DATA_TYPE_STRING, qtype, 1496 "qnum", DATA_TYPE_UINT32, qnum, 1497 NULL); 1498 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1499 } 1500 1501 /* 1502 * The following set of routines are for monitoring the health of 1503 * event, completion and work queues. They run infrequently peeking at 1504 * the structs to catch stalls and inconsistent state. 1505 * 1506 * They peek at the structs *without* acquiring locks - we don't want 1507 * to impede flow of data. Driver start up and shutdown semantics 1508 * guarantee the structs are present and won't disappear underneath 1509 * these routines. 1510 * 1511 * As previously noted, the routines peek at active data in the structs and 1512 * they will store some values for comparison on next invocation. To 1513 * maintain integrity of the saved values, these values are only modified 1514 * within these routines. 1515 */ 1516 static void 1517 mlxcx_eq_check(void *arg) 1518 { 1519 mlxcx_t *mlxp = (mlxcx_t *)arg; 1520 mlxcx_event_queue_t *eq; 1521 mlxcx_eventq_ctx_t ctx; 1522 const char *str; 1523 1524 uint_t i; 1525 1526 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1527 eq = &mlxp->mlx_eqs[i]; 1528 1529 if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0) 1530 continue; 1531 1532 /* 1533 * If the event queue was successfully created in the HCA, 1534 * then initialization and shutdown sequences guarantee 1535 * the queue exists. 
1536 */ 1537 ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED); 1538 1539 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) 1540 continue; 1541 1542 str = "???"; 1543 switch (ctx.mleqc_status) { 1544 case MLXCX_EQ_STATUS_OK: 1545 break; 1546 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1547 str = "WRITE_FAILURE"; 1548 break; 1549 } 1550 1551 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1552 mlxcx_fm_qstate_ereport(mlxp, "event", 1553 eq->mleq_num, str, ctx.mleqc_status); 1554 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1555 eq->mleq_intr_index, ctx.mleqc_status, str); 1556 } 1557 1558 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1559 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1560 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1561 ++eq->mleq_check_disarm_cnt >= 3) { 1562 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1563 mlxcx_warn(mlxp, "EQ %u isn't armed", 1564 eq->mleq_intr_index); 1565 } 1566 eq->mleq_check_disarm_cc = eq->mleq_cc; 1567 } else { 1568 eq->mleq_check_disarm_cc = 0; 1569 eq->mleq_check_disarm_cnt = 0; 1570 } 1571 } 1572 } 1573 1574 static void 1575 mlxcx_cq_check(void *arg) 1576 { 1577 mlxcx_t *mlxp = (mlxcx_t *)arg; 1578 mlxcx_completion_queue_t *cq; 1579 mlxcx_completionq_ctx_t ctx; 1580 const char *str, *type; 1581 uint_t v; 1582 1583 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1584 cq = list_next(&mlxp->mlx_cqs, cq)) { 1585 1586 if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0) 1587 continue; 1588 1589 /* 1590 * If the completion queue was successfully created in the HCA, 1591 * then initialization and shutdown sequences guarantee 1592 * the queue exists. 1593 */ 1594 ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED); 1595 ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN); 1596 1597 if (cq->mlcq_fm_repd_qstate) 1598 continue; 1599 1600 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) 1601 continue; 1602 1603 if (cq->mlcq_wq != NULL) { 1604 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1605 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1606 type = "rx "; 1607 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1608 type = "tx "; 1609 else 1610 type = ""; 1611 } else { 1612 type = ""; 1613 } 1614 1615 str = "???"; 1616 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1617 switch (v) { 1618 case MLXCX_CQC_STATUS_OK: 1619 break; 1620 case MLXCX_CQC_STATUS_OVERFLOW: 1621 str = "OVERFLOW"; 1622 break; 1623 case MLXCX_CQC_STATUS_WRITE_FAIL: 1624 str = "WRITE_FAIL"; 1625 break; 1626 case MLXCX_CQC_STATUS_INVALID: 1627 str = "INVALID"; 1628 break; 1629 } 1630 1631 if (v != MLXCX_CQC_STATUS_OK) { 1632 mlxcx_fm_qstate_ereport(mlxp, "completion", 1633 cq->mlcq_num, str, v); 1634 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1635 type, cq->mlcq_num, v, str); 1636 cq->mlcq_fm_repd_qstate = B_TRUE; 1637 } 1638 1639 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1640 if (v != MLXCX_CQC_STATE_ARMED && 1641 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1642 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1643 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1644 ++cq->mlcq_check_disarm_cnt >= 3) { 1645 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1646 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1647 type, cq->mlcq_num, cq); 1648 } 1649 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1650 } else { 1651 cq->mlcq_check_disarm_cnt = 0; 1652 cq->mlcq_check_disarm_cc = 0; 1653 } 1654 } 1655 } 1656 1657 void 1658 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1659 { 1660 mlxcx_sq_ctx_t ctx; 1661 mlxcx_sq_state_t state; 1662 1663 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1664 return; 1665 1666 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, 
sq->mlwq_cq->mlcq_num); 1667 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1668 switch (state) { 1669 case MLXCX_SQ_STATE_RST: 1670 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1671 mlxcx_fm_qstate_ereport(mlxp, "send", 1672 sq->mlwq_num, "RST", state); 1673 sq->mlwq_fm_repd_qstate = B_TRUE; 1674 } 1675 break; 1676 case MLXCX_SQ_STATE_RDY: 1677 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1678 mlxcx_fm_qstate_ereport(mlxp, "send", 1679 sq->mlwq_num, "RDY", state); 1680 sq->mlwq_fm_repd_qstate = B_TRUE; 1681 } 1682 break; 1683 case MLXCX_SQ_STATE_ERR: 1684 mlxcx_fm_qstate_ereport(mlxp, "send", 1685 sq->mlwq_num, "ERR", state); 1686 sq->mlwq_fm_repd_qstate = B_TRUE; 1687 break; 1688 default: 1689 mlxcx_fm_qstate_ereport(mlxp, "send", 1690 sq->mlwq_num, "???", state); 1691 sq->mlwq_fm_repd_qstate = B_TRUE; 1692 break; 1693 } 1694 } 1695 1696 void 1697 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1698 { 1699 mlxcx_rq_ctx_t ctx; 1700 mlxcx_rq_state_t state; 1701 1702 1703 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1704 return; 1705 1706 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1707 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1708 switch (state) { 1709 case MLXCX_RQ_STATE_RST: 1710 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1711 mlxcx_fm_qstate_ereport(mlxp, "receive", 1712 rq->mlwq_num, "RST", state); 1713 rq->mlwq_fm_repd_qstate = B_TRUE; 1714 } 1715 break; 1716 case MLXCX_RQ_STATE_RDY: 1717 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1718 mlxcx_fm_qstate_ereport(mlxp, "receive", 1719 rq->mlwq_num, "RDY", state); 1720 rq->mlwq_fm_repd_qstate = B_TRUE; 1721 } 1722 break; 1723 case MLXCX_RQ_STATE_ERR: 1724 mlxcx_fm_qstate_ereport(mlxp, "receive", 1725 rq->mlwq_num, "ERR", state); 1726 rq->mlwq_fm_repd_qstate = B_TRUE; 1727 break; 1728 default: 1729 mlxcx_fm_qstate_ereport(mlxp, "receive", 1730 rq->mlwq_num, "???", state); 1731 rq->mlwq_fm_repd_qstate = B_TRUE; 1732 break; 1733 } 1734 } 1735 1736 static void 1737 mlxcx_wq_check(void *arg) 1738 { 1739 mlxcx_t *mlxp = (mlxcx_t *)arg; 1740 mlxcx_work_queue_t *wq; 1741 1742 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1743 wq = list_next(&mlxp->mlx_wqs, wq)) { 1744 1745 if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0) 1746 continue; 1747 1748 /* 1749 * If the work queue was successfully created in the HCA, 1750 * then initialization and shutdown sequences guarantee 1751 * the queue exists. 
1752 */ 1753 ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED); 1754 ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN); 1755 1756 if (wq->mlwq_fm_repd_qstate) 1757 continue; 1758 1759 switch (wq->mlwq_type) { 1760 case MLXCX_WQ_TYPE_SENDQ: 1761 mlxcx_check_sq(mlxp, wq); 1762 break; 1763 case MLXCX_WQ_TYPE_RECVQ: 1764 mlxcx_check_rq(mlxp, wq); 1765 break; 1766 } 1767 } 1768 } 1769 1770 static boolean_t 1771 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1772 { 1773 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1774 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1775 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1776 DDI_IPL_0); 1777 } 1778 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1779 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1780 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1781 DDI_IPL_0); 1782 } 1783 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1784 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1785 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1786 DDI_IPL_0); 1787 } 1788 return (B_TRUE); 1789 } 1790 1791 int 1792 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1793 { 1794 const mlxcx_flow_entry_t *left = arg0; 1795 const mlxcx_flow_entry_t *right = arg1; 1796 int bcmpr; 1797 1798 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1799 sizeof (left->mlfe_dmac)); 1800 if (bcmpr < 0) 1801 return (-1); 1802 if (bcmpr > 0) 1803 return (1); 1804 if (left->mlfe_vid < right->mlfe_vid) 1805 return (-1); 1806 if (left->mlfe_vid > right->mlfe_vid) 1807 return (1); 1808 return (0); 1809 } 1810 1811 int 1812 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1813 { 1814 const mlxcx_group_mac_t *left = arg0; 1815 const mlxcx_group_mac_t *right = arg1; 1816 int bcmpr; 1817 1818 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1819 sizeof (left->mlgm_mac)); 1820 if (bcmpr < 0) 1821 return (-1); 1822 if (bcmpr > 0) 1823 return (1); 1824 return (0); 1825 } 1826 1827 int 1828 mlxcx_page_compare(const void *arg0, const void *arg1) 1829 { 1830 const mlxcx_dev_page_t *p0 = arg0; 1831 const mlxcx_dev_page_t *p1 = arg1; 1832 1833 if (p0->mxdp_pa < p1->mxdp_pa) 1834 return (-1); 1835 if (p0->mxdp_pa > p1->mxdp_pa) 1836 return (1); 1837 return (0); 1838 } 1839 1840 static boolean_t 1841 mlxcx_setup_ports(mlxcx_t *mlxp) 1842 { 1843 uint_t i, j; 1844 mlxcx_port_t *p; 1845 mlxcx_flow_table_t *ft; 1846 mlxcx_flow_group_t *fg; 1847 mlxcx_flow_entry_t *fe; 1848 1849 VERIFY3U(mlxp->mlx_nports, >, 0); 1850 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1851 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1852 1853 for (i = 0; i < mlxp->mlx_nports; ++i) { 1854 p = &mlxp->mlx_ports[i]; 1855 p->mlp_num = i; 1856 p->mlx_port_event.mla_mlx = mlxp; 1857 p->mlx_port_event.mla_port = p; 1858 mutex_init(&p->mlx_port_event.mla_mtx, NULL, 1859 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 1860 p->mlp_init |= MLXCX_PORT_INIT; 1861 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1862 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1863 mutex_enter(&p->mlp_mtx); 1864 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1865 mutex_exit(&p->mlp_mtx); 1866 goto err; 1867 } 1868 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1869 mutex_exit(&p->mlp_mtx); 1870 goto err; 1871 } 1872 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1873 mutex_exit(&p->mlp_mtx); 1874 goto err; 1875 } 1876 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1877 mutex_exit(&p->mlp_mtx); 1878 goto err; 1879 } 1880 if 
(!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1881 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1882 mutex_exit(&p->mlp_mtx); 1883 goto err; 1884 } 1885 if (!mlxcx_cmd_query_port_fec(mlxp, p)) { 1886 mutex_exit(&p->mlp_mtx); 1887 goto err; 1888 } 1889 p->mlp_fec_requested = LINK_FEC_AUTO; 1890 1891 mutex_exit(&p->mlp_mtx); 1892 } 1893 1894 for (i = 0; i < mlxp->mlx_nports; ++i) { 1895 p = &mlxp->mlx_ports[i]; 1896 mutex_enter(&p->mlp_mtx); 1897 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1898 KM_SLEEP)); 1899 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1900 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1901 1902 mutex_enter(&ft->mlft_mtx); 1903 1904 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1905 ft->mlft_port = p; 1906 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1907 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1908 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1909 ft->mlft_nents = (1 << ft->mlft_entshift); 1910 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1911 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1912 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1913 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1914 1915 for (j = 0; j < ft->mlft_nents; ++j) { 1916 ft->mlft_ent[j].mlfe_table = ft; 1917 ft->mlft_ent[j].mlfe_index = j; 1918 } 1919 1920 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1921 mutex_exit(&ft->mlft_mtx); 1922 mutex_exit(&p->mlp_mtx); 1923 goto err; 1924 } 1925 1926 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1927 mutex_exit(&ft->mlft_mtx); 1928 mutex_exit(&p->mlp_mtx); 1929 goto err; 1930 } 1931 1932 /* 1933 * We match broadcast at the top of the root flow table, then 1934 * all multicast/unicast MACs, then the promisc entry is down 1935 * the very bottom. 1936 * 1937 * This way when promisc is on, that entry simply catches any 1938 * remaining traffic that earlier flows haven't matched. 
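 *
 * As a rough sketch, the three flow groups set up just below carve the
 * root table into:
 *
 *   entry 0                        broadcast (dmac ff:ff:ff:ff:ff:ff)
 *   entries 1 .. mlft_nents - 2    individual unicast/multicast dmacs,
 *                                  added later via mlxcx_add_umcast_entry()
 *   entry mlft_nents - 1           promisc catch-all (no match criteria)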
1939 */ 1940 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1941 list_insert_tail(&ft->mlft_groups, fg); 1942 fg->mlfg_table = ft; 1943 fg->mlfg_size = 1; 1944 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1945 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1946 mutex_exit(&ft->mlft_mtx); 1947 mutex_exit(&p->mlp_mtx); 1948 goto err; 1949 } 1950 p->mlp_bcast = fg; 1951 fe = list_head(&fg->mlfg_entries); 1952 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1953 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1954 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1955 1956 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1957 list_insert_tail(&ft->mlft_groups, fg); 1958 fg->mlfg_table = ft; 1959 fg->mlfg_size = ft->mlft_nents - 2; 1960 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1961 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1962 mutex_exit(&ft->mlft_mtx); 1963 mutex_exit(&p->mlp_mtx); 1964 goto err; 1965 } 1966 p->mlp_umcast = fg; 1967 1968 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1969 list_insert_tail(&ft->mlft_groups, fg); 1970 fg->mlfg_table = ft; 1971 fg->mlfg_size = 1; 1972 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1973 mutex_exit(&ft->mlft_mtx); 1974 mutex_exit(&p->mlp_mtx); 1975 goto err; 1976 } 1977 p->mlp_promisc = fg; 1978 fe = list_head(&fg->mlfg_entries); 1979 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1980 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1981 1982 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1983 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1984 mlfe_dmac_entry)); 1985 1986 mutex_exit(&ft->mlft_mtx); 1987 mutex_exit(&p->mlp_mtx); 1988 } 1989 1990 return (B_TRUE); 1991 1992 err: 1993 mlxcx_teardown_ports(mlxp); 1994 return (B_FALSE); 1995 } 1996 1997 void 1998 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1999 { 2000 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2001 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2002 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2003 mlxcx_flow_entry_t *fe; 2004 mlxcx_group_vlan_t *v; 2005 2006 ASSERT(mutex_owned(&g->mlg_mtx)); 2007 2008 mutex_enter(&ft->mlft_mtx); 2009 2010 if (!list_is_empty(&g->mlg_rx_vlans)) { 2011 fe = list_head(&dfg->mlfg_entries); 2012 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2013 } 2014 2015 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 2016 fe = v->mlgv_fe; 2017 ASSERT3P(fe->mlfe_table, ==, ft); 2018 ASSERT3P(fe->mlfe_group, ==, fg); 2019 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2020 2021 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2022 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2023 } 2024 2025 mutex_exit(&ft->mlft_mtx); 2026 } 2027 2028 boolean_t 2029 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 2030 boolean_t tagged, uint16_t vid) 2031 { 2032 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2033 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2034 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2035 mlxcx_flow_entry_t *fe; 2036 mlxcx_group_vlan_t *v; 2037 boolean_t found = B_FALSE; 2038 2039 ASSERT(mutex_owned(&g->mlg_mtx)); 2040 2041 mutex_enter(&ft->mlft_mtx); 2042 2043 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2044 v = list_next(&g->mlg_rx_vlans, v)) { 2045 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2046 found = B_TRUE; 2047 break; 2048 } 2049 } 2050 if (!found) { 2051 mutex_exit(&ft->mlft_mtx); 2052 return (B_FALSE); 2053 } 2054 2055 list_remove(&g->mlg_rx_vlans, v); 2056 2057 /* 2058 * If this is the last VLAN entry, we have to go back to accepting 2059 * any VLAN (which 
means re-enabling the default entry). 2060 * 2061 * Do this before we remove the flow entry for the last specific 2062 * VLAN so that we don't lose any traffic in the transition. 2063 */ 2064 if (list_is_empty(&g->mlg_rx_vlans)) { 2065 fe = list_head(&dfg->mlfg_entries); 2066 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2067 list_insert_tail(&g->mlg_rx_vlans, v); 2068 mutex_exit(&ft->mlft_mtx); 2069 return (B_FALSE); 2070 } 2071 } 2072 2073 fe = v->mlgv_fe; 2074 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 2075 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 2076 ASSERT3P(fe->mlfe_table, ==, ft); 2077 ASSERT3P(fe->mlfe_group, ==, fg); 2078 2079 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 2080 list_insert_tail(&g->mlg_rx_vlans, v); 2081 fe = list_head(&dfg->mlfg_entries); 2082 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 2083 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2084 } 2085 mutex_exit(&ft->mlft_mtx); 2086 return (B_FALSE); 2087 } 2088 2089 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2090 2091 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2092 2093 mutex_exit(&ft->mlft_mtx); 2094 return (B_TRUE); 2095 } 2096 2097 boolean_t 2098 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 2099 uint16_t vid) 2100 { 2101 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2102 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2103 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2104 mlxcx_flow_entry_t *fe; 2105 mlxcx_group_vlan_t *v; 2106 boolean_t found = B_FALSE; 2107 boolean_t first = B_FALSE; 2108 2109 ASSERT(mutex_owned(&g->mlg_mtx)); 2110 2111 mutex_enter(&ft->mlft_mtx); 2112 2113 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2114 v = list_next(&g->mlg_rx_vlans, v)) { 2115 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2116 mutex_exit(&ft->mlft_mtx); 2117 return (B_TRUE); 2118 } 2119 } 2120 if (list_is_empty(&g->mlg_rx_vlans)) 2121 first = B_TRUE; 2122 2123 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2124 fe = list_next(&fg->mlfg_entries, fe)) { 2125 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2126 found = B_TRUE; 2127 break; 2128 } 2129 } 2130 if (!found) { 2131 mutex_exit(&ft->mlft_mtx); 2132 return (B_FALSE); 2133 } 2134 2135 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 2136 v->mlgv_fe = fe; 2137 v->mlgv_tagged = tagged; 2138 v->mlgv_vid = vid; 2139 2140 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2141 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2142 fe->mlfe_vid = vid; 2143 if (tagged) { 2144 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2145 } else { 2146 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2147 } 2148 2149 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2150 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2151 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2152 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2153 mutex_exit(&ft->mlft_mtx); 2154 return (B_FALSE); 2155 } 2156 2157 list_insert_tail(&g->mlg_rx_vlans, v); 2158 2159 /* 2160 * If the vlan list was empty for this group before adding this one, 2161 * then we no longer want the "default" entry to allow all VLANs 2162 * through. 
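 *
 * Note the ordering: the specific VLAN entry was installed above before
 * the default entry is deleted below, just as mlxcx_remove_vlan_entry()
 * re-enables the default before deleting the last specific entry, so
 * there is never a window in which no entry can match.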
2163 */ 2164 if (first) { 2165 fe = list_head(&dfg->mlfg_entries); 2166 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2167 } 2168 2169 mutex_exit(&ft->mlft_mtx); 2170 return (B_TRUE); 2171 } 2172 2173 void 2174 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2175 mlxcx_ring_group_t *group) 2176 { 2177 mlxcx_flow_entry_t *fe; 2178 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2179 mlxcx_group_mac_t *gm, *ngm; 2180 2181 ASSERT(mutex_owned(&port->mlp_mtx)); 2182 ASSERT(mutex_owned(&group->mlg_mtx)); 2183 2184 mutex_enter(&ft->mlft_mtx); 2185 2186 gm = avl_first(&group->mlg_rx_macs); 2187 for (; gm != NULL; gm = ngm) { 2188 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2189 2190 ASSERT3P(gm->mlgm_group, ==, group); 2191 fe = gm->mlgm_fe; 2192 ASSERT3P(fe->mlfe_table, ==, ft); 2193 2194 avl_remove(&group->mlg_rx_macs, gm); 2195 list_remove(&fe->mlfe_ring_groups, gm); 2196 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2197 2198 fe->mlfe_ndest = 0; 2199 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2200 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2201 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2202 gm->mlgm_group->mlg_rx_vlan_ft; 2203 } 2204 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2205 2206 if (fe->mlfe_ndest > 0) { 2207 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2208 continue; 2209 } 2210 2211 /* 2212 * There are no more ring groups left for this MAC (it wasn't 2213 * attached to any other groups since ndest == 0), so clean up 2214 * its flow entry. 2215 */ 2216 avl_remove(&port->mlp_dmac_fe, fe); 2217 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2218 list_destroy(&fe->mlfe_ring_groups); 2219 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2220 } 2221 2222 mutex_exit(&ft->mlft_mtx); 2223 } 2224 2225 boolean_t 2226 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2227 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2228 { 2229 mlxcx_flow_entry_t *fe; 2230 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2231 mlxcx_group_mac_t *gm, probe; 2232 2233 ASSERT(mutex_owned(&port->mlp_mtx)); 2234 ASSERT(mutex_owned(&group->mlg_mtx)); 2235 2236 bzero(&probe, sizeof (probe)); 2237 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2238 2239 mutex_enter(&ft->mlft_mtx); 2240 2241 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2242 if (gm == NULL) { 2243 mutex_exit(&ft->mlft_mtx); 2244 return (B_FALSE); 2245 } 2246 ASSERT3P(gm->mlgm_group, ==, group); 2247 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2248 2249 fe = gm->mlgm_fe; 2250 ASSERT3P(fe->mlfe_table, ==, ft); 2251 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2252 2253 list_remove(&fe->mlfe_ring_groups, gm); 2254 avl_remove(&group->mlg_rx_macs, gm); 2255 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2256 2257 fe->mlfe_ndest = 0; 2258 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2259 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2260 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2261 gm->mlgm_group->mlg_rx_vlan_ft; 2262 } 2263 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2264 2265 if (fe->mlfe_ndest > 0) { 2266 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2267 mutex_exit(&ft->mlft_mtx); 2268 return (B_FALSE); 2269 } 2270 mutex_exit(&ft->mlft_mtx); 2271 return (B_TRUE); 2272 } 2273 2274 /* 2275 * There are no more ring groups left for this MAC (it wasn't attached 2276 * to any other groups since ndest == 0), so clean up its flow entry. 
2277 */ 2278 avl_remove(&port->mlp_dmac_fe, fe); 2279 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2280 list_destroy(&fe->mlfe_ring_groups); 2281 2282 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2283 2284 mutex_exit(&ft->mlft_mtx); 2285 2286 return (B_TRUE); 2287 } 2288 2289 boolean_t 2290 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2291 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2292 { 2293 mlxcx_flow_group_t *fg; 2294 mlxcx_flow_entry_t *fe, probe; 2295 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2296 mlxcx_group_mac_t *gm; 2297 boolean_t found = B_FALSE; 2298 2299 ASSERT(mutex_owned(&port->mlp_mtx)); 2300 ASSERT(mutex_owned(&group->mlg_mtx)); 2301 2302 bzero(&probe, sizeof (probe)); 2303 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2304 2305 mutex_enter(&ft->mlft_mtx); 2306 2307 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2308 2309 if (fe == NULL) { 2310 fg = port->mlp_umcast; 2311 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2312 fe = list_next(&fg->mlfg_entries, fe)) { 2313 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2314 found = B_TRUE; 2315 break; 2316 } 2317 } 2318 if (!found) { 2319 mutex_exit(&ft->mlft_mtx); 2320 return (B_FALSE); 2321 } 2322 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2323 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2324 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2325 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2326 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2327 2328 avl_add(&port->mlp_dmac_fe, fe); 2329 } 2330 2331 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2332 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2333 2334 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2335 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2336 if (--fe->mlfe_ndest == 0) { 2337 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2338 } 2339 mutex_exit(&ft->mlft_mtx); 2340 return (B_FALSE); 2341 } 2342 2343 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2344 gm->mlgm_group = group; 2345 gm->mlgm_fe = fe; 2346 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2347 avl_add(&group->mlg_rx_macs, gm); 2348 list_insert_tail(&fe->mlfe_ring_groups, gm); 2349 2350 mutex_exit(&ft->mlft_mtx); 2351 2352 return (B_TRUE); 2353 } 2354 2355 boolean_t 2356 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2357 mlxcx_flow_group_t *fg) 2358 { 2359 mlxcx_flow_entry_t *fe; 2360 uint_t i, idx; 2361 2362 ASSERT(mutex_owned(&ft->mlft_mtx)); 2363 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2364 ASSERT3P(fg->mlfg_table, ==, ft); 2365 2366 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2367 return (B_FALSE); 2368 fg->mlfg_start_idx = ft->mlft_next_ent; 2369 2370 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2371 return (B_FALSE); 2372 } 2373 2374 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2375 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2376 for (i = 0; i < fg->mlfg_size; ++i) { 2377 idx = fg->mlfg_start_idx + i; 2378 fe = &ft->mlft_ent[idx]; 2379 fe->mlfe_group = fg; 2380 list_insert_tail(&fg->mlfg_entries, fe); 2381 } 2382 fg->mlfg_avail = fg->mlfg_size; 2383 ft->mlft_next_ent += fg->mlfg_size; 2384 2385 return (B_TRUE); 2386 } 2387 2388 static boolean_t 2389 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events) 2390 { 2391 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec]; 2392 2393 mutex_enter(&mleq->mleq_mtx); 2394 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2395 /* mlxcx_teardown_eqs() will clean this up */ 2396 
mutex_exit(&mleq->mleq_mtx); 2397 return (B_FALSE); 2398 } 2399 mleq->mleq_mlx = mlxp; 2400 mleq->mleq_uar = &mlxp->mlx_uar; 2401 mleq->mleq_events = events; 2402 mleq->mleq_intr_index = vec; 2403 2404 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2405 /* mlxcx_teardown_eqs() will clean this up */ 2406 mutex_exit(&mleq->mleq_mtx); 2407 return (B_FALSE); 2408 } 2409 2410 if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) { 2411 /* 2412 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2413 * eq_rele_dma 2414 */ 2415 mutex_exit(&mleq->mleq_mtx); 2416 return (B_FALSE); 2417 } 2418 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2419 mleq->mleq_state |= MLXCX_EQ_ATTACHING; 2420 mlxcx_arm_eq(mlxp, mleq); 2421 mutex_exit(&mleq->mleq_mtx); 2422 2423 return (B_TRUE); 2424 } 2425 2426 static void 2427 mlxcx_eq_set_attached(mlxcx_t *mlxp) 2428 { 2429 uint_t vec; 2430 mlxcx_event_queue_t *mleq; 2431 2432 for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) { 2433 mleq = &mlxp->mlx_eqs[vec]; 2434 2435 mutex_enter(&mleq->mleq_mtx); 2436 mleq->mleq_state &= ~MLXCX_EQ_ATTACHING; 2437 mutex_exit(&mleq->mleq_mtx); 2438 } 2439 } 2440 2441 static boolean_t 2442 mlxcx_setup_async_eqs(mlxcx_t *mlxp) 2443 { 2444 boolean_t ret; 2445 2446 ret = mlxcx_setup_eq(mlxp, 0, 2447 (1ULL << MLXCX_EVENT_CMD_COMPLETION) | 2448 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2449 (1ULL << MLXCX_EVENT_PORT_STATE) | 2450 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2451 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2452 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2453 (1ULL << MLXCX_EVENT_LAST_WQE) | 2454 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2455 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2456 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2457 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2458 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2459 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2460 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST)); 2461 2462 if (ret) 2463 mlxcx_cmd_eq_enable(mlxp); 2464 2465 return (ret); 2466 } 2467 2468 int 2469 mlxcx_cq_compare(const void *arg0, const void *arg1) 2470 { 2471 const mlxcx_completion_queue_t *left = arg0; 2472 const mlxcx_completion_queue_t *right = arg1; 2473 2474 if (left->mlcq_num < right->mlcq_num) { 2475 return (-1); 2476 } 2477 if (left->mlcq_num > right->mlcq_num) { 2478 return (1); 2479 } 2480 return (0); 2481 } 2482 2483 static boolean_t 2484 mlxcx_setup_eqs(mlxcx_t *mlxp) 2485 { 2486 uint_t i; 2487 mlxcx_event_queue_t *mleq; 2488 2489 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2490 2491 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) { 2492 mleq = &mlxp->mlx_eqs[i]; 2493 mutex_enter(&mleq->mleq_mtx); 2494 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2495 mutex_exit(&mleq->mleq_mtx); 2496 return (B_FALSE); 2497 } 2498 mleq->mleq_uar = &mlxp->mlx_uar; 2499 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2500 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2501 mutex_exit(&mleq->mleq_mtx); 2502 return (B_FALSE); 2503 } 2504 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2505 !mlxcx_cmd_set_int_mod(mlxp, i, 2506 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2507 mutex_exit(&mleq->mleq_mtx); 2508 return (B_FALSE); 2509 } 2510 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2511 mutex_exit(&mleq->mleq_mtx); 2512 return (B_FALSE); 2513 } 2514 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2515 mlxcx_arm_eq(mlxp, mleq); 2516 mutex_exit(&mleq->mleq_mtx); 2517 } 2518 2519 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 2520 2521 return (B_TRUE); 2522 } 2523 2524 /* 2525 * A more recent ConnectX part will have the Port CApability Mask register. 
2526 * Explore it and note things here. 2527 */ 2528 static void 2529 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c) 2530 { 2531 mlxcx_register_data_t data; 2532 mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam; 2533 2534 ASSERT(c->mlc_pcam); 2535 bzero(&data, sizeof (data)); 2536 2537 /* 2538 * Okay, so we have access to the Ports CApability Mask (PCAM). 2539 * There are various things we need to check about it. 2540 */ 2541 2542 VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ, 2543 MLXCX_REG_PCAM, &data)); 2544 2545 /* 2546 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts. 2547 * As of now, only 0 is valid, and 1-255 are reserved. A future part 2548 * may return non-zero in these fields. 2549 */ 2550 ASSERT0(pcam->mlrd_pcam_feature_group); 2551 ASSERT0(pcam->mlrd_pcam_access_reg_group); 2552 2553 c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low, 2554 MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED); 2555 } 2556 2557 /* 2558 * Snapshot all of the hardware capabilities that we care about and then modify 2559 * the HCA capabilities to get things moving. 2560 */ 2561 static boolean_t 2562 mlxcx_init_caps(mlxcx_t *mlxp) 2563 { 2564 mlxcx_caps_t *c; 2565 2566 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP); 2567 2568 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2569 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) { 2570 mlxcx_warn(mlxp, "failed to obtain current HCA general caps"); 2571 } 2572 2573 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2574 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) { 2575 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps"); 2576 } 2577 2578 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2579 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) { 2580 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps"); 2581 } 2582 2583 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2584 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) { 2585 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps"); 2586 } 2587 2588 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2589 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) { 2590 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps"); 2591 } 2592 2593 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2594 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) { 2595 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps"); 2596 } 2597 2598 /* 2599 * Check the caps meet our requirements. 
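 *
 * Concretely, the checks below require a 4k native page size, version 1
 * CQEs, Ethernet-only ports, and NIC rx flow tables whose entries we can
 * modify.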
2600 */ 2601 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2602 2603 if (gen->mlcap_general_log_pg_sz != 12) { 2604 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2605 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2606 goto err; 2607 } 2608 if (gen->mlcap_general_cqe_version != 1) { 2609 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2610 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2611 goto err; 2612 } 2613 if (gen->mlcap_general_port_type != 2614 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2615 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2616 goto err; 2617 } 2618 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2619 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2620 2621 if (mlxp->mlx_type >= MLXCX_DEV_CX5 && 2622 get_bit16(gen->mlcap_general_flags_c, 2623 MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) { 2624 c->mlc_pcam = B_TRUE; 2625 } 2626 2627 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2628 2629 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2630 MLXCX_ETH_CAP_CSUM_CAP); 2631 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2632 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2633 2634 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2635 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2636 if (c->mlc_max_lso_size == 1) { 2637 c->mlc_max_lso_size = 0; 2638 c->mlc_lso = B_FALSE; 2639 } else { 2640 c->mlc_lso = B_TRUE; 2641 } 2642 2643 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2644 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2645 2646 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2647 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2648 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2649 goto err; 2650 } 2651 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2652 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2653 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2654 "flow table entries"); 2655 goto err; 2656 } 2657 2658 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2659 mlcap_flow_prop_log_max_ft_size; 2660 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 2661 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow); 2662 c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow. 2663 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num); 2664 c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow. 
2665 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination); 2666 2667 return (B_TRUE); 2668 2669 err: 2670 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 2671 return (B_FALSE); 2672 } 2673 2674 static int 2675 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2676 { 2677 mlxcx_t *mlxp; 2678 2679 if (cmd != DDI_DETACH) 2680 return (DDI_FAILURE); 2681 2682 mlxp = ddi_get_driver_private(dip); 2683 if (mlxp == NULL) { 2684 mlxcx_warn(NULL, "asked to detach, but missing instance " 2685 "private data"); 2686 return (DDI_FAILURE); 2687 } 2688 2689 if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) { 2690 if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) { 2691 return (DDI_FAILURE); 2692 } 2693 mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL; 2694 } 2695 2696 mlxcx_teardown(mlxp); 2697 return (DDI_SUCCESS); 2698 } 2699 2700 static size_t 2701 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp) 2702 { 2703 size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large + 2704 mlxp->mlx_props.mldp_rx_ngroups_small; 2705 size_t tirlim, flowlim, gflowlim; 2706 2707 tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP; 2708 if (tirlim < ngroups) { 2709 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2710 "on number of TIRs available", tirlim); 2711 ngroups = tirlim; 2712 } 2713 2714 flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2; 2715 if (flowlim < ngroups) { 2716 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2717 "on max size of RX flow tables", flowlim); 2718 ngroups = flowlim; 2719 } 2720 2721 /* 2722 * Restrict the number of groups not to exceed the max flow 2723 * table number from the device's capabilities. 2724 * There is one root table entry per port and 2 entries per 2725 * group. 2726 */ 2727 flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2; 2728 if (flowlim < ngroups) { 2729 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2730 "on max number of RX flow tables", 2731 flowlim); 2732 ngroups = flowlim; 2733 } 2734 2735 do { 2736 gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2; 2737 if (gflowlim < ngroups) { 2738 mlxcx_note(mlxp, "limiting number of rx groups to %u " 2739 "based on max total RX flows", gflowlim); 2740 --ngroups; 2741 } 2742 } while (gflowlim < ngroups); 2743 2744 return (ngroups); 2745 } 2746 2747 static int 2748 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2749 { 2750 mlxcx_t *mlxp; 2751 char tq_name[TASKQ_NAMELEN]; 2752 uint_t i; 2753 int inst, ret; 2754 2755 if (cmd != DDI_ATTACH) 2756 return (DDI_FAILURE); 2757 2758 inst = ddi_get_instance(dip); 2759 ret = ddi_soft_state_zalloc(mlxcx_softstate, inst); 2760 if (ret != 0) 2761 return (ret); 2762 2763 mlxp = ddi_get_soft_state(mlxcx_softstate, inst); 2764 if (mlxp == NULL) 2765 return (DDI_FAILURE); 2766 mlxp->mlx_dip = dip; 2767 mlxp->mlx_inst = inst; 2768 ddi_set_driver_private(dip, mlxp); 2769 2770 mlxcx_load_props(mlxp); 2771 2772 mlxcx_fm_init(mlxp); 2773 mlxp->mlx_attach |= MLXCX_ATTACH_FM; 2774 2775 if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) != 2776 DDI_SUCCESS) { 2777 mlxcx_warn(mlxp, "failed to initialize PCI config space"); 2778 goto err; 2779 } 2780 mlxcx_get_model(mlxp); 2781 mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG; 2782 2783 if (!mlxcx_regs_map(mlxp)) { 2784 goto err; 2785 } 2786 mlxp->mlx_attach |= MLXCX_ATTACH_REGS; 2787 2788 if (!mlxcx_cmd_queue_init(mlxp)) { 2789 goto err; 2790 } 2791 mlxp->mlx_attach |= MLXCX_ATTACH_CMD; 2792 2793 if (!mlxcx_cmd_enable_hca(mlxp)) { 2794 goto err; 2795 } 2796 mlxp->mlx_attach |= 
MLXCX_ATTACH_ENABLE_HCA; 2797 2798 if (!mlxcx_check_issi(mlxp)) { 2799 goto err; 2800 } 2801 2802 /* 2803 * We have to get our interrupts now so we know what priority to 2804 * create pagemtx with. 2805 */ 2806 if (!mlxcx_intr_setup(mlxp)) { 2807 goto err; 2808 } 2809 mlxp->mlx_attach |= MLXCX_ATTACH_INTRS; 2810 2811 mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER, 2812 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2813 avl_create(&mlxp->mlx_pages, mlxcx_page_compare, 2814 sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree)); 2815 mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST; 2816 2817 /* 2818 * Taskq for asynchronous events which may interact with the HCA 2819 * via the command interface. Single threaded FIFO. 2820 */ 2821 (void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d", 2822 ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst); 2823 mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX, 2824 TASKQ_PREPOPULATE); 2825 /* 2826 * Initialize any pre-allocated taskq param structs. 2827 */ 2828 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 2829 mlxp->mlx_npages_req[i].mla_mlx = mlxp; 2830 mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL, 2831 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri)); 2832 } 2833 mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ; 2834 2835 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2836 goto err; 2837 } 2838 2839 if (!mlxcx_init_caps(mlxp)) { 2840 goto err; 2841 } 2842 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2843 2844 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2845 goto err; 2846 } 2847 2848 if (!mlxcx_cmd_init_hca(mlxp)) { 2849 goto err; 2850 } 2851 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2852 2853 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2854 goto err; 2855 } 2856 2857 if (mlxp->mlx_caps->mlc_pcam) { 2858 mlxcx_explore_pcam(mlxp, mlxp->mlx_caps); 2859 } 2860 2861 /* 2862 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2863 * doorbells. 2864 */ 2865 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2866 goto err; 2867 } 2868 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2869 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2870 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2871 } 2872 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2873 2874 /* 2875 * Set up asynchronous event queue which handles control type events 2876 * like PAGE_REQUEST and CMD completion events. 2877 * 2878 * This will enable and arm the interrupt on EQ 0. Note that only page 2879 * reqs and cmd completions will be handled until we call 2880 * mlxcx_eq_set_attached further down (this way we don't need an extra 2881 * set of locks over the mlxcx_t sub-structs not allocated yet) 2882 */ 2883 if (!mlxcx_setup_async_eqs(mlxp)) { 2884 goto err; 2885 } 2886 2887 /* 2888 * Allocate a protection and transport domain. These don't really do 2889 * anything for us (they're IB concepts), but we need to give their 2890 * ID numbers in other commands. 2891 */ 2892 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2893 goto err; 2894 } 2895 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2896 goto err; 2897 } 2898 /* 2899 * Fetch the "reserved" lkey that lets us give linear addresses in 2900 * work queue entries, rather than having to mess with the NIC's 2901 * internal MMU. 2902 */ 2903 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2904 goto err; 2905 } 2906 2907 /* 2908 * Query our port information and current state, populate the 2909 * mlxcx_port_t structs. 
2910 * 2911 * This also sets up the root flow tables and flow groups. 2912 */ 2913 if (!mlxcx_setup_ports(mlxp)) { 2914 goto err; 2915 } 2916 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2917 2918 mlxcx_load_model_props(mlxp); 2919 2920 /* 2921 * Set up, enable and arm the rest of the interrupt EQs which will 2922 * service events from CQs. 2923 * 2924 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2925 * cleaned up. 2926 */ 2927 if (!mlxcx_setup_eqs(mlxp)) { 2928 goto err; 2929 } 2930 2931 /* Completion queues */ 2932 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2933 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2934 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2935 2936 /* Work queues (send queues, receive queues) */ 2937 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2938 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2939 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2940 2941 /* 2942 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2943 * "groups" we advertise to MAC. 2944 */ 2945 mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp); 2946 mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups * 2947 sizeof (mlxcx_ring_group_t); 2948 mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP); 2949 2950 mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups; 2951 mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups * 2952 sizeof (mlxcx_ring_group_t); 2953 mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP); 2954 2955 mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS; 2956 2957 /* 2958 * Sets up the free/busy buffers list for keeping track of packet 2959 * buffers. 2960 */ 2961 if (!mlxcx_setup_bufs(mlxp)) 2962 goto err; 2963 mlxp->mlx_attach |= MLXCX_ATTACH_BUFS; 2964 2965 /* 2966 * Before we tell MAC about our rings/groups, we need to do enough 2967 * setup on them to be sure about the numbers and configuration that 2968 * we have. This will do basically everything short of allocating 2969 * packet buffers and starting the rings up. 2970 */ 2971 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 2972 if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i])) 2973 goto err; 2974 } 2975 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 2976 if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i])) 2977 goto err; 2978 } 2979 2980 /* 2981 * Set up periodic fault check timers which check the queue states, 2982 * set up should be after all the queues have been initialized and 2983 * consequently the teardown of timers must happen before 2984 * queue teardown. 2985 */ 2986 if (!mlxcx_setup_checktimers(mlxp)) { 2987 goto err; 2988 } 2989 mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS; 2990 2991 /* 2992 * Some devices may not have a working temperature sensor; however, 2993 * there isn't a great way for us to know. We shouldn't fail attach if 2994 * this doesn't work. 2995 */ 2996 if (mlxcx_setup_sensors(mlxp)) { 2997 mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS; 2998 } 2999 3000 /* 3001 * Finally, tell MAC that we exist! 3002 */ 3003 if (!mlxcx_register_mac(mlxp)) { 3004 goto err; 3005 } 3006 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL; 3007 3008 /* 3009 * This tells the interrupt handlers they can start processing events 3010 * other than cmd completions and page requests. 
3011 */ 3012 mlxcx_eq_set_attached(mlxp); 3013 3014 return (DDI_SUCCESS); 3015 3016 err: 3017 mlxcx_teardown(mlxp); 3018 return (DDI_FAILURE); 3019 } 3020 3021 static struct cb_ops mlxcx_cb_ops = { 3022 .cb_open = nulldev, 3023 .cb_close = nulldev, 3024 .cb_strategy = nodev, 3025 .cb_print = nodev, 3026 .cb_dump = nodev, 3027 .cb_read = nodev, 3028 .cb_write = nodev, 3029 .cb_ioctl = nodev, 3030 .cb_devmap = nodev, 3031 .cb_mmap = nodev, 3032 .cb_segmap = nodev, 3033 .cb_chpoll = nochpoll, 3034 .cb_prop_op = ddi_prop_op, 3035 .cb_flag = D_MP, 3036 .cb_rev = CB_REV, 3037 .cb_aread = nodev, 3038 .cb_awrite = nodev 3039 }; 3040 3041 static struct dev_ops mlxcx_dev_ops = { 3042 .devo_rev = DEVO_REV, 3043 .devo_refcnt = 0, 3044 .devo_getinfo = NULL, 3045 .devo_identify = nulldev, 3046 .devo_probe = nulldev, 3047 .devo_attach = mlxcx_attach, 3048 .devo_detach = mlxcx_detach, 3049 .devo_reset = nodev, 3050 .devo_quiesce = ddi_quiesce_not_supported, 3051 .devo_cb_ops = &mlxcx_cb_ops 3052 }; 3053 3054 static struct modldrv mlxcx_modldrv = { 3055 .drv_modops = &mod_driverops, 3056 .drv_linkinfo = "Mellanox Connect-X 4/5/6", 3057 .drv_dev_ops = &mlxcx_dev_ops 3058 }; 3059 3060 static struct modlinkage mlxcx_modlinkage = { 3061 .ml_rev = MODREV_1, 3062 .ml_linkage = { &mlxcx_modldrv, NULL } 3063 }; 3064 3065 int 3066 _init(void) 3067 { 3068 int ret; 3069 3070 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0); 3071 if (ret != 0) { 3072 return (ret); 3073 } 3074 3075 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME); 3076 3077 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) { 3078 mac_fini_ops(&mlxcx_dev_ops); 3079 ddi_soft_state_fini(&mlxcx_softstate); 3080 return (ret); 3081 } 3082 3083 return (DDI_SUCCESS); 3084 } 3085 3086 int 3087 _info(struct modinfo *modinfop) 3088 { 3089 return (mod_info(&mlxcx_modlinkage, modinfop)); 3090 } 3091 3092 int 3093 _fini(void) 3094 { 3095 int ret; 3096 3097 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 3098 return (ret); 3099 } 3100 3101 mac_fini_ops(&mlxcx_dev_ops); 3102 3103 ddi_soft_state_fini(&mlxcx_softstate); 3104 3105 return (DDI_SUCCESS); 3106 } 3107