/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright 2023 RackTop Systems, Inc.
 * Copyright 2023 MNX Cloud, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

/*
 * The PRM for this family of parts was freely available at:
 *
 * https://www.mellanox.com/related-docs/user_manuals/ \
 *   Ethernet_Adapters_Programming_Manual.pdf
 *
 * but has since disappeared.
 */
/*
 * ConnectX glossary
 * -----------------
 *
 * WR		Work Request: something we've asked the hardware to do by
 *		creating a Work Queue Entry (WQE), e.g. send or recv a packet.
 *
 * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring.
 *
 * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
 *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
 *		types have different WQE structures, different commands for
 *		creating and destroying them, etc, but share a common context
 *		structure, counter setup and state graph.
 * SQ		Send Queue, a specific type of WQ that sends packets.
 * RQ		Receive Queue, a specific type of WQ that receives packets.
 *
 * CQ		Completion Queue: completion of WRs from a WQ is reported to
 *		one of these, as a CQE on its entry ring.
 * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
 *		info, as well as packet size, the ID of the WQ, and the index
 *		of the WQE which completed. Does not contain any packet data.
 *
 * EQ		Event Queue: a ring of event structs from the hardware informing
 *		us when particular events happen. Many events can point at a
 *		particular CQ which we should then go look at.
 * EQE		Event Queue Entry: an entry on the EQ ring.
 *
 * UAR		User Access Region, a page of the device's PCI BAR which is
 *		tied to particular EQ/CQ/WQ sets and contains doorbells to
 *		ring to arm them for interrupts or wake them up for new work.
 *
 * RQT		RQ Table, a collection of indexed RQs used to refer to the group
 *		as a single unit (for e.g. hashing/RSS).
 *
 * TIR		Transport Interface Receive, a bucket of resources for the
 *		reception of packets. TIRs have to point at either a single RQ
 *		or a table of RQs (RQT). They then serve as a target for flow
 *		table entries (FEs). TIRs that point at an RQT also contain the
 *		settings for hashing for RSS.
 *
 * TIS		Transport Interface Send, a bucket of resources associated with
 *		the transmission of packets. In particular, the temporary
 *		resources used for LSO internally in the card are accounted to
 *		a TIS.
 *
 * FT		Flow Table, a collection of FEs and FGs that can be referred to
 *		as a single entity (e.g. used as a target from another flow
 *		entry or set as the "root" table to handle incoming or outgoing
 *		packets). Packets arriving at an FT are matched against the
 *		FEs in the table until either one matches with a terminating
 *		action or all FEs are exhausted (it's first-match-wins but with
 *		some actions that are non-terminal, like counting actions).
 *
 * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
 *		they match on the same attributes of packets coming into the
 *		flow).
 *
 * FE		Flow Entry, an individual set of values to match against
 *		packets entering the flow table, combined with an action to
 *		take upon a successful match. The action we use most is
 *		"forward", which sends the packets to a TIR or another flow
 *		table and then stops further processing within the FE's FT.
 *
 * lkey/mkey	A reference to something similar to a page table but in the
 *		device's internal onboard MMU. Since Connect-X parts double as
 *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
 *		features which we try very hard not to use. For our WQEs we use
 *		the "reserved" lkey, which is a special value which indicates
 *		that addresses we give are linear addresses and should not be
 *		translated.
 *
 * PD		Protection Domain, an IB concept. We have to allocate one to
 *		provide as a parameter for new WQs, but we don't do anything
 *		with it.
 *
 * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
 *		provide it as a parameter to TIR/TIS creation, but we don't do
 *		anything with it.
 */
/*
 *
 * Data flow overview
 * ------------------
 *
 * This driver is a MAC ring-enabled driver which maps rings to send and recv
 * queues in hardware on the device.
 *
 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
 * sufficient space, and to simplify the logic needed to work out which buffer
 * was completed.
 *
 * The CQs are then round-robin allocated onto EQs, of which we set up one per
 * interrupt that the system gives us for the device. Normally this means we
 * have 8 EQs.
 *
 * When we have >= 8 EQs available, we try to allocate only RX or only TX
 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
 *
 * EQ #0 is reserved for all event types other than completion events, and has
 * no CQs associated with it at any time. EQs #1 and upwards are only used for
 * handling CQ completion events.
 *
 *   +------+     +------+           +------+        +---------+
 *   | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |  mlxcx_intr_0
 *   +------+     +------+     |     +------+        +---------+
 *                             |
 *   +------+     +------+     |
 *   | SQ 1 |---->| CQ 1 |---+ |     +------+
 *   +------+     +------+   | +---> |      |
 *                           |       |      |
 *   +------+     +------+   |       | EQ 1 |        +---------+
 *   | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |  mlxcx_intr_n
 *   +------+     +------+   | +---> |      |        +---------+
 *                           | |     +------+
 *                           | |
 *     ...                   | |
 *                           | |     +------+
 *   +------+     +------+   +-----> |      |
 *   | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
 *   +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |  mlxcx_intr_n
 *                             |     |      |        +---------+
 *   +------+     +------+     | +-> |      |
 *   | RQ 1 |---->| CQ 4 |-----+ |   +------+
 *   +------+     +------+       |
 *                               |     ....
 *   +------+     +------+       |
 *   | RQ 2 |---->| CQ 5 |-------+
 *   +------+     +------+
 *
 * ... (note this diagram does not show RX-only or TX-only EQs)
 *
 * For TX, we advertise all of the SQs we create as plain rings to MAC with
 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
 * and use the rings as it sees fit.
 *
 * For RX, we advertise actual groups in order to make use of hardware
 * classification.
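 *
 * As a rough sketch of how completions then flow back up (this is not
 * the driver's literal interrupt path; the "example_" helpers here are
 * hypothetical), each completion EQE names the CQ that needs service,
 * and the handler demultiplexes from there:
 *
 *	static uint_t
 *	example_intr_n(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
 *	{
 *		mlxcx_eventq_ent_t *eqe;
 *
 *		while ((eqe = example_eq_next(mleq)) != NULL) {
 *			// A completion EQE carries a CQ number; find
 *			// that CQ and process its new CQEs.
 *			mlxcx_completion_queue_t *mlcq =
 *			    example_cq_lookup(mlxp, eqe);
 *			if (mlcq != NULL)
 *				example_process_cq(mlxp, mlcq);
 *		}
 *		example_arm_eq(mlxp, mleq);	// re-arm via its doorbell
 *		return (DDI_INTR_CLAIMED);
 *	}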
 *
 * The hardware classification we use is based around Flow Tables, and we
 * currently ignore all of the eswitch features of the card. The NIC VPORT
 * is always set to promisc mode so that the eswitch sends us all of the
 * traffic that arrives on the NIC, and we use flow entries to manage
 * everything.
 *
 * We use 2 layers of flow tables for classification: traffic arrives at the
 * root RX flow table which contains MAC address filters. Those then send
 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
 * presence and VID filters.
 *
 * Since these parts only support doing RSS hashing on a single protocol at a
 * time, we have to use a third layer of flow tables as well to break traffic
 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
 * so that it can be sent to the appropriate TIR for hashing.
 *
 *   Incoming packets
 *         +            +---------+      +---------+
 *         |         +->| group 0 |      | group 0 |
 *         |         |  | vlan ft |  +-->| hash ft |
 *         v         |  |   L1    |  |   |   L2    |
 *    +----+----+    |  +---------+  |   +---------+    +-----+    +-----+------+
 *    | eswitch |    |  |         |  |   |  TCPv6  |--->| TIR |--->|     | RQ0  |
 *    +----+----+    |  |         |  |   +---------+    +-----+    |     +------+
 *         |         |  |         |  |   |  UDPv6  |--->| TIR |--->|     | RQ1  |
 *         |         |  |         |  |   +---------+    +-----+    |     +------+
 *         |         |  |         |  |   |  TCPv4  |--->| TIR |--->|     | RQ2  |
 *         v         |  |         |  |   +---------+    +-----+    | RQT +------+
 *    +----+----+    |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
 *    | root rx |    |  | default |--+   +---------+    +-----+    |     |      |
 *    | flow tb |    |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
 *    |   L0    |    |  | promisc |--+   +---------+    +-----+    |     |      |
 *    +---------+    |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
 *    |  bcast  |----|---------------+   +---------+    +-----+    +-----+------+
 *    +---------+    |               ^   |  other  |-+
 *    |  MAC 0  |----+               |   +---------+ |  +-----+    +-----+
 *    +---------+                    |               +->| TIR |--->| RQ0 |
 *    |  MAC 1  |-+                  |                  +-----+    +-----+
 *    +---------+ |  +---------------+
 *    |  MAC 2  |-+  |               ^
 *    +---------+ |  |               |
 *    |  MAC 3  |-+  |  +---------+  |   +---------+
 *    +---------+ |  |  | group 1 |  |   | group 1 |
 *    |  .....  | +---->| vlan ft |  |  +>| hash ft |
 *    |         |    |  |   L1    |  |  | |   L2    |
 *    +---------+    |  +---------+  |  | +---------+    +-----+    +-----+------+
 *    | promisc |----+  | VLAN 0  |--|--+ |  TCPv6  |--->| TIR |--->|     | RQ3  |
 *    +---------+       +---------+  |    +---------+    +-----+    |     +------+
 *    |  .....  |                    |    |  UDPv6  |--->| TIR |--->|     | RQ4  |
 *    |         |                    |    +---------+    +-----+    |     +------+
 *    |         |                    |    |  TCPv4  |--->| TIR |--->|     | RQ5  |
 *    |         |                    |    +---------+    +-----+    | RQT +------+
 *    +---------+                    |    |  UDPv4  |--->| TIR |--->|     |  ... |
 *    |         |                    |    +---------+    +-----+    |     |      |
 *    +---------+                    |    |  IPv6   |--->| TIR |--->|     |      |
 *    | promisc |--------------------+    +---------+    +-----+    |     |      |
 *    +---------+                         |  IPv4   |--->| TIR |--->|     |      |
 *                                        +---------+    +-----+    +-----+------+
 *                                        |  other  |-+
 *                                        +---------+ |
 *        .......                                     |  +-----+    +-----+
 *                                                    +->| TIR |--->| RQ3 |
 *                                                       +-----+    +-----+
 *
 * Note that the "promisc" flow entries are only set/enabled when promisc
 * mode is enabled for the NIC. All promisc flow entries point directly at
 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
 * the "default group" in MAC).
 *
 * The "default" entry in the L1 VLAN filter flow tables is used when there
 * are no VLANs set for the group, to accept any traffic regardless of tag. It
 * is deleted as soon as a VLAN filter is added (and reinstated if the
 * last VLAN filter is removed).
 *
 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
 * space for packet data (they're a collection of scatter pointers only). TX
 * descriptors contain some space for "inline headers" (and the card requires
 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
 * but all the rest of the data comes from the gather pointers.
 *
 * When we get completions back they simply contain the ring index number of
 * the WR (work request) which completed. So, we manage the buffers for actual
 * packet data completely independently of the descriptors in this driver. When
 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
 * with the WQE index that we put it at, and therefore don't have to look at
 * the original descriptor at all when handling completions.
 *
 * For RX, we create sufficient packet data buffers to fill 150% of the
 * available descriptors for each ring. These are all pre-set up for DMA and
 * have an mblk_t associated with them (with desballoc()).
 *
 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
 * large enough), or we copy it into a pre-allocated buffer set up in the same
 * way as for RX.
 */

/*
 * Buffer lifecycle: RX
 * --------------------
 *
 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
 * straightforward.
 *
 * It is created (and has all its memory allocated) at the time of starting up
 * the RX ring it belongs to. Then it is placed on the "free" list in the
 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
 * before making a WQE for it.
 *
 * After a completion event occurs, the packet is either discarded (and the
 * buffer_t returned to the free list), or it is readied for loaning to MAC
 * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
 *
 * Once MAC and the rest of the system have finished with the packet, they call
 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
 * the fate of the buffer_t is determined by the state of the
 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
 * will be returned to the free list, potentially to be recycled and used
 * again. But if the shard is draining (e.g. after a ring stop) there will be
 * no recycling and the buffer_t is immediately destroyed.
 *
 * At detach/teardown time, buffers are only ever destroyed from the free list.
 *
 *
 *                          +
 *                          |
 *                          | mlxcx_buf_create
 *                          |
 *                          v
 *                     +----+----+
 *                     | created |
 *                     +----+----+                       +------+
 *                          |                            | dead |
 *                          |                            +------+
 *                          | mlxcx_buf_return               ^
 *                          |                                |
 *                          v                                | mlxcx_buf_destroy
 *  mlxcx_buf_destroy  +----+----+          +-----------+    |
 *        +------------|   free  |<------no-| draining? |-yes-+
 *        |            +----+----+          +-----------+
 *        |                 |  ^
 *        |                 |  |
 *        v                 |  | mlxcx_buf_take        | mlxcx_buf_return
 *    +---+--+              v  |                       |
 *    | dead |          +---+---+                      |
 *    +------+          | on WQ |- - - - - - - - - - ->O
 *                      +---+---+                      ^
 *                          |                          |
 *                          |                          |
 *                          | mlxcx_buf_loan           | mlxcx_buf_mp_return
 *                          v                          |
 *               +----------+-----+                    |
 *               | on loan to MAC |------------------->O
 *               +----------------+      freemsg()
 *
 */
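/*
 * To make the stamping scheme above concrete, here is a minimal sketch
 * (with hypothetical field names -- only desballoc() and its frtn_t
 * callback mechanism are standard DDI) of how an RX buffer can be tied
 * to an mblk at creation time and stamped with its ring index at
 * enqueue time:
 *
 *	// At buffer creation: wrap the DMA memory in an mblk whose
 *	// free function hands the buffer back to its shard rather
 *	// than freeing the memory.
 *	b->mlb_frtn.free_func = mlxcx_buf_mp_return;
 *	b->mlb_frtn.free_arg = (caddr_t)b;
 *	b->mlb_mp = desballoc((unsigned char *)b->mlb_dma.mxdb_va,
 *	    b->mlb_dma.mxdb_len, 0, &b->mlb_frtn);
 *
 *	// At enqueue: record which WQE slot the buffer sits in, so a
 *	// completion (which reports only that index) can find the
 *	// buffer without re-reading the descriptor ring.
 *	b->mlb_wqe_index = wqe_index;
 */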
/*
 * Buffer lifecycle: TX
 * --------------------
 *
 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
 * "foreign" buffers.
 *
 * The former have their memory allocated and DMA bound by this driver, while
 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
 * not owned by us, though we do DMA bind it (and take responsibility for
 * un-binding it when we're done with them).
 *
 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
 * SQ. Thus, there is a separate free list and mutex for each kind.
 *
 * Since a TX packet might consist of multiple mblks, we translate each mblk
 * into exactly one buffer_t. The buffer_ts are chained together in the same
 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
 *
 * Each chain of TX buffers may consist of foreign or driver buffers, in any
 * mixture.
 *
 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
 * it from the rest of the chain buffers.
 *
 * TX buffer chains are always returned to the free list by
 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
 * freeing all of the members.
 *
 * We only call freemsg() once, on the head of the TX buffer chain's original
 * mblk. This is true whether we copied it or bound it in a foreign buffer.
 */
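/*
 * As a minimal sketch of the chain walk described above (illustrative
 * only; the real logic is mlxcx_buf_return_chain()), the head buffer's
 * mlb_tx_chain holds the rest of the chain:
 *
 *	void
 *	example_buf_return_chain(mlxcx_t *mlxp, mlxcx_buffer_t *b0)
 *	{
 *		mlxcx_buffer_t *b;
 *
 *		ASSERT3P(b0->mlb_tx_head, ==, b0);	// heads only
 *		while ((b = list_remove_head(&b0->mlb_tx_chain)) != NULL)
 *			mlxcx_buf_return(mlxp, b);
 *		mlxcx_buf_return(mlxp, b0);
 *	}
 */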
/*
 * Startup and command interface
 * -----------------------------
 *
 * The command interface is the primary way in which we give control orders to
 * the hardware (e.g. actions like "create this queue" or "delete this flow
 * entry"). The command interface is never used to transmit or receive packets
 * -- that takes place only on the queues that are set up through it.
 *
 * In mlxcx_cmd.c we implement our use of the command interface on top of a
 * simple taskq. As commands are submitted from the taskq they choose a
 * "slot"; if there are no free slots, execution of the command is paused
 * until one becomes free. The hardware permits up to 32 independent slots
 * for concurrent command execution.
 *
 * Before interrupts are enabled, command completion is polled; once
 * interrupts are up, command completions become asynchronous and are wired
 * to EQ 0. A caveat to this is that commands cannot be submitted directly
 * from EQ 0's completion handler, so any processing resulting from an
 * asynchronous event which requires further use of the command interface
 * is posted through a taskq.
 *
 * The startup/attach process for this card involves a bunch of different steps
 * which are summarised pretty well in the PRM. We have to send a number of
 * commands which do different things to start the card up, give it some pages
 * of our own memory for it to use, then start creating all the entities that
 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
 * and TDoms.
 */
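/*
 * As an illustrative sketch of the slot discipline described above (the
 * struct and field names here are hypothetical; the real implementation
 * lives in mlxcx_cmd.c), a taskq worker might claim one of the 32 slots
 * like this:
 *
 *	mutex_enter(&cmdq->mcq_mtx);
 *	while (cmdq->mcq_free_slots == 0)	// all 32 slots busy
 *		cv_wait(&cmdq->mcq_slot_cv, &cmdq->mcq_mtx);
 *	slot = ddi_ffs(cmdq->mcq_free_slots) - 1;
 *	cmdq->mcq_free_slots &= ~(1UL << slot);
 *	mutex_exit(&cmdq->mcq_mtx);
 *
 * The command is then written into that slot's entry and the doorbell
 * rung; completion arrives by polling or via EQ 0 as described above.
 */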
/*
 * UARs
 * ----
 *
 * The pages of the PCI BAR other than the first few are reserved for use as
 * "UAR" sections in this device. Each UAR section can be used as a set of
 * doorbells for our queues.
 *
 * Currently we just make one single UAR for all of our queues. It doesn't
 * seem to be a major limitation yet.
 *
 * When we're sending packets through an SQ, the PRM is not awfully clear
 * about exactly how we're meant to use the first 16 bytes of the Blueflame
 * buffers (it's clear on the pattern of alternation you're expected to use
 * between even and odd for Blueflame sends, but not for regular doorbells).
 *
 * Currently we don't do the even-odd alternating pattern for ordinary
 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
 * least on Connect-X4 Lx.
 */

/*
 * Lock ordering
 * -------------
 *
 * Interrupt side:
 *
 *  - mleq_mtx
 *  - mlcq_arm_mtx
 *  - mlcq_mtx
 *  - mlcq_bufbmtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlp_mtx
 *
 * GLD side:
 *
 *  - mlp_mtx
 *  - mlg_mtx
 *  - mlg_*.mlft_mtx
 *  - mlp_*.mlft_mtx
 *  - mlwq_mtx
 *  - mlbs_mtx
 *  - mlcq_bufbmtx
 *  - mleq_mtx
 *  - mlcq_arm_mtx
 *  - mlcq_mtx
 *
 */
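/*
 * For example (illustrative only), a GLD-side entry point that needs a
 * port and then one of its groups must take mlp_mtx before mlg_mtx,
 * matching the GLD-side ordering above:
 *
 *	mutex_enter(&port->mlp_mtx);
 *	mutex_enter(&g->mlg_mtx);
 *	... do the work ...
 *	mutex_exit(&g->mlg_mtx);
 *	mutex_exit(&port->mlp_mtx);
 */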
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/time.h>
#include <sys/pci.h>
#include <sys/mac_provider.h>

#include <mlxcx.h>

CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);

#define	MLXCX_MODULE_NAME	"mlxcx"
/*
 * We give this to the firmware, so it has to be in a fixed format that it
 * understands.
 */
#define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"

/*
 * Firmware may take a while to reclaim pages. Try a set number of times.
 */
clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */

static void *mlxcx_softstate;

/*
 * Fault detection thresholds.
 */
uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;

static void
mlxcx_load_prop_defaults(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;
	mlxcx_port_t *port = &mlxp->mlx_ports[0];

	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);

	/*
	 * Currently we have different queue size defaults for two
	 * categories of devices: one set for devices which support a
	 * maximum speed of 10Gb/s, and another for those above that.
	 */
	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 ||
	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G |
	    MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G |
	    MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) {
		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
	    MLXCX_PROTO_10G)) != 0 ||
	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M |
	    MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) {
		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
	} else {
		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
		    "recognize. Proto: 0x%x", port->mlp_max_proto);
		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
	}
}

/*
 * Properties which may have different defaults based on hardware
 * characteristics.
 */
static void
mlxcx_load_model_props(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;

	mlxcx_load_prop_defaults(mlxp);

	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
	    p->mldp_cq_size_shift_default);
	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
	    p->mldp_sq_size_shift_default);
	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
	    p->mldp_rq_size_shift_default);
}

static void
mlxcx_load_props(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;

	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
	    MLXCX_EQ_SIZE_SHIFT_DFLT);
	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
	    MLXCX_CQEMOD_COUNT_DFLT);
	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);

	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
	    MLXCX_TX_NGROUPS_DFLT);
	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);

	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
	    MLXCX_RX_NGROUPS_LARGE_DFLT);
	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
	    MLXCX_RX_NGROUPS_SMALL_DFLT);
	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);

	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "ftbl_root_size_shift", MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);

	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
	    MLXCX_TX_BIND_THRESHOLD_DFLT);

	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "ftbl_vlan_size_shift", MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);

	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);

	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
	    MLXCX_RX_PER_CQ_DEFAULT);

	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
		    "out of range. Defaulting to: %d. Valid values are from "
		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
	}
}

void
mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
	} else {
		vcmn_err(CE_NOTE, fmt, ap);
	}
	va_end(ap);
}

void
mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
	} else {
		vcmn_err(CE_WARN, fmt, ap);
	}
	va_end(ap);
}
void
mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
	} else {
		vcmn_err(CE_PANIC, fmt, ap);
	}
	va_end(ap);
}

uint16_t
mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
}

uint32_t
mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
}

uint64_t
mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
}

void
mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
{
	/*
	 * The UAR is always inside the first BAR, which we mapped as
	 * mlx_regs.
	 */
	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
	    (uintptr_t)mlxp->mlx_regs_base;
	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
{
	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
	    (uintptr_t)mlxp->mlx_regs_base;
	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
}
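/*
 * These two helpers are how queue doorbells are rung. As an
 * illustrative example (the offset macro and the doorbell layout shown
 * here are assumptions, not confirmed by this file), arming an EQ for
 * interrupts could be a single 32-bit write of the EQ number and its
 * consumer counter to that EQ's arm doorbell within our UAR:
 *
 *	mlxcx_uar_put32(mlxp, &mlxp->mlx_uar, MLXCX_UAR_EQ_ARM,
 *	    (mleq->mleq_num << 24) | (mleq->mleq_cc & 0xffffff));
 */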
static void
mlxcx_fm_fini(mlxcx_t *mlxp)
{
	if (mlxp->mlx_fm_caps == 0)
		return;

	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
		ddi_fm_handler_unregister(mlxp->mlx_dip);

	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
		pci_ereport_teardown(mlxp->mlx_dip);

	ddi_fm_fini(mlxp->mlx_dip);

	mlxp->mlx_fm_caps = 0;
}

void
mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    NULL);
}

static int
mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
{
	/*
	 * As the driver can always deal with an error in any dma or
	 * access handle, we can just return the fme_status value.
	 */
	pci_ereport_post(dip, err, NULL);
	return (err->fme_status);
}

static void
mlxcx_fm_init(mlxcx_t *mlxp)
{
	ddi_iblock_cookie_t iblk;
	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;

	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_DONTPASS, "fm_capable", def);

	if (mlxp->mlx_fm_caps < 0) {
		mlxp->mlx_fm_caps = 0;
	}
	mlxp->mlx_fm_caps &= def;

	if (mlxp->mlx_fm_caps == 0)
		return;

	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		pci_ereport_setup(mlxp->mlx_dip);
	}
	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
		    (void *)mlxp);
	}
}

static void
mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
{
	mlxcx_buffer_t *buf;

	mutex_enter(&s->mlbs_mtx);

	while (!list_is_empty(&s->mlbs_busy))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	while (!list_is_empty(&s->mlbs_loaned))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	while ((buf = list_head(&s->mlbs_free)) != NULL)
		mlxcx_buf_destroy(mlxp, buf);

	list_destroy(&s->mlbs_free);
	list_destroy(&s->mlbs_busy);
	list_destroy(&s->mlbs_loaned);
	mutex_exit(&s->mlbs_mtx);

	cv_destroy(&s->mlbs_free_nonempty);
	mutex_destroy(&s->mlbs_mtx);
}

static void
mlxcx_teardown_bufs(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
		mlxcx_mlbs_teardown(mlxp, s);
		kmem_free(s, sizeof (mlxcx_buf_shard_t));
	}
	list_destroy(&mlxp->mlx_buf_shards);

	kmem_cache_destroy(mlxp->mlx_bufs_cache);
}

static void
mlxcx_teardown_pages(mlxcx_t *mlxp)
{
	uint_t nzeros = 0;
	uint64_t *pas;

	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
	    KM_SLEEP);

	mutex_enter(&mlxp->mlx_pagemtx);

	while (mlxp->mlx_npages > 0) {
		int32_t req, ret;

		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
			mlxcx_warn(mlxp, "hardware refused to return pages, "
			    "leaking %u remaining pages", mlxp->mlx_npages);
			goto out;
		}

		for (int32_t i = 0; i < ret; i++) {
			mlxcx_dev_page_t *mdp, probe;
			bzero(&probe, sizeof (probe));
			probe.mxdp_pa = pas[i];

			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

			if (mdp != NULL) {
				avl_remove(&mlxp->mlx_pages, mdp);
				mlxp->mlx_npages--;
				mlxcx_dma_free(&mdp->mxdp_dma);
				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			} else {
				mlxcx_panic(mlxp, "hardware returned a page "
				    "with PA 0x%" PRIx64 " but we have no "
				    "record of giving out such a page", pas[i]);
			}
		}

		/*
		 * If no pages were returned, note that fact.
		 */
		if (ret == 0) {
			nzeros++;
			if (nzeros > mlxcx_reclaim_tries) {
				mlxcx_warn(mlxp, "hardware refused to return "
				    "pages, leaking %u remaining pages",
				    mlxp->mlx_npages);
				goto out;
			}
			delay(drv_usectohz(mlxcx_reclaim_delay));
		}
	}

	avl_destroy(&mlxp->mlx_pages);

out:
	mutex_exit(&mlxp->mlx_pagemtx);
	mutex_destroy(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
}

static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);

	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
	mleq->mleq_nents = (1 << mleq->mleq_entshift);
	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate EQ memory");
		return (B_FALSE);
	}

	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;

	for (i = 0; i < mleq->mleq_nents; ++i)
		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;

	mleq->mleq_state |= MLXCX_EQ_ALLOC;

	return (B_TRUE);
}

static void
mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
	if (mleq->mleq_state & MLXCX_EQ_CREATED)
		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	mlxcx_dma_free(&mleq->mleq_dma);
	mleq->mleq_ent = NULL;

	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
}

void
mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	int i;

	ASSERT(mutex_owned(&ft->mlft_mtx));

	for (i = ft->mlft_nents - 1; i >= 0; --i) {
		fe = &ft->mlft_ent[i];
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
				mlxcx_panic(mlxp, "failed to delete flow "
				    "entry %u on table %u", i,
				    ft->mlft_num);
			}
		}
	}

	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
				mlxcx_panic(mlxp, "failed to destroy flow "
				    "group %u", fg->mlfg_num);
			}
		}
		kmem_free(fg, sizeof (mlxcx_flow_group_t));
	}
	list_destroy(&ft->mlft_groups);
	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
			mlxcx_panic(mlxp, "failed to destroy flow table %u",
			    ft->mlft_num);
		}
	}
	kmem_free(ft->mlft_ent, ft->mlft_entsize);
	ft->mlft_ent = NULL;
	mutex_exit(&ft->mlft_mtx);
	mutex_destroy(&ft->mlft_mtx);
	kmem_free(ft, sizeof (mlxcx_flow_table_t));
}

static void
mlxcx_teardown_ports(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		if (!(p->mlp_init & MLXCX_PORT_INIT))
			continue;
		mutex_enter(&p->mlp_mtx);
		if ((ft = p->mlp_rx_flow) != NULL) {
			mutex_enter(&ft->mlft_mtx);
			/*
			 * teardown_flow_table() will destroy the mutex, so
			 * we don't release it here.
			 */
			mlxcx_teardown_flow_table(mlxp, ft);
		}
		mutex_exit(&p->mlp_mtx);
		mutex_destroy(&p->mlp_mtx);
		mutex_destroy(&p->mlx_port_event.mla_mtx);
		p->mlx_port_event.mla_mlx = NULL;
		p->mlx_port_event.mla_port = NULL;
		p->mlp_init &= ~MLXCX_PORT_INIT;
	}

	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
	mlxp->mlx_ports = NULL;
}

static void
mlxcx_teardown_wqs(mlxcx_t *mlxp)
{
	mlxcx_work_queue_t *mlwq;

	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
		mlxcx_wq_teardown(mlxp, mlwq);
	}
	list_destroy(&mlxp->mlx_wqs);
}

static void
mlxcx_teardown_cqs(mlxcx_t *mlxp)
{
	mlxcx_completion_queue_t *mlcq;

	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
		mlxcx_cq_teardown(mlxp, mlcq);
	}
	list_destroy(&mlxp->mlx_cqs);
}

static void
mlxcx_teardown_eqs(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq;
	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
				mlxcx_warn(mlxp, "failed to destroy "
				    "event queue idx %u eqn %u",
				    i, mleq->mleq_num);
			}
		}
		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
			mlxcx_eq_rele_dma(mlxp, mleq);
		}
		mutex_exit(&mleq->mleq_mtx);
	}
}

static void
mlxcx_teardown_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
}

static void
mlxcx_teardown(mlxcx_t *mlxp)
{
	uint_t i;
	dev_info_t *dip = mlxp->mlx_dip;

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		/*
		 * Disable interrupts and let any active vectors quiesce.
		 */
		mlxcx_intr_disable(mlxp);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
		mlxcx_teardown_sensors(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
		mlxcx_teardown_checktimers(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
		mlxcx_teardown_groups(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
		mlxcx_teardown_wqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
		mlxcx_teardown_cqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
		mlxcx_teardown_bufs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
		mlxcx_teardown_ports(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		mlxcx_teardown_eqs(mlxp);
		mlxcx_intr_teardown(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
		if (mlxp->mlx_uar.mlu_allocated) {
			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
				mlxcx_warn(mlxp, "failed to release UAR");
			}
			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
		}
		if (mlxp->mlx_pd.mlpd_allocated &&
		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
			mlxcx_warn(mlxp, "failed to release PD");
		}
		if (mlxp->mlx_tdom.mltd_allocated &&
		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
			mlxcx_warn(mlxp, "failed to release TDOM");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
		if (!mlxcx_cmd_teardown_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send teardown HCA "
			    "command during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
		mlxcx_teardown_pages(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
			mlxp->mlx_npages_req[i].mla_mlx = NULL;
			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
		}
		taskq_destroy(mlxp->mlx_async_tq);
		mlxp->mlx_async_tq = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
		if (!mlxcx_cmd_disable_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
			    "during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
		mlxcx_cmd_queue_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
		mlxp->mlx_caps = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
		ddi_regs_map_free(&mlxp->mlx_regs_handle);
		mlxp->mlx_regs_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
		pci_config_teardown(&mlxp->mlx_cfg_handle);
		mlxp->mlx_cfg_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
		mlxcx_fm_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
	}

	VERIFY3S(mlxp->mlx_attach, ==, 0);
	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
	ddi_set_driver_private(dip, NULL);
}

static void
mlxcx_get_model(mlxcx_t *mlxp)
{
	uint16_t venid;
	uint16_t devid;

	venid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_VENID);
	if (venid != MLXCX_VENDOR_ID) {
		/* Currently, all supported cards have a Mellanox vendor id. */
		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
		return;
	}

	devid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_DEVID);
	switch (devid) {
	case MLXCX_CX4_DEVID:
	case MLXCX_CX4_VF_DEVID:
	case MLXCX_CX4_LX_VF_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX4;
		break;
	case MLXCX_CX5_DEVID:
	case MLXCX_CX5_VF_DEVID:
	case MLXCX_CX5_EX_DEVID:
	case MLXCX_CX5_EX_VF_DEVID:
	case MLXCX_CX5_GEN_VF_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX5;
		break;
	case MLXCX_CX6_DEVID:
	case MLXCX_CX6_VF_DEVID:
	case MLXCX_CX6_DF_DEVID:
	case MLXCX_CX6_LX_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX6;
		break;
	default:
		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
	}
}

static boolean_t
mlxcx_regs_map(mlxcx_t *mlxp)
{
	off_t memsize;
	int ret;
	ddi_device_acc_attr_t da;

	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to get register set size");
		return (B_FALSE);
	}

	/*
	 * All data in the main BAR is kept in big-endian even though it's a
	 * PCI device.
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static boolean_t
mlxcx_check_issi(mlxcx_t *mlxp)
{
	uint32_t issi;

	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
		mlxcx_warn(mlxp, "failed to get ISSI");
		return (B_FALSE);
	}

	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
		mlxcx_warn(mlxp, "failed to set ISSI to %u",
		    MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	return (B_TRUE);
}

boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		*ngiven = 0;
		return (B_TRUE);
	}

	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, give them to the
	 * hardware in chunks.
	 */
	for (i = 0; i < npages; i++) {
		pages[i] = list_remove_head(&plist);
	}

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", npages);
		for (i = 0; i < npages; i++) {
			list_insert_tail(&plist, pages[i]);
		}
		goto cleanup_npages;
	}

	mutex_enter(&mlxp->mlx_pagemtx);
	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	list_destroy(&plist);
	kmem_free(pages, sizeof (*pages) * npages);

	*ngiven = npages;

	return (B_TRUE);

cleanup_npages:
	kmem_free(pages, sizeof (*pages) * npages);
	while ((mdp = list_remove_head(&plist)) != NULL) {
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	list_destroy(&plist);
	return (B_FALSE);
}

static boolean_t
mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
{
	int32_t npages, given;

	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
		mlxcx_warn(mlxp, "failed to determine boot pages");
		return (B_FALSE);
	}

	while (npages > 0) {
		if (!mlxcx_give_pages(mlxp, npages, &given))
			return (B_FALSE);

		npages -= given;
	}

	return (B_TRUE);
}

static int
mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
{
	mlxcx_t *mlxp = cookie;
	mlxcx_buffer_t *b = arg;

	bzero(b, sizeof (mlxcx_buffer_t));
	b->mlb_mlx = mlxp;
	b->mlb_state = MLXCX_BUFFER_INIT;
	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));

	return (0);
}

static void
mlxcx_bufs_cache_destr(void *arg, void *cookie)
{
	mlxcx_t *mlxp = cookie;
	mlxcx_buffer_t *b = arg;
	VERIFY3P(b->mlb_mlx, ==, mlxp);
	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
	list_destroy(&b->mlb_tx_chain);
}

mlxcx_buf_shard_t *
mlxcx_mlbs_create(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);

	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);

	list_insert_tail(&mlxp->mlx_buf_shards, s);

	return (s);
}

static boolean_t
mlxcx_setup_bufs(mlxcx_t *mlxp)
{
	char namebuf[KSTAT_STRLEN];

	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
	    ddi_get_instance(mlxp->mlx_dip));
	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
	    NULL, mlxp, NULL, 0);

	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
	    offsetof(mlxcx_buf_shard_t, mlbs_entry));

	return (B_TRUE);
}

static void
mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
    const char *state, uint8_t statenum)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "state", DATA_TYPE_STRING, state,
	    "state_num", DATA_TYPE_UINT8, statenum,
	    "qtype", DATA_TYPE_STRING, qtype,
	    "qnum", DATA_TYPE_UINT32, qnum,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}

/*
 * The following set of routines is for monitoring the health of
 * event, completion and work queues. They run infrequently, peeking at
 * the structs to catch stalls and inconsistent state.
 *
 * They peek at the structs *without* acquiring locks - we don't want
 * to impede the flow of data. Driver start up and shutdown semantics
 * guarantee the structs are present and won't disappear underneath
 * these routines.
 *
 * As previously noted, the routines peek at active data in the structs and
 * store some values for comparison on the next invocation. To maintain
 * integrity of the saved values, these values are only modified within
 * these routines.
 */
static void
mlxcx_eq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *eq;
	mlxcx_eventq_ctx_t ctx;
	const char *str;

	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		eq = &mlxp->mlx_eqs[i];

		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
			continue;

		/*
		 * If the event queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);

		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
			continue;

		str = "???";
		switch (ctx.mleqc_status) {
		case MLXCX_EQ_STATUS_OK:
			break;
		case MLXCX_EQ_STATUS_WRITE_FAILURE:
			str = "WRITE_FAILURE";
			break;
		}

		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "event",
			    eq->mleq_num, str, ctx.mleqc_status);
			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
			    eq->mleq_intr_index, ctx.mleqc_status, str);
		}

		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
			    ++eq->mleq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "EQ %u isn't armed",
				    eq->mleq_intr_index);
			}
			eq->mleq_check_disarm_cc = eq->mleq_cc;
		} else {
			eq->mleq_check_disarm_cc = 0;
			eq->mleq_check_disarm_cnt = 0;
		}
	}
}

static void
mlxcx_cq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_completion_queue_t *cq;
	mlxcx_completionq_ctx_t ctx;
	const char *str, *type;
	uint_t v;

	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
	    cq = list_next(&mlxp->mlx_cqs, cq)) {

		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
			continue;

		/*
		 * If the completion queue was successfully created in the
		 * HCA, then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);

		if (cq->mlcq_fm_repd_qstate)
			continue;

		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
			continue;

		if (cq->mlcq_wq != NULL) {
			mlxcx_work_queue_t *wq = cq->mlcq_wq;
			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
				type = "rx ";
			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
				type = "tx ";
			else
				type = "";
		} else {
			type = "";
		}

		str = "???";
		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
		switch (v) {
		case MLXCX_CQC_STATUS_OK:
			break;
		case MLXCX_CQC_STATUS_OVERFLOW:
			str = "OVERFLOW";
			break;
		case MLXCX_CQC_STATUS_WRITE_FAIL:
			str = "WRITE_FAIL";
			break;
		case MLXCX_CQC_STATUS_INVALID:
			str = "INVALID";
			break;
		}

		if (v != MLXCX_CQC_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "completion",
			    cq->mlcq_num, str, v);
			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
			    type, cq->mlcq_num, v, str);
			cq->mlcq_fm_repd_qstate = B_TRUE;
		}

		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
		if (v != MLXCX_CQC_STATE_ARMED &&
		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
			    ++cq->mlcq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
				    type, cq->mlcq_num, cq);
			}
			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
		} else {
			cq->mlcq_check_disarm_cnt = 0;
			cq->mlcq_check_disarm_cc = 0;
		}
	}
}

void
mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
{
	mlxcx_sq_ctx_t ctx;
	mlxcx_sq_state_t state;

	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
		return;
	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
	switch (state) {
	case MLXCX_SQ_STATE_RST:
		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RST", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_RDY:
		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RDY", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "ERR", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "???", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}

void
mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
{
	mlxcx_rq_ctx_t ctx;
	mlxcx_rq_state_t state;

	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
		return;

	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
	switch (state) {
	case MLXCX_RQ_STATE_RST:
		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "receive",
			    rq->mlwq_num, "RST", state);
			rq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_RQ_STATE_RDY:
		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "receive",
			    rq->mlwq_num, "RDY", state);
			rq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_RQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "receive",
		    rq->mlwq_num, "ERR", state);
		rq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "receive",
		    rq->mlwq_num, "???", state);
		rq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}

static void
mlxcx_wq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_work_queue_t *wq;

	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
	    wq = list_next(&mlxp->mlx_wqs, wq)) {

		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
			continue;

		/*
		 * If the work queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
static void
mlxcx_wq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_work_queue_t *wq;

	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
	    wq = list_next(&mlxp->mlx_wqs, wq)) {

		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
			continue;

		/*
		 * If the work queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);

		if (wq->mlwq_fm_repd_qstate)
			continue;

		switch (wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_check_sq(mlxp, wq);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			mlxcx_check_rq(mlxp, wq);
			break;
		}
	}
}

static boolean_t
mlxcx_setup_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	return (B_TRUE);
}

int
mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
{
	const mlxcx_flow_entry_t *left = arg0;
	const mlxcx_flow_entry_t *right = arg1;
	int bcmpr;

	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
	    sizeof (left->mlfe_dmac));
	if (bcmpr < 0)
		return (-1);
	if (bcmpr > 0)
		return (1);
	if (left->mlfe_vid < right->mlfe_vid)
		return (-1);
	if (left->mlfe_vid > right->mlfe_vid)
		return (1);
	return (0);
}

int
mlxcx_grmac_compare(const void *arg0, const void *arg1)
{
	const mlxcx_group_mac_t *left = arg0;
	const mlxcx_group_mac_t *right = arg1;
	int bcmpr;

	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
	    sizeof (left->mlgm_mac));
	if (bcmpr < 0)
		return (-1);
	if (bcmpr > 0)
		return (1);
	return (0);
}

int
mlxcx_page_compare(const void *arg0, const void *arg1)
{
	const mlxcx_dev_page_t *p0 = arg0;
	const mlxcx_dev_page_t *p1 = arg1;

	if (p0->mxdp_pa < p1->mxdp_pa)
		return (-1);
	if (p0->mxdp_pa > p1->mxdp_pa)
		return (1);
	return (0);
}

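/*
 * Allocate the mlxcx_port_t structs, query each port's configuration and
 * state from the HCA, and build each port's root RX flow table.
 */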
static boolean_t
mlxcx_setup_ports(mlxcx_t *mlxp)
{
	uint_t i, j;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;

	VERIFY3U(mlxp->mlx_nports, >, 0);
	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		p->mlp_num = i;
		p->mlx_port_event.mla_mlx = mlxp;
		p->mlx_port_event.mla_port = p;
		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
		p->mlp_init |= MLXCX_PORT_INIT;
		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		mutex_enter(&p->mlp_mtx);
		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_fec_requested = LINK_FEC_AUTO;

		mutex_exit(&p->mlp_mtx);
	}

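	/*
	 * Now build each port's root RX flow table and the flow groups
	 * within it (broadcast, unicast/multicast and promisc).
	 */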
	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		mutex_enter(&p->mlp_mtx);
		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
		    KM_SLEEP));
		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));

		mutex_enter(&ft->mlft_mtx);

		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
		ft->mlft_port = p;
		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
		ft->mlft_nents = (1 << ft->mlft_entshift);
		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
		    offsetof(mlxcx_flow_group_t, mlfg_entry));

		for (j = 0; j < ft->mlft_nents; ++j) {
			ft->mlft_ent[j].mlfe_table = ft;
			ft->mlft_ent[j].mlfe_index = j;
		}

		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		/*
		 * We match broadcast at the top of the root flow table, then
		 * all multicast/unicast MACs, then the promisc entry is down
		 * the very bottom.
		 *
		 * This way when promisc is on, that entry simply catches any
		 * remaining traffic that earlier flows haven't matched.
		 */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_bcast = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = ft->mlft_nents - 2;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_umcast = fg;

		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_promisc = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
		    mlfe_dmac_entry));

		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&p->mlp_mtx);
	}

	return (B_TRUE);

err:
	mlxcx_teardown_ports(mlxp);
	return (B_FALSE);
}

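/*
 * Each ring group has a second-level VLAN flow table (mlg_rx_vlan_ft)
 * which the port's root flow table forwards into. The functions below
 * manage the VLAN entries in that table; the "default" entry, when
 * present, lets traffic for any VLAN through.
 */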
void
mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	if (!list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
	}

	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
		fe = v->mlgv_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);
		ASSERT3P(fe->mlfe_group, ==, fg);
		kmem_free(v, sizeof (mlxcx_group_vlan_t));

		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}

boolean_t
mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    boolean_t tagged, uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_remove(&g->mlg_rx_vlans, v);

	/*
	 * If this is the last VLAN entry, we have to go back to accepting
	 * any VLAN (which means re-enabling the default entry).
	 *
	 * Do this before we remove the flow entry for the last specific
	 * VLAN so that we don't lose any traffic in the transition.
	 */
	if (list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			list_insert_tail(&g->mlg_rx_vlans, v);
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
	}

	fe = v->mlgv_fe;
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT3P(fe->mlfe_group, ==, fg);

	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
		list_insert_tail(&g->mlg_rx_vlans, v);
		fe = list_head(&dfg->mlfg_entries);
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	kmem_free(v, sizeof (mlxcx_group_vlan_t));

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}

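/*
 * Reserve one of the free entries in the group's VLAN flow group for this
 * VLAN. Adding the first VLAN to a group also deletes the catch-all
 * default entry, so that from then on only the listed VLANs are accepted.
 */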
boolean_t
mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
    uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;
	boolean_t first = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			mutex_exit(&ft->mlft_mtx);
			return (B_TRUE);
		}
	}
	if (list_is_empty(&g->mlg_rx_vlans))
		first = B_TRUE;

	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
	    fe = list_next(&fg->mlfg_entries, fe)) {
		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
	v->mlgv_fe = fe;
	v->mlgv_tagged = tagged;
	v->mlgv_vid = vid;

	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
	fe->mlfe_vid = vid;
	if (tagged) {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
	} else {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
	}

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		kmem_free(v, sizeof (mlxcx_group_vlan_t));
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_insert_tail(&g->mlg_rx_vlans, v);

	/*
	 * If the vlan list was empty for this group before adding this one,
	 * then we no longer want the "default" entry to allow all VLANs
	 * through.
	 */
	if (first) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	}

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}

void
mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, *ngm;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_first(&group->mlg_rx_macs);
	for (; gm != NULL; gm = ngm) {
		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);

		ASSERT3P(gm->mlgm_group, ==, group);
		fe = gm->mlgm_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);

		avl_remove(&group->mlg_rx_macs, gm);
		list_remove(&fe->mlfe_ring_groups, gm);
		kmem_free(gm, sizeof (mlxcx_group_mac_t));

		fe->mlfe_ndest = 0;
		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
			    gm->mlgm_group->mlg_rx_vlan_ft;
		}
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		if (fe->mlfe_ndest > 0) {
			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
			continue;
		}

		/*
		 * There are no more ring groups left for this MAC (it wasn't
		 * attached to any other groups since ndest == 0), so clean up
		 * its flow entry.
		 */
		avl_remove(&port->mlp_dmac_fe, fe);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		list_destroy(&fe->mlfe_ring_groups);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}

boolean_t
mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, probe;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
	if (gm == NULL) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
	ASSERT3P(gm->mlgm_group, ==, group);
	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));

	fe = gm->mlgm_fe;
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));

	list_remove(&fe->mlfe_ring_groups, gm);
	avl_remove(&group->mlg_rx_macs, gm);
	kmem_free(gm, sizeof (mlxcx_group_mac_t));

	fe->mlfe_ndest = 0;
	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
		    gm->mlgm_group->mlg_rx_vlan_ft;
	}
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (fe->mlfe_ndest > 0) {
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_TRUE);
	}

	/*
	 * There are no more ring groups left for this MAC (it wasn't attached
	 * to any other groups since ndest == 0), so clean up its flow entry.
	 */
	avl_remove(&port->mlp_dmac_fe, fe);
	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	list_destroy(&fe->mlfe_ring_groups);

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}

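/*
 * Add a flow entry directing packets for macaddr to this group's VLAN
 * flow table. Flow entries are shared between groups: if another group
 * already receives this MAC, the group is simply added as a further
 * destination of the existing entry.
 */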
boolean_t
mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe, probe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));

	mutex_enter(&ft->mlft_mtx);

	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);

	if (fe == NULL) {
		fg = port->mlp_umcast;
		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
		    fe = list_next(&fg->mlfg_entries, fe)) {
			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));

		avl_add(&port->mlp_dmac_fe, fe);
	}

	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		if (--fe->mlfe_ndest == 0) {
			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
	gm->mlgm_group = group;
	gm->mlgm_fe = fe;
	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
	avl_add(&group->mlg_rx_macs, gm);
	list_insert_tail(&fe->mlfe_ring_groups, gm);

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}

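/*
 * Carve out fg->mlfg_size entries from the flow table, starting at the
 * table's next free index, create the flow group in the HCA, and thread
 * the entries onto the group's mlfg_entries list.
 */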
boolean_t
mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
    mlxcx_flow_group_t *fg)
{
	mlxcx_flow_entry_t *fe;
	uint_t i, idx;

	ASSERT(mutex_owned(&ft->mlft_mtx));
	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
	ASSERT3P(fg->mlfg_table, ==, ft);

	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
		return (B_FALSE);
	fg->mlfg_start_idx = ft->mlft_next_ent;

	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
		return (B_FALSE);
	}

	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
	for (i = 0; i < fg->mlfg_size; ++i) {
		idx = fg->mlfg_start_idx + i;
		fe = &ft->mlft_ent[idx];
		fe->mlfe_group = fg;
		list_insert_tail(&fg->mlfg_entries, fe);
	}
	fg->mlfg_avail = fg->mlfg_size;
	ft->mlft_next_ent += fg->mlfg_size;

	return (B_TRUE);
}

static boolean_t
mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
{
	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];

	mutex_enter(&mleq->mleq_mtx);
	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mleq->mleq_mlx = mlxp;
	mleq->mleq_uar = &mlxp->mlx_uar;
	mleq->mleq_events = events;
	mleq->mleq_intr_index = vec;

	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
		/*
		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
		 * eq_rele_dma
		 */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
	mleq->mleq_state |= MLXCX_EQ_ATTACHING;
	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (B_TRUE);
}

static void
mlxcx_eq_set_attached(mlxcx_t *mlxp)
{
	uint_t vec;
	mlxcx_event_queue_t *mleq;

	for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
		mleq = &mlxp->mlx_eqs[vec];

		mutex_enter(&mleq->mleq_mtx);
		mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
		mutex_exit(&mleq->mleq_mtx);
	}
}

static boolean_t
mlxcx_setup_async_eqs(mlxcx_t *mlxp)
{
	boolean_t ret;

	ret = mlxcx_setup_eq(mlxp, 0,
	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
	    (1ULL << MLXCX_EVENT_PORT_STATE) |
	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
	    (1ULL << MLXCX_EVENT_LAST_WQE) |
	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));

	if (ret)
		mlxcx_cmd_eq_enable(mlxp);

	return (ret);
}

int
mlxcx_cq_compare(const void *arg0, const void *arg1)
{
	const mlxcx_completion_queue_t *left = arg0;
	const mlxcx_completion_queue_t *right = arg1;

	if (left->mlcq_num < right->mlcq_num) {
		return (-1);
	}
	if (left->mlcq_num > right->mlcq_num) {
		return (1);
	}
	return (0);
}

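/*
 * Create, enable and arm the EQs used for completion events: one per
 * remaining interrupt vector, starting at mlx_intr_cq0. EQ 0, which
 * carries all other (async) event types, is set up separately by
 * mlxcx_setup_async_eqs() above.
 */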
static boolean_t
mlxcx_setup_eqs(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_event_queue_t *mleq;

	ASSERT3S(mlxp->mlx_intr_count, >, 0);

	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_uar = &mlxp->mlx_uar;
		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
			/* mlxcx_teardown() will handle calling eq_rele_dma */
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
		    !mlxcx_cmd_set_int_mod(mlxp, i,
		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
		mlxcx_arm_eq(mlxp, mleq);
		mutex_exit(&mleq->mleq_mtx);
	}

	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;

	return (B_TRUE);
}

/*
 * A more recent ConnectX part will have the Port Capability Mask register.
 * Explore it and note things here.
 */
static void
mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
{
	mlxcx_register_data_t data;
	mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;

	ASSERT(c->mlc_pcam);
	bzero(&data, sizeof (data));

	/*
	 * Okay, so we have access to the Ports Capability Mask (PCAM).
	 * There are various things we need to check about it.
	 */

	VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
	    MLXCX_REG_PCAM, &data));

	/*
	 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts.
	 * As of now, only 0 is valid, and 1-255 are reserved. A future part
	 * may return non-zero in these fields.
	 */
	ASSERT0(pcam->mlrd_pcam_feature_group);
	ASSERT0(pcam->mlrd_pcam_access_reg_group);

	c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
	    MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
}

/*
 * Snapshot all of the hardware capabilities that we care about and then modify
 * the HCA capabilities to get things moving.
 */
static boolean_t
mlxcx_init_caps(mlxcx_t *mlxp)
{
	mlxcx_caps_t *c;

	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
	}

	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
	}

	/*
	 * Check the caps meet our requirements.
	 */
	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;

	if (gen->mlcap_general_log_pg_sz != 12) {
		mlxcx_warn(mlxp, "!hardware has page size != 4k "
		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
		goto err;
	}
	if (gen->mlcap_general_cqe_version != 1) {
		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
		goto err;
	}
	if (gen->mlcap_general_port_type !=
	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
		goto err;
	}
	mlxp->mlx_nports = gen->mlcap_general_num_ports;
	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));

	if (mlxp->mlx_type >= MLXCX_DEV_CX5 &&
	    get_bit16(gen->mlcap_general_flags_c,
	    MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) {
		c->mlc_pcam = B_TRUE;
	}

	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);

	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
	    MLXCX_ETH_CAP_CSUM_CAP);
	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);

	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
	if (c->mlc_max_lso_size == 1) {
		c->mlc_max_lso_size = 0;
		c->mlc_lso = B_FALSE;
	} else {
		c->mlc_lso = B_TRUE;
	}

	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));

	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
		goto err;
	}
	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
		    "flow table entries");
		goto err;
	}

	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
	    mlcap_flow_prop_log_max_ft_size;
	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);

	return (B_TRUE);

err:
	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
	return (B_FALSE);
}

static int
mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	mlxcx_t *mlxp;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	mlxp = ddi_get_driver_private(dip);
	if (mlxp == NULL) {
		mlxcx_warn(NULL, "asked to detach, but missing instance "
		    "private data");
		return (DDI_FAILURE);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
	}

	mlxcx_teardown(mlxp);
	return (DDI_SUCCESS);
}

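/*
 * Work out how many RX ring groups we can advertise to MAC. The number
 * configured via driver properties is clamped against several limits
 * derived from the device capabilities: the number of TIRs available,
 * the maximum size of an RX flow table (e.g. a reported
 * mlc_max_rx_ft_shift of 5 would mean 32-entry tables, capping us at
 * 30 groups from that term alone), the maximum number of RX flow
 * tables, and the maximum total number of RX flow entries.
 */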
static size_t
mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
{
	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
	    mlxp->mlx_props.mldp_rx_ngroups_small;
	size_t tirlim, flowlim, gflowlim;

	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
	if (tirlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on number of TIRs available", tirlim);
		ngroups = tirlim;
	}

	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max size of RX flow tables", flowlim);
		ngroups = flowlim;
	}

	/*
	 * Restrict the number of groups so as not to exceed the maximum
	 * number of flow tables from the device's capabilities. There is
	 * one root flow table per port and two flow tables per group.
	 */
	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max number of RX flow tables",
		    flowlim);
		ngroups = flowlim;
	}

	do {
		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
		if (gflowlim < ngroups) {
			mlxcx_note(mlxp, "limiting number of rx groups to %u "
			    "based on max total RX flows", gflowlim);
			--ngroups;
		}
	} while (gflowlim < ngroups);

	return (ngroups);
}

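/*
 * Bring a device instance from reset to fully operational, registering
 * with MAC last of all. Each stage that completes sets a bit in
 * mlx_attach, which mlxcx_teardown() uses to unwind a partially
 * completed attach when a later stage fails.
 */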
static int
mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	mlxcx_t *mlxp;
	char tq_name[TASKQ_NAMELEN];
	uint_t i;
	int inst, ret;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	inst = ddi_get_instance(dip);
	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
	if (ret != 0)
		return (ret);

	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
	if (mlxp == NULL)
		return (DDI_FAILURE);
	mlxp->mlx_dip = dip;
	mlxp->mlx_inst = inst;
	ddi_set_driver_private(dip, mlxp);

	mlxcx_load_props(mlxp);

	mlxcx_fm_init(mlxp);
	mlxp->mlx_attach |= MLXCX_ATTACH_FM;

	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to initialize PCI config space");
		goto err;
	}
	mlxcx_get_model(mlxp);
	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;

	if (!mlxcx_regs_map(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;

	if (!mlxcx_cmd_queue_init(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;

	if (!mlxcx_cmd_enable_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;

	if (!mlxcx_check_issi(mlxp)) {
		goto err;
	}

	/*
	 * We have to get our interrupts now so we know what priority to
	 * create pagemtx with.
	 */
	if (!mlxcx_intr_setup(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;

	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;

	/*
	 * Taskq for asynchronous events which may interact with the HCA
	 * via the command interface. Single threaded FIFO.
	 */
	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
	    TASKQ_PREPOPULATE);
	/*
	 * Initialize any pre-allocated taskq param structs.
	 */
	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;

	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
		goto err;
	}

	if (!mlxcx_init_caps(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;

	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
		goto err;
	}

	if (!mlxcx_cmd_init_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;

	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
		goto err;
	}

	if (mlxp->mlx_caps->mlc_pcam) {
		mlxcx_explore_pcam(mlxp, mlxp->mlx_caps);
	}

	/*
	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
	 * doorbells.
	 */
	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
		goto err;
	}
	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;

	/*
	 * Set up the asynchronous event queue, which handles control type
	 * events like PAGE_REQUEST and CMD completion events.
	 *
	 * This will enable and arm the interrupt on EQ 0. Note that only page
	 * reqs and cmd completions will be handled until we call
	 * mlxcx_eq_set_attached further down (this way we don't need an extra
	 * set of locks over the mlxcx_t sub-structs that aren't allocated
	 * yet).
	 */
	if (!mlxcx_setup_async_eqs(mlxp)) {
		goto err;
	}

	/*
	 * Allocate a protection and transport domain. These don't really do
	 * anything for us (they're IB concepts), but we need to give their
	 * ID numbers in other commands.
	 */
	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
		goto err;
	}
	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
		goto err;
	}
	/*
	 * Fetch the "reserved" lkey that lets us give linear addresses in
	 * work queue entries, rather than having to mess with the NIC's
	 * internal MMU.
	 */
	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
		goto err;
	}

	/*
	 * Query our port information and current state, populate the
	 * mlxcx_port_t structs.
	 *
	 * This also sets up the root flow tables and flow groups.
	 */
	if (!mlxcx_setup_ports(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;

	mlxcx_load_model_props(mlxp);

	/*
	 * Set up, enable and arm the rest of the interrupt EQs which will
	 * service events from CQs.
	 *
	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
	 * cleaned up.
	 */
	if (!mlxcx_setup_eqs(mlxp)) {
		goto err;
	}

	/* Completion queues */
	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;

	/* Work queues (send queues, receive queues) */
	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
	    offsetof(mlxcx_work_queue_t, mlwq_entry));
	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;

	/*
	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
	 * "groups" we advertise to MAC.
	 */
	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);

	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);

	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;

	/*
	 * Set up the free/busy buffer lists for keeping track of packet
	 * buffers.
	 */
	if (!mlxcx_setup_bufs(mlxp))
		goto err;
	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;

	/*
	 * Before we tell MAC about our rings/groups, we need to do enough
	 * setup on them to be sure about the numbers and configuration that
	 * we have. This will do basically everything short of allocating
	 * packet buffers and starting the rings up.
	 */
	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
			goto err;
	}
	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
			goto err;
	}

	/*
	 * Set up the periodic fault check timers which check the queue
	 * states. This must be done after all the queues have been
	 * initialized; consequently, the timers must be torn down before
	 * queue teardown.
	 */
	if (!mlxcx_setup_checktimers(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;

	/*
	 * Some devices may not have a working temperature sensor; however,
	 * there isn't a great way for us to know. We shouldn't fail attach if
	 * this doesn't work.
	 */
	if (mlxcx_setup_sensors(mlxp)) {
		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
	}

	/*
	 * Finally, tell MAC that we exist!
	 */
	if (!mlxcx_register_mac(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;

	/*
	 * This tells the interrupt handlers they can start processing events
	 * other than cmd completions and page requests.
	 */
	mlxcx_eq_set_attached(mlxp);

	return (DDI_SUCCESS);

err:
	mlxcx_teardown(mlxp);
	return (DDI_FAILURE);
}

static struct cb_ops mlxcx_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops mlxcx_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = mlxcx_attach,
	.devo_detach = mlxcx_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &mlxcx_cb_ops
};

static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};

static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
	if (ret != 0) {
		return (ret);
	}

	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);

	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
		mac_fini_ops(&mlxcx_dev_ops);
		ddi_soft_state_fini(&mlxcx_softstate);
		return (ret);
	}

	return (DDI_SUCCESS);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&mlxcx_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
		return (ret);
	}

	mac_fini_ops(&mlxcx_dev_ops);

	ddi_soft_state_fini(&mlxcx_softstate);

	return (DDI_SUCCESS);
}