1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2020, The University of Queensland 14 * Copyright (c) 2018, Joyent, Inc. 15 * Copyright 2020 RackTop Systems, Inc. 16 */ 17 18 /* 19 * Mellanox Connect-X 4/5/6 driver. 20 */ 21 22 /* 23 * The PRM for this family of parts is freely available, and can be found at: 24 * https://www.mellanox.com/related-docs/user_manuals/ \ 25 * Ethernet_Adapters_Programming_Manual.pdf 26 */ 27 /* 28 * ConnectX glossary 29 * ----------------- 30 * 31 * WR Work Request: something we've asked the hardware to do by 32 * creating a Work Queue Entry (WQE), e.g. send or recv a packet 33 * 34 * WQE Work Queue Entry: a descriptor on a work queue descriptor ring 35 * 36 * WQ Work Queue: a descriptor ring that we can place WQEs on, usually 37 * either a Send Queue (SQ) or Receive Queue (RQ). Different WQ 38 * types have different WQE structures, different commands for 39 * creating and destroying them, etc, but share a common context 40 * structure, counter setup and state graph. 41 * SQ Send Queue, a specific type of WQ that sends packets 42 * RQ Receive Queue, a specific type of WQ that receives packets 43 * 44 * CQ Completion Queue: completions of WRs from a WQ are reported to 45 * one of these, as a CQE on its entry ring. 46 * CQE Completion Queue Entry: an entry in a CQ ring. Contains error 47 * info, as well as packet size, the ID of the WQ, and the index 48 * of the WQE which completed. Does not contain any packet data. 49 * 50 * EQ Event Queue: a ring of event structs from the hardware informing 51 * us when particular events happen. Many events can point at 52 * a particular CQ which we should then go look at. 53 * EQE Event Queue Entry: an entry on the EQ ring 54 * 55 * UAR User Access Region, a page of the device's PCI BAR which is 56 * tied to particular EQ/CQ/WQ sets and contains doorbells to 57 * ring to arm them for interrupts or wake them up for new work 58 * 59 * RQT RQ Table, a collection of indexed RQs used to refer to the group 60 * as a single unit (e.g. for hashing/RSS). 61 * 62 * TIR Transport Interface Receive, a bucket of resources for the 63 * reception of packets. TIRs have to point at either a single RQ 64 * or a table of RQs (RQT). They then serve as a target for flow 65 * table entries (FEs). TIRs that point at an RQT also contain the 66 * settings for hashing for RSS. 67 * 68 * TIS Transport Interface Send, a bucket of resources associated with 69 * the transmission of packets. In particular, the temporary 70 * resources used for LSO internally in the card are accounted to 71 * a TIS. 72 * 73 * FT Flow Table, a collection of FEs and FGs that can be referred to 74 * as a single entity (e.g. used as a target from another flow 75 * entry or set as the "root" table to handle incoming or outgoing 76 * packets). Packets arriving at a FT are matched against the 77 * FEs in the table until either one matches with a terminating 78 * action or all FEs are exhausted (it's first-match-wins but with 79 * some actions that are non-terminal, like counting actions). 80 * 81 * FG Flow Group, a group of FEs which share a common "mask" (i.e.
82 * they match on the same attributes of packets coming into the 83 * flow). 84 * 85 * FE Flow Entry, an individual set of values to match against 86 * packets entering the flow table, combined with an action to 87 * take upon a successful match. The action we use most is 88 * "forward", which sends the packets to a TIR or another flow 89 * table and then stops further processing within the FE's FT. 90 * 91 * lkey/mkey A reference to something similar to a page table but in the 92 * device's internal onboard MMU. Since Connect-X parts double as 93 * IB cards (lots of RDMA) they have extensive onboard memory mgmt 94 * features which we try very hard not to use. For our WQEs we use 95 * the "reserved" lkey, which is a special value which indicates 96 * that addresses we give are linear addresses and should not be 97 * translated. 98 * 99 * PD Protection Domain, an IB concept. We have to allocate one to 100 * provide as a parameter for new WQs, but we don't do anything 101 * with it. 102 * 103 * TDOM/TD Transport Domain, an IB concept. We allocate one in order to 104 * provide it as a parameter to TIR/TIS creation, but we don't do 105 * anything with it. 106 */ 107 /* 108 * 109 * Data flow overview 110 * ------------------ 111 * 112 * This driver is a MAC ring-enabled driver which maps rings to send and recv 113 * queues in hardware on the device. 114 * 115 * Each SQ and RQ is set up to report to its own individual CQ, to ensure 116 * sufficient space, and simplify the logic needed to work out which buffer 117 * was completed. 118 * 119 * The CQs are then round-robin allocated onto EQs, of which we set up one per 120 * interrupt that the system gives us for the device. Normally this means we 121 * have 8 EQs. 122 * 123 * When we have >= 8 EQs available, we try to allocate only RX or only TX 124 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion. 125 * 126 * EQ #0 is reserved for all event types other than completion events, and has 127 * no CQs associated with it at any time. EQs #1 and upwards are only used for 128 * handling CQ completion events. 129 * 130 * +------+ +------+ +------+ +---------+ 131 * | SQ 0 |---->| CQ 0 |-----+ | EQ 0 |------> | MSI-X 0 | mlxcx_intr_0 132 * +------+ +------+ | +------+ +---------+ 133 * | 134 * +------+ +------+ | 135 * | SQ 1 |---->| CQ 1 |---+ | +------+ 136 * +------+ +------+ | +---> | | 137 * | | | 138 * +------+ +------+ | | EQ 1 | +---------+ 139 * | SQ 2 |---->| CQ 2 |---------> | |------> | MSI-X 1 | mlxcx_intr_n 140 * +------+ +------+ | +---> | | +---------+ 141 * | | +------+ 142 * | | 143 * ... | | 144 * | | +------+ 145 * +------+ +------+ +-----> | | 146 * | RQ 0 |---->| CQ 3 |---------> | | +---------+ 147 * +------+ +------+ | | EQ 2 |------> | MSI-X 2 | mlxcx_intr_n 148 * | | | +---------+ 149 * +------+ +------+ | +-> | | 150 * | RQ 1 |---->| CQ 4 |-----+ | +------+ 151 * +------+ +------+ | 152 * | .... 153 * +------+ +------+ | 154 * | RQ 2 |---->| CQ 5 |-------+ 155 * +------+ +------+ 156 * 157 * ... (note this diagram does not show RX-only or TX-only EQs) 158 * 159 * For TX, we advertise all of the SQs we create as plain rings to MAC with 160 * no TX groups. This puts MAC in "virtual group" mode where it will allocate 161 * and use the rings as it sees fit. 162 * 163 * For RX, we advertise actual groups in order to make use of hardware 164 * classification. 165 * 166 * The hardware classification we use is based around Flow Tables, and we 167 * currently ignore all of the eswitch features of the card. 
The NIC VPORT 168 * is always set to promisc mode so that the eswitch sends us all of the 169 * traffic that arrives on the NIC, and we use flow entries to manage 170 * everything. 171 * 172 * We use 2 layers of flow tables for classification: traffic arrives at the 173 * root RX flow table which contains MAC address filters. Those then send 174 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN 175 * presence and VID filters. 176 * 177 * Since these parts only support doing RSS hashing on a single protocol at a 178 * time, we have to use a third layer of flow tables as well to break traffic 179 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc) 180 * so that it can be sent to the appropriate TIR for hashing. 181 * 182 * Incoming packets 183 * + +---------+ +---------+ 184 * | +->| group 0 | | group 0 | 185 * | | | vlan ft | +-->| hash ft | 186 * v | | L1 | | | L2 | 187 * +----+----+ | +---------+ | +---------+ +-----+ +-----+------+ 188 * | eswitch | | | | | | TCPv6 |--->| TIR |--->| | RQ0 | 189 * +----+----+ | | | | +---------+ +-----+ | +------+ 190 * | | | | | | UDPv6 |--->| TIR |--->| | RQ1 | 191 * | | | | | +---------+ +-----+ | +------+ 192 * | | | | | | TCPv4 |--->| TIR |--->| | RQ2 | 193 * v | | | | +---------+ +-----+ | RQT +------+ 194 * +----+----+ | +---------+ | | UDPv4 |--->| TIR |--->| | ... | 195 * | root rx | | | default |--+ +---------+ +-----+ | | | 196 * | flow tb | | +---------+ | | IPv6 |--->| TIR |--->| | | 197 * | L0 | | | promisc |--+ +---------+ +-----+ | | | 198 * +---------+ | +---------+ ^ | IPv4 |--->| TIR |--->| | | 199 * | bcast |---|---------------+ +---------+ +-----+ +-----+------+ 200 * +---------+ | ^ | other |-+ 201 * | MAC 0 |---+ | +---------+ | +-----+ +-----+ 202 * +---------+ | +->| TIR |--->| RQ0 | 203 * | MAC 1 |-+ | +-----+ +-----+ 204 * +---------+ | +---------------+ 205 * | MAC 2 |-+ | ^ 206 * +---------+ | | | 207 * | MAC 3 |-+ | +---------+ | +---------+ 208 * +---------+ | | | group 1 | | | group 1 | 209 * | ..... | +--->| vlan ft | | +>| hash ft | 210 * | | | | L1 | | | | L2 | 211 * +---------+ | +---------+ | | +---------+ +-----+ +-----+------+ 212 * | promisc |---+ | VLAN 0 |----+ | TCPv6 |--->| TIR |--->| | RQ3 | 213 * +---------+ +---------+ | +---------+ +-----+ | +------+ 214 * | ..... | | | UDPv6 |--->| TIR |--->| | RQ4 | 215 * | | | +---------+ +-----+ | +------+ 216 * | | | | TCPv4 |--->| TIR |--->| | RQ5 | 217 * | | | +---------+ +-----+ | RQT +------+ 218 * +---------+ | | UDPv4 |--->| TIR |--->| | ... | 219 * | | | +---------+ +-----+ | | | 220 * +---------+ | | IPv6 |--->| TIR |--->| | | 221 * | promisc |--+ +---------+ +-----+ | | | 222 * +---------+ | IPv4 |--->| TIR |--->| | | 223 * +---------+ +-----+ +-----+------+ 224 * | other |-+ 225 * +---------+ | 226 * ....... | +-----+ +-----+ 227 * +->| TIR |--->| RQ3 | 228 * +-----+ +-----+ 229 * 230 * Note that the "promisc" flow entries are only set/enabled when promisc 231 * mode is enabled for the NIC. All promisc flow entries point directly at 232 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0, 233 * the "default group" in MAC). 234 * 235 * The "default" entry in the L1 VLAN filter flow tables is used when there 236 * are no VLANs set for the group, to accept any traffic regardless of tag. It 237 * is deleted as soon as a VLAN filter is added (and re-instated if the 238 * last VLAN filter is removed). 
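 *
 * As a rough sketch only (pseudo-code with made-up helper names, not the
 * actual routines further down in this file), maintaining that default
 * entry looks like:
 *
 *	add_vlan_filter(group, vid):
 *		if (the group has no VLAN filters yet)
 *			delete the group's "default" catch-all entry;
 *		add an FE matching VLAN-present + vid -> group hash FT;
 *
 *	remove_vlan_filter(group, vid):
 *		delete the FE for vid;
 *		if (that was the group's last VLAN filter)
 *			re-create the "default" catch-all entry;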
239 * 240 * The actual descriptor ring structures for RX on Connect-X4 don't contain any 241 * space for packet data (they're a collection of scatter pointers only). TX 242 * descriptors contain some space for "inline headers" (and the card requires 243 * us to put at least the L2 Ethernet headers there for the eswitch to look at) 244 * but all the rest of the data comes from the gather pointers. 245 * 246 * When we get completions back they simply contain the ring index number of 247 * the WR (work request) which completed. So, we manage the buffers for actual 248 * packet data completely independently of the descriptors in this driver. When 249 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer 250 * with the WQE index that we put it at, and therefore don't have to look at 251 * the original descriptor at all when handling completions. 252 * 253 * For RX, we create sufficient packet data buffers to fill 150% of the 254 * available descriptors for each ring. These are all pre-set-up for DMA and 255 * have an mblk_t associated with them (with desballoc()). 256 * 257 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is 258 * large enough), or we copy it into a pre-allocated buffer set up in the same 259 * way as for RX. 260 */ 261 262 /* 263 * Buffer lifecycle: RX 264 * -------------------- 265 * 266 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty 267 * straightforward. 268 * 269 * It is created (and has all its memory allocated) at the time of starting up 270 * the RX ring it belongs to. Then it is placed on the "free" list in the 271 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants 272 * more buffers to add to the RQ, it takes one off and marks it as "on WQ" 273 * before making a WQE for it. 274 * 275 * After a completion event occurs, the packet is either discarded (and the 276 * buffer_t returned to the free list), or it is readied for loaning to MAC 277 * and placed on the "loaned" list in the mlxcx_buffer_shard_t. 278 * 279 * Once MAC and the rest of the system have finished with the packet, they call 280 * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point 281 * the fate of the buffer_t is determined by the state of the 282 * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t 283 * will be returned to the free list, potentially to be recycled and used 284 * again. But if the shard is draining (e.g. after a ring stop) there will be 285 * no recycling and the buffer_t is immediately destroyed. 286 * 287 * At detach/teardown time, buffers are only ever destroyed from the free list. 288 * 289 * 290 * + 291 * | 292 * | mlxcx_buf_create 293 * | 294 * v 295 * +----+----+ 296 * | created | 297 * +----+----+ +------+ 298 * | | dead | 299 * | +------+ 300 * | mlxcx_buf_return ^ 301 * | | 302 * v | mlxcx_buf_destroy 303 * mlxcx_buf_destroy +----+----+ +-----------+ | 304 * +---------| free |<------no-| draining?
|-yes-+ 305 * | +----+----+ +-----------+ 306 * | | ^ 307 * | | | 308 * v | mlxcx_buf_take | mlxcx_buf_return 309 * +---+--+ v | 310 * | dead | +---+---+ | 311 * +------+ | on WQ |- - - - - - - - >O 312 * +---+---+ ^ 313 * | | 314 * | | 315 * | mlxcx_buf_loan | mlxcx_buf_mp_return 316 * v | 317 * +-------+--------+ | 318 * | on loan to MAC |----------->O 319 * +----------------+ freemsg() 320 * 321 */ 322 323 /* 324 * Buffer lifecycle: TX 325 * -------------------- 326 * 327 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and 328 * "foreign" buffers. 329 * 330 * The former have their memory allocated and DMA bound by this driver, while 331 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is 332 * not owned by us, though we do DMA bind it (and take responsibility for 333 * un-binding it when we're done with them). 334 * 335 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each 336 * SQ. Thus, there is a separate free list and mutex for each kind. 337 * 338 * Since a TX packet might consist of multiple mblks, we translate each mblk 339 * into exactly one buffer_t. The buffer_ts are chained together in the same 340 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t. 341 * 342 * Each chain of TX buffers may consist of foreign or driver buffers, in any 343 * mixture. 344 * 345 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes 346 * it from the rest of the chain buffers. 347 * 348 * TX buffer chains are always returned to the free list by 349 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and 350 * freeing all of the members. 351 * 352 * We only call freemsg() once, on the head of the TX buffer chain's original 353 * mblk. This is true whether we copied it or bound it in a foreign buffer. 354 */ 355 356 /* 357 * Startup and command interface 358 * ----------------------------- 359 * 360 * The command interface is the primary way in which we give control orders to 361 * the hardware (e.g. actions like "create this queue" or "delete this flow 362 * entry"). The command interface is never used to transmit or receive packets 363 * -- that takes place only on the queues that are set up through it. 364 * 365 * In mlxcx_cmd.c we implement our use of the command interface on top of a 366 * simple taskq. As commands are submitted from the taskq they choose a 367 * "slot"; if there are no free slots then execution of the command will 368 * be paused until one is free. The hardware permits up to 32 independent 369 * slots for concurrent command execution. 370 * 371 * Before interrupts are enabled, command completion is polled; once 372 * interrupts are up, command completions become asynchronous and are 373 * wired to EQ 0. A caveat to this is that commands cannot be submitted 374 * directly from EQ 0's completion handler, and any processing resulting from 375 * an asynchronous event which requires further use of the command interface 376 * is posted through a taskq. 377 * 378 * The startup/attach process for this card involves a bunch of different steps 379 * which are summarised pretty well in the PRM. We have to send a number of 380 * commands which do different things to start the card up, give it some pages 381 * of our own memory for it to use, then start creating all the entities that 382 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs 383 * and TDoms.
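 *
 * As an illustrative sketch only (hypothetical names, not the real
 * routines in mlxcx_cmd.c), the life of a single command looks roughly
 * like:
 *
 *	acquire a free slot (sleep if all 32 are currently in use);
 *	copy the input mailbox into the slot and ring its doorbell;
 *	if (interrupts are not enabled yet)
 *		poll the slot until the HCA marks the command complete;
 *	else
 *		cv_wait() until the EQ 0 handler signals completion;
 *	copy out the output mailbox, check status, release the slot;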
384 */ 385 386 /* 387 * UARs 388 * ---- 389 * 390 * The pages of the PCI BAR other than the first few are reserved for use as 391 * "UAR" sections in this device. Each UAR section can be used as a set of 392 * doorbells for our queues. 393 * 394 * Currently we just make one single UAR for all of our queues. It doesn't 395 * seem to be a major limitation yet. 396 * 397 * When we're sending packets through an SQ, the PRM is not awful clear about 398 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers 399 * (it's clear on the pattern of alternation you're expected to use between 400 * even and odd for Blueflame sends, but not for regular doorbells). 401 * 402 * Currently we don't do the even-odd alternating pattern for ordinary 403 * doorbells, and we don't use Blueflame at all. This seems to work fine, at 404 * least on Connect-X4 Lx. 405 */ 406 407 /* 408 * Lock ordering 409 * ------------- 410 * 411 * Interrupt side: 412 * 413 * - mleq_mtx 414 * - mlcq_arm_mtx 415 * - mlcq_mtx 416 * - mlcq_bufbmtx 417 * - mlwq_mtx 418 * - mlbs_mtx 419 * - mlp_mtx 420 * 421 * GLD side: 422 * 423 * - mlp_mtx 424 * - mlg_mtx 425 * - mlg_*.mlft_mtx 426 * - mlp_*.mlft_mtx 427 * - mlwq_mtx 428 * - mlbs_mtx 429 * - mlcq_bufbmtx 430 * - mleq_mtx 431 * - mlcq_arm_mtx 432 * - mlcq_mtx 433 * 434 */ 435 436 #include <sys/modctl.h> 437 #include <sys/conf.h> 438 #include <sys/devops.h> 439 #include <sys/sysmacros.h> 440 #include <sys/time.h> 441 442 #include <sys/mac_provider.h> 443 444 #include <mlxcx.h> 445 446 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP); 447 448 #define MLXCX_MODULE_NAME "mlxcx" 449 /* 450 * We give this to the firmware, so it has to be in a fixed format that it 451 * understands. 452 */ 453 #define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000" 454 455 /* 456 * Firmware may take a while to reclaim pages. Try a set number of times. 457 */ 458 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */ 459 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */ 460 461 static void *mlxcx_softstate; 462 463 /* 464 * Fault detection thresholds. 465 */ 466 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; 467 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; 468 469 static void 470 mlxcx_load_prop_defaults(mlxcx_t *mlxp) 471 { 472 mlxcx_drv_props_t *p = &mlxp->mlx_props; 473 mlxcx_port_t *port = &mlxp->mlx_ports[0]; 474 475 VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); 476 VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); 477 478 /* 479 * Currently we have different queue size defaults for two 480 * categories of queues. One set for devices which support a 481 * maximum speed of 10Gb/s, and another for those above that. 482 */ 483 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 484 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { 485 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 486 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 487 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 488 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 489 MLXCX_PROTO_10G)) != 0) { 490 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 491 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 492 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 493 } else { 494 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 495 "recognize. 
Proto: 0x%x", port->mlp_max_proto); 496 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 497 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 498 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 499 } 500 } 501 502 /* 503 * Properties which may have different defaults based on hardware 504 * characteristics. 505 */ 506 static void 507 mlxcx_load_model_props(mlxcx_t *mlxp) 508 { 509 mlxcx_drv_props_t *p = &mlxp->mlx_props; 510 511 mlxcx_load_prop_defaults(mlxp); 512 513 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 514 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 515 p->mldp_cq_size_shift_default); 516 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 517 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 518 p->mldp_sq_size_shift_default); 519 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 520 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 521 p->mldp_rq_size_shift_default); 522 } 523 524 static void 525 mlxcx_load_props(mlxcx_t *mlxp) 526 { 527 mlxcx_drv_props_t *p = &mlxp->mlx_props; 528 529 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 530 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 531 MLXCX_EQ_SIZE_SHIFT_DFLT); 532 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 533 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 534 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 535 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 536 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 537 MLXCX_CQEMOD_COUNT_DFLT); 538 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 539 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 540 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 541 542 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 543 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 544 MLXCX_TX_NGROUPS_DFLT); 545 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 546 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 547 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 548 549 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 550 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 551 MLXCX_RX_NGROUPS_LARGE_DFLT); 552 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 553 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 554 MLXCX_RX_NGROUPS_SMALL_DFLT); 555 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 556 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 557 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 558 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 559 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 560 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 561 562 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 563 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 564 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 565 566 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 567 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 568 MLXCX_TX_BIND_THRESHOLD_DFLT); 569 570 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 571 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 572 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 573 574 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 575 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 576 
"eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 577 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 578 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 579 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 580 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 581 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 582 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 583 584 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 585 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 586 MLXCX_RX_PER_CQ_DEFAULT); 587 588 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 589 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 590 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 591 "out of range. Defaulting to: %d. Valid values are from " 592 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 593 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 594 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 595 } 596 } 597 598 void 599 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 600 { 601 va_list ap; 602 603 va_start(ap, fmt); 604 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 605 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 606 } else { 607 vcmn_err(CE_NOTE, fmt, ap); 608 } 609 va_end(ap); 610 } 611 612 void 613 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 614 { 615 va_list ap; 616 617 va_start(ap, fmt); 618 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 619 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 620 } else { 621 vcmn_err(CE_WARN, fmt, ap); 622 } 623 va_end(ap); 624 } 625 626 void 627 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 628 { 629 va_list ap; 630 631 va_start(ap, fmt); 632 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 633 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 634 } else { 635 vcmn_err(CE_PANIC, fmt, ap); 636 } 637 va_end(ap); 638 } 639 640 uint16_t 641 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 642 { 643 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 644 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 645 } 646 647 uint32_t 648 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 649 { 650 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 651 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 652 } 653 654 uint64_t 655 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 656 { 657 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 658 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 659 } 660 661 void 662 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 663 { 664 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 665 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 666 } 667 668 void 669 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 670 { 671 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 672 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 673 } 674 675 void 676 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 677 { 678 /* 679 * The UAR is always inside the first BAR, which we mapped as 680 * mlx_regs 681 */ 682 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 683 (uintptr_t)mlxp->mlx_regs_base; 684 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 685 } 686 687 void 688 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 689 { 690 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 691 (uintptr_t)mlxp->mlx_regs_base; 692 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 693 } 694 695 static void 696 mlxcx_fm_fini(mlxcx_t *mlxp) 697 { 698 if (mlxp->mlx_fm_caps == 0) 699 return; 700 701 if 
(DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 702 ddi_fm_handler_unregister(mlxp->mlx_dip); 703 704 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 705 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 706 pci_ereport_teardown(mlxp->mlx_dip); 707 708 ddi_fm_fini(mlxp->mlx_dip); 709 710 mlxp->mlx_fm_caps = 0; 711 } 712 713 void 714 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail) 715 { 716 uint64_t ena; 717 char buf[FM_MAX_CLASS]; 718 719 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 720 return; 721 722 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 723 ena = fm_ena_generate(0, FM_ENA_FMT1); 724 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 725 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 726 NULL); 727 } 728 729 static int 730 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 731 { 732 /* 733 * as the driver can always deal with an error in any dma or 734 * access handle, we can just return the fme_status value. 735 */ 736 pci_ereport_post(dip, err, NULL); 737 return (err->fme_status); 738 } 739 740 static void 741 mlxcx_fm_init(mlxcx_t *mlxp) 742 { 743 ddi_iblock_cookie_t iblk; 744 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 745 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 746 747 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 748 DDI_PROP_DONTPASS, "fm_capable", def); 749 750 if (mlxp->mlx_fm_caps < 0) { 751 mlxp->mlx_fm_caps = 0; 752 } 753 mlxp->mlx_fm_caps &= def; 754 755 if (mlxp->mlx_fm_caps == 0) 756 return; 757 758 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 759 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 760 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 761 pci_ereport_setup(mlxp->mlx_dip); 762 } 763 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 764 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 765 (void *)mlxp); 766 } 767 } 768 769 static void 770 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 771 { 772 mlxcx_buffer_t *buf; 773 774 mutex_enter(&s->mlbs_mtx); 775 776 while (!list_is_empty(&s->mlbs_busy)) 777 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 778 779 while (!list_is_empty(&s->mlbs_loaned)) 780 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 781 782 while ((buf = list_head(&s->mlbs_free)) != NULL) 783 mlxcx_buf_destroy(mlxp, buf); 784 785 list_destroy(&s->mlbs_free); 786 list_destroy(&s->mlbs_busy); 787 list_destroy(&s->mlbs_loaned); 788 mutex_exit(&s->mlbs_mtx); 789 790 cv_destroy(&s->mlbs_free_nonempty); 791 mutex_destroy(&s->mlbs_mtx); 792 } 793 794 static void 795 mlxcx_teardown_bufs(mlxcx_t *mlxp) 796 { 797 mlxcx_buf_shard_t *s; 798 799 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 800 mlxcx_mlbs_teardown(mlxp, s); 801 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 802 } 803 list_destroy(&mlxp->mlx_buf_shards); 804 805 kmem_cache_destroy(mlxp->mlx_bufs_cache); 806 } 807 808 static void 809 mlxcx_teardown_pages(mlxcx_t *mlxp) 810 { 811 uint_t nzeros = 0; 812 uint64_t *pas; 813 814 pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES, 815 KM_SLEEP); 816 817 mutex_enter(&mlxp->mlx_pagemtx); 818 819 while (mlxp->mlx_npages > 0) { 820 int32_t req, ret; 821 822 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 823 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 824 825 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 826 mlxcx_warn(mlxp, "hardware refused to return pages, " 827 "leaking %u remaining pages", mlxp->mlx_npages); 828 goto out; 829 } 830 831 for (int32_t i = 0; i < ret; i++) { 832 mlxcx_dev_page_t *mdp, probe; 833 bzero(&probe, sizeof (probe)); 
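/*
 * Look the returned physical address up in our AVL tree of pages
 * (keyed by mxdp_pa) so we can free the matching DMA allocation.
 */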
834 probe.mxdp_pa = pas[i]; 835 836 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 837 838 if (mdp != NULL) { 839 avl_remove(&mlxp->mlx_pages, mdp); 840 mlxp->mlx_npages--; 841 mlxcx_dma_free(&mdp->mxdp_dma); 842 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 843 } else { 844 mlxcx_panic(mlxp, "hardware returned a page " 845 "with PA 0x%" PRIx64 " but we have no " 846 "record of giving out such a page", pas[i]); 847 } 848 } 849 850 /* 851 * If no pages were returned, note that fact. 852 */ 853 if (ret == 0) { 854 nzeros++; 855 if (nzeros > mlxcx_reclaim_tries) { 856 mlxcx_warn(mlxp, "hardware refused to return " 857 "pages, leaking %u remaining pages", 858 mlxp->mlx_npages); 859 goto out; 860 } 861 delay(drv_usectohz(mlxcx_reclaim_delay)); 862 } 863 } 864 865 avl_destroy(&mlxp->mlx_pages); 866 867 out: 868 mutex_exit(&mlxp->mlx_pagemtx); 869 mutex_destroy(&mlxp->mlx_pagemtx); 870 871 kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES); 872 } 873 874 static boolean_t 875 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 876 { 877 ddi_device_acc_attr_t acc; 878 ddi_dma_attr_t attr; 879 boolean_t ret; 880 size_t sz, i; 881 882 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 883 884 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 885 mleq->mleq_nents = (1 << mleq->mleq_entshift); 886 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 887 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 888 889 mlxcx_dma_acc_attr(mlxp, &acc); 890 mlxcx_dma_queue_attr(mlxp, &attr); 891 892 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 893 B_TRUE, sz, B_TRUE); 894 if (!ret) { 895 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 896 return (B_FALSE); 897 } 898 899 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 900 901 for (i = 0; i < mleq->mleq_nents; ++i) 902 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 903 904 mleq->mleq_state |= MLXCX_EQ_ALLOC; 905 906 return (B_TRUE); 907 } 908 909 static void 910 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 911 { 912 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 913 if (mleq->mleq_state & MLXCX_EQ_CREATED) 914 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 915 916 mlxcx_dma_free(&mleq->mleq_dma); 917 mleq->mleq_ent = NULL; 918 919 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 920 } 921 922 void 923 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 924 { 925 mlxcx_flow_group_t *fg; 926 mlxcx_flow_entry_t *fe; 927 int i; 928 929 ASSERT(mutex_owned(&ft->mlft_mtx)); 930 931 for (i = ft->mlft_nents - 1; i >= 0; --i) { 932 fe = &ft->mlft_ent[i]; 933 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 934 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 935 mlxcx_panic(mlxp, "failed to delete flow " 936 "entry %u on table %u", i, 937 ft->mlft_num); 938 } 939 } 940 } 941 942 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 943 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 944 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 945 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 946 mlxcx_panic(mlxp, "failed to destroy flow " 947 "group %u", fg->mlfg_num); 948 } 949 } 950 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 951 } 952 list_destroy(&ft->mlft_groups); 953 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 954 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 955 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 956 mlxcx_panic(mlxp, "failed to destroy flow table %u", 957 ft->mlft_num); 958 } 959 } 960 kmem_free(ft->mlft_ent, ft->mlft_entsize); 961 ft->mlft_ent = NULL; 962 
mutex_exit(&ft->mlft_mtx); 963 mutex_destroy(&ft->mlft_mtx); 964 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 965 } 966 967 static void 968 mlxcx_teardown_ports(mlxcx_t *mlxp) 969 { 970 uint_t i; 971 mlxcx_port_t *p; 972 mlxcx_flow_table_t *ft; 973 974 for (i = 0; i < mlxp->mlx_nports; ++i) { 975 p = &mlxp->mlx_ports[i]; 976 if (!(p->mlp_init & MLXCX_PORT_INIT)) 977 continue; 978 mutex_enter(&p->mlp_mtx); 979 if ((ft = p->mlp_rx_flow) != NULL) { 980 mutex_enter(&ft->mlft_mtx); 981 /* 982 * teardown_flow_table() will destroy the mutex, so 983 * we don't release it here. 984 */ 985 mlxcx_teardown_flow_table(mlxp, ft); 986 } 987 mutex_exit(&p->mlp_mtx); 988 mutex_destroy(&p->mlp_mtx); 989 mutex_destroy(&p->mlx_port_event.mla_mtx); 990 p->mlx_port_event.mla_mlx = NULL; 991 p->mlx_port_event.mla_port = NULL; 992 p->mlp_init &= ~MLXCX_PORT_INIT; 993 } 994 995 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 996 mlxp->mlx_ports = NULL; 997 } 998 999 static void 1000 mlxcx_teardown_wqs(mlxcx_t *mlxp) 1001 { 1002 mlxcx_work_queue_t *mlwq; 1003 1004 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 1005 mlxcx_wq_teardown(mlxp, mlwq); 1006 } 1007 list_destroy(&mlxp->mlx_wqs); 1008 } 1009 1010 static void 1011 mlxcx_teardown_cqs(mlxcx_t *mlxp) 1012 { 1013 mlxcx_completion_queue_t *mlcq; 1014 1015 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 1016 mlxcx_cq_teardown(mlxp, mlcq); 1017 } 1018 list_destroy(&mlxp->mlx_cqs); 1019 } 1020 1021 static void 1022 mlxcx_teardown_eqs(mlxcx_t *mlxp) 1023 { 1024 mlxcx_event_queue_t *mleq; 1025 uint_t i; 1026 1027 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1028 mleq = &mlxp->mlx_eqs[i]; 1029 mutex_enter(&mleq->mleq_mtx); 1030 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1031 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1032 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1033 mlxcx_warn(mlxp, "failed to destroy " 1034 "event queue idx %u eqn %u", 1035 i, mleq->mleq_num); 1036 } 1037 } 1038 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1039 mlxcx_eq_rele_dma(mlxp, mleq); 1040 } 1041 mutex_exit(&mleq->mleq_mtx); 1042 } 1043 } 1044 1045 static void 1046 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1047 { 1048 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1049 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1050 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1051 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1052 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1053 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1054 } 1055 1056 static void 1057 mlxcx_teardown(mlxcx_t *mlxp) 1058 { 1059 uint_t i; 1060 dev_info_t *dip = mlxp->mlx_dip; 1061 1062 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1063 /* 1064 * Disable interrupts and let any active vectors quiesce. 
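 *
 * The remaining steps below run in roughly the reverse order of
 * attach, each gated on (and then clearing) its MLXCX_ATTACH_*
 * flag, so a partially-completed attach can be unwound safely.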
1065 */ 1066 mlxcx_intr_disable(mlxp); 1067 } 1068 1069 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1070 mlxcx_teardown_checktimers(mlxp); 1071 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1072 } 1073 1074 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1075 mlxcx_teardown_groups(mlxp); 1076 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1077 } 1078 1079 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1080 mlxcx_teardown_wqs(mlxp); 1081 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1082 } 1083 1084 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1085 mlxcx_teardown_cqs(mlxp); 1086 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1087 } 1088 1089 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1090 mlxcx_teardown_bufs(mlxp); 1091 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1092 } 1093 1094 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1095 mlxcx_teardown_ports(mlxp); 1096 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1097 } 1098 1099 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1100 mlxcx_teardown_eqs(mlxp); 1101 mlxcx_intr_teardown(mlxp); 1102 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1103 } 1104 1105 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1106 if (mlxp->mlx_uar.mlu_allocated) { 1107 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1108 mlxcx_warn(mlxp, "failed to release UAR"); 1109 } 1110 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1111 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1112 } 1113 if (mlxp->mlx_pd.mlpd_allocated && 1114 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1115 mlxcx_warn(mlxp, "failed to release PD"); 1116 } 1117 if (mlxp->mlx_tdom.mltd_allocated && 1118 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1119 mlxcx_warn(mlxp, "failed to release TDOM"); 1120 } 1121 mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD; 1122 } 1123 1124 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1125 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1126 mlxcx_warn(mlxp, "failed to send teardown HCA " 1127 "command during device detach"); 1128 } 1129 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1130 } 1131 1132 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1133 mlxcx_teardown_pages(mlxp); 1134 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1135 } 1136 1137 if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) { 1138 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 1139 mlxp->mlx_npages_req[i].mla_mlx = NULL; 1140 mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx); 1141 } 1142 taskq_destroy(mlxp->mlx_async_tq); 1143 mlxp->mlx_async_tq = NULL; 1144 mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ; 1145 } 1146 1147 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1148 if (!mlxcx_cmd_disable_hca(mlxp)) { 1149 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1150 "during device detach"); 1151 } 1152 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1153 } 1154 1155 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1156 mlxcx_cmd_queue_fini(mlxp); 1157 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1158 } 1159 1160 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1161 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1162 mlxp->mlx_caps = NULL; 1163 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1164 } 1165 1166 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1167 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1168 mlxp->mlx_regs_handle = NULL; 1169 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1170 } 1171 1172 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1173 pci_config_teardown(&mlxp->mlx_cfg_handle); 1174 mlxp->mlx_cfg_handle = NULL; 1175 mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1176 } 1177 1178 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1179 mlxcx_fm_fini(mlxp); 1180 mlxp->mlx_attach &= 
~MLXCX_ATTACH_FM; 1181 } 1182 1183 VERIFY3S(mlxp->mlx_attach, ==, 0); 1184 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1185 ddi_set_driver_private(dip, NULL); 1186 } 1187 1188 static boolean_t 1189 mlxcx_regs_map(mlxcx_t *mlxp) 1190 { 1191 off_t memsize; 1192 int ret; 1193 ddi_device_acc_attr_t da; 1194 1195 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1196 DDI_SUCCESS) { 1197 mlxcx_warn(mlxp, "failed to get register set size"); 1198 return (B_FALSE); 1199 } 1200 1201 /* 1202 * All data in the main BAR is kept in big-endian even though it's a PCI 1203 * device. 1204 */ 1205 bzero(&da, sizeof (ddi_device_acc_attr_t)); 1206 da.devacc_attr_version = DDI_DEVICE_ATTR_V0; 1207 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC; 1208 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC; 1209 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) { 1210 da.devacc_attr_access = DDI_FLAGERR_ACC; 1211 } else { 1212 da.devacc_attr_access = DDI_DEFAULT_ACC; 1213 } 1214 1215 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER, 1216 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle); 1217 1218 if (ret != DDI_SUCCESS) { 1219 mlxcx_warn(mlxp, "failed to map device registers: %d", ret); 1220 return (B_FALSE); 1221 } 1222 1223 return (B_TRUE); 1224 } 1225 1226 static boolean_t 1227 mlxcx_check_issi(mlxcx_t *mlxp) 1228 { 1229 uint32_t issi; 1230 1231 if (!mlxcx_cmd_query_issi(mlxp, &issi)) { 1232 mlxcx_warn(mlxp, "failed to get ISSI"); 1233 return (B_FALSE); 1234 } 1235 1236 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) { 1237 mlxcx_warn(mlxp, "hardware does not support software ISSI, " 1238 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI); 1239 return (B_FALSE); 1240 } 1241 1242 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) { 1243 mlxcx_warn(mlxp, "failed to set ISSI to %u", 1244 MLXCX_CURRENT_ISSI); 1245 return (B_FALSE); 1246 } 1247 1248 return (B_TRUE); 1249 } 1250 1251 boolean_t 1252 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven) 1253 { 1254 ddi_device_acc_attr_t acc; 1255 ddi_dma_attr_t attr; 1256 int32_t i; 1257 list_t plist; 1258 mlxcx_dev_page_t *mdp; 1259 mlxcx_dev_page_t **pages; 1260 const ddi_dma_cookie_t *ck; 1261 1262 /* 1263 * If there are no pages required, then we're done here. 1264 */ 1265 if (npages <= 0) { 1266 *ngiven = 0; 1267 return (B_TRUE); 1268 } 1269 1270 npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 1271 1272 pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP); 1273 1274 list_create(&plist, sizeof (mlxcx_dev_page_t), 1275 offsetof(mlxcx_dev_page_t, mxdp_list)); 1276 1277 for (i = 0; i < npages; i++) { 1278 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP); 1279 mlxcx_dma_acc_attr(mlxp, &acc); 1280 mlxcx_dma_page_attr(mlxp, &attr); 1281 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc, 1282 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) { 1283 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i, 1284 npages); 1285 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1286 goto cleanup_npages; 1287 } 1288 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma); 1289 mdp->mxdp_pa = ck->dmac_laddress; 1290 1291 list_insert_tail(&plist, mdp); 1292 } 1293 1294 /* 1295 * Now that all of the pages have been allocated, give them to hardware 1296 * in chunks.
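 * Note that npages was clamped to MLXCX_MANAGE_PAGES_MAX_PAGES above,
 * so a single MANAGE_PAGES command suffices here; callers such as
 * mlxcx_init_pages() below loop on *ngiven until the firmware has all
 * the pages it asked for.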
1297 */ 1298 for (i = 0; i < npages; i++) { 1299 pages[i] = list_remove_head(&plist); 1300 } 1301 1302 if (!mlxcx_cmd_give_pages(mlxp, 1303 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) { 1304 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1305 "pages!", npages); 1306 for (i = 0; i < npages; i++) { 1307 list_insert_tail(&plist, pages[i]); 1308 } 1309 goto cleanup_npages; 1310 } 1311 1312 mutex_enter(&mlxp->mlx_pagemtx); 1313 for (i = 0; i < npages; i++) { 1314 avl_add(&mlxp->mlx_pages, pages[i]); 1315 } 1316 mlxp->mlx_npages += npages; 1317 mutex_exit(&mlxp->mlx_pagemtx); 1318 1319 list_destroy(&plist); 1320 kmem_free(pages, sizeof (*pages) * npages); 1321 1322 *ngiven = npages; 1323 1324 return (B_TRUE); 1325 1326 cleanup_npages: 1327 kmem_free(pages, sizeof (*pages) * npages); 1328 while ((mdp = list_remove_head(&plist)) != NULL) { 1329 mlxcx_dma_free(&mdp->mxdp_dma); 1330 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1331 } 1332 list_destroy(&plist); 1333 return (B_FALSE); 1334 } 1335 1336 static boolean_t 1337 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1338 { 1339 int32_t npages, given; 1340 1341 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1342 mlxcx_warn(mlxp, "failed to determine boot pages"); 1343 return (B_FALSE); 1344 } 1345 1346 while (npages > 0) { 1347 if (!mlxcx_give_pages(mlxp, npages, &given)) 1348 return (B_FALSE); 1349 1350 npages -= given; 1351 } 1352 1353 return (B_TRUE); 1354 } 1355 1356 static int 1357 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1358 { 1359 mlxcx_t *mlxp = cookie; 1360 mlxcx_buffer_t *b = arg; 1361 1362 bzero(b, sizeof (mlxcx_buffer_t)); 1363 b->mlb_mlx = mlxp; 1364 b->mlb_state = MLXCX_BUFFER_INIT; 1365 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1366 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1367 1368 return (0); 1369 } 1370 1371 static void 1372 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1373 { 1374 mlxcx_t *mlxp = cookie; 1375 mlxcx_buffer_t *b = arg; 1376 VERIFY3P(b->mlb_mlx, ==, mlxp); 1377 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1378 list_destroy(&b->mlb_tx_chain); 1379 } 1380 1381 mlxcx_buf_shard_t * 1382 mlxcx_mlbs_create(mlxcx_t *mlxp) 1383 { 1384 mlxcx_buf_shard_t *s; 1385 1386 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1387 1388 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1389 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1390 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1391 offsetof(mlxcx_buffer_t, mlb_entry)); 1392 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1393 offsetof(mlxcx_buffer_t, mlb_entry)); 1394 list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t), 1395 offsetof(mlxcx_buffer_t, mlb_entry)); 1396 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1397 1398 list_insert_tail(&mlxp->mlx_buf_shards, s); 1399 1400 return (s); 1401 } 1402 1403 static boolean_t 1404 mlxcx_setup_bufs(mlxcx_t *mlxp) 1405 { 1406 char namebuf[KSTAT_STRLEN]; 1407 1408 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1409 ddi_get_instance(mlxp->mlx_dip)); 1410 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1411 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1412 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1413 NULL, mlxp, NULL, 0); 1414 1415 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1416 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1417 1418 return (B_TRUE); 1419 } 1420 1421 static void 1422 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1423 const char *state, uint8_t statenum) 1424 { 1425 uint64_t ena; 1426 char 
buf[FM_MAX_CLASS]; 1427 1428 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1429 return; 1430 1431 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1432 MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1433 ena = fm_ena_generate(0, FM_ENA_FMT1); 1434 1435 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1436 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1437 "state", DATA_TYPE_STRING, state, 1438 "state_num", DATA_TYPE_UINT8, statenum, 1439 "qtype", DATA_TYPE_STRING, qtype, 1440 "qnum", DATA_TYPE_UINT32, qnum, 1441 NULL); 1442 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1443 } 1444 1445 /* 1446 * The following set of routines are for monitoring the health of 1447 * event, completion and work queues. They run infrequently peeking at 1448 * the structs to catch stalls and inconsistent state. 1449 * 1450 * They peek at the structs *without* acquiring locks - we don't want 1451 * to impede flow of data. Driver start up and shutdown semantics 1452 * guarantee the structs are present and won't disappear underneath 1453 * these routines. 1454 * 1455 * As previously noted, the routines peek at active data in the structs and 1456 * they will store some values for comparison on next invocation. To 1457 * maintain integrity of the saved values, these values are only modified 1458 * within these routines. 1459 */ 1460 static void 1461 mlxcx_eq_check(void *arg) 1462 { 1463 mlxcx_t *mlxp = (mlxcx_t *)arg; 1464 mlxcx_event_queue_t *eq; 1465 mlxcx_eventq_ctx_t ctx; 1466 const char *str; 1467 1468 uint_t i; 1469 1470 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1471 eq = &mlxp->mlx_eqs[i]; 1472 1473 if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0) 1474 continue; 1475 1476 /* 1477 * If the event queue was successfully created in the HCA, 1478 * then initialization and shutdown sequences guarantee 1479 * the queue exists. 1480 */ 1481 ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED); 1482 1483 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) 1484 continue; 1485 1486 str = "???"; 1487 switch (ctx.mleqc_status) { 1488 case MLXCX_EQ_STATUS_OK: 1489 break; 1490 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1491 str = "WRITE_FAILURE"; 1492 break; 1493 } 1494 1495 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1496 mlxcx_fm_qstate_ereport(mlxp, "event", 1497 eq->mleq_num, str, ctx.mleqc_status); 1498 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1499 eq->mleq_intr_index, ctx.mleqc_status, str); 1500 } 1501 1502 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1503 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1504 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1505 ++eq->mleq_check_disarm_cnt >= 3) { 1506 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1507 mlxcx_warn(mlxp, "EQ %u isn't armed", 1508 eq->mleq_intr_index); 1509 } 1510 eq->mleq_check_disarm_cc = eq->mleq_cc; 1511 } else { 1512 eq->mleq_check_disarm_cc = 0; 1513 eq->mleq_check_disarm_cnt = 0; 1514 } 1515 } 1516 } 1517 1518 static void 1519 mlxcx_cq_check(void *arg) 1520 { 1521 mlxcx_t *mlxp = (mlxcx_t *)arg; 1522 mlxcx_completion_queue_t *cq; 1523 mlxcx_completionq_ctx_t ctx; 1524 const char *str, *type; 1525 uint_t v; 1526 1527 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1528 cq = list_next(&mlxp->mlx_cqs, cq)) { 1529 1530 if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0) 1531 continue; 1532 1533 /* 1534 * If the completion queue was successfully created in the HCA, 1535 * then initialization and shutdown sequences guarantee 1536 * the queue exists. 
1537 */ 1538 ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED); 1539 ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN); 1540 1541 if (cq->mlcq_fm_repd_qstate) 1542 continue; 1543 1544 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) 1545 continue; 1546 1547 if (cq->mlcq_wq != NULL) { 1548 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1549 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1550 type = "rx "; 1551 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1552 type = "tx "; 1553 else 1554 type = ""; 1555 } else { 1556 type = ""; 1557 } 1558 1559 str = "???"; 1560 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1561 switch (v) { 1562 case MLXCX_CQC_STATUS_OK: 1563 break; 1564 case MLXCX_CQC_STATUS_OVERFLOW: 1565 str = "OVERFLOW"; 1566 break; 1567 case MLXCX_CQC_STATUS_WRITE_FAIL: 1568 str = "WRITE_FAIL"; 1569 break; 1570 case MLXCX_CQC_STATUS_INVALID: 1571 str = "INVALID"; 1572 break; 1573 } 1574 1575 if (v != MLXCX_CQC_STATUS_OK) { 1576 mlxcx_fm_qstate_ereport(mlxp, "completion", 1577 cq->mlcq_num, str, v); 1578 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1579 type, cq->mlcq_num, v, str); 1580 cq->mlcq_fm_repd_qstate = B_TRUE; 1581 } 1582 1583 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1584 if (v != MLXCX_CQC_STATE_ARMED && 1585 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1586 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1587 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1588 ++cq->mlcq_check_disarm_cnt >= 3) { 1589 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1590 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1591 type, cq->mlcq_num, cq); 1592 } 1593 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1594 } else { 1595 cq->mlcq_check_disarm_cnt = 0; 1596 cq->mlcq_check_disarm_cc = 0; 1597 } 1598 } 1599 } 1600 1601 void 1602 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1603 { 1604 mlxcx_sq_ctx_t ctx; 1605 mlxcx_sq_state_t state; 1606 1607 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1608 return; 1609 1610 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1611 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1612 switch (state) { 1613 case MLXCX_SQ_STATE_RST: 1614 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1615 mlxcx_fm_qstate_ereport(mlxp, "send", 1616 sq->mlwq_num, "RST", state); 1617 sq->mlwq_fm_repd_qstate = B_TRUE; 1618 } 1619 break; 1620 case MLXCX_SQ_STATE_RDY: 1621 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1622 mlxcx_fm_qstate_ereport(mlxp, "send", 1623 sq->mlwq_num, "RDY", state); 1624 sq->mlwq_fm_repd_qstate = B_TRUE; 1625 } 1626 break; 1627 case MLXCX_SQ_STATE_ERR: 1628 mlxcx_fm_qstate_ereport(mlxp, "send", 1629 sq->mlwq_num, "ERR", state); 1630 sq->mlwq_fm_repd_qstate = B_TRUE; 1631 break; 1632 default: 1633 mlxcx_fm_qstate_ereport(mlxp, "send", 1634 sq->mlwq_num, "???", state); 1635 sq->mlwq_fm_repd_qstate = B_TRUE; 1636 break; 1637 } 1638 } 1639 1640 void 1641 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1642 { 1643 mlxcx_rq_ctx_t ctx; 1644 mlxcx_rq_state_t state; 1645 1646 1647 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1648 return; 1649 1650 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1651 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1652 switch (state) { 1653 case MLXCX_RQ_STATE_RST: 1654 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1655 mlxcx_fm_qstate_ereport(mlxp, "receive", 1656 rq->mlwq_num, "RST", state); 1657 rq->mlwq_fm_repd_qstate = B_TRUE; 1658 } 1659 break; 1660 case MLXCX_RQ_STATE_RDY: 1661 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1662 mlxcx_fm_qstate_ereport(mlxp, "receive", 1663 rq->mlwq_num, "RDY", state); 1664 
rq->mlwq_fm_repd_qstate = B_TRUE; 1665 } 1666 break; 1667 case MLXCX_RQ_STATE_ERR: 1668 mlxcx_fm_qstate_ereport(mlxp, "receive", 1669 rq->mlwq_num, "ERR", state); 1670 rq->mlwq_fm_repd_qstate = B_TRUE; 1671 break; 1672 default: 1673 mlxcx_fm_qstate_ereport(mlxp, "receive", 1674 rq->mlwq_num, "???", state); 1675 rq->mlwq_fm_repd_qstate = B_TRUE; 1676 break; 1677 } 1678 } 1679 1680 static void 1681 mlxcx_wq_check(void *arg) 1682 { 1683 mlxcx_t *mlxp = (mlxcx_t *)arg; 1684 mlxcx_work_queue_t *wq; 1685 1686 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1687 wq = list_next(&mlxp->mlx_wqs, wq)) { 1688 1689 if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0) 1690 continue; 1691 1692 /* 1693 * If the work queue was successfully created in the HCA, 1694 * then initialization and shutdown sequences guarantee 1695 * the queue exists. 1696 */ 1697 ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED); 1698 ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN); 1699 1700 if (wq->mlwq_fm_repd_qstate) 1701 continue; 1702 1703 switch (wq->mlwq_type) { 1704 case MLXCX_WQ_TYPE_SENDQ: 1705 mlxcx_check_sq(mlxp, wq); 1706 break; 1707 case MLXCX_WQ_TYPE_RECVQ: 1708 mlxcx_check_rq(mlxp, wq); 1709 break; 1710 } 1711 } 1712 } 1713 1714 static boolean_t 1715 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1716 { 1717 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1718 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1719 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1720 DDI_IPL_0); 1721 } 1722 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1723 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1724 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1725 DDI_IPL_0); 1726 } 1727 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1728 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1729 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1730 DDI_IPL_0); 1731 } 1732 return (B_TRUE); 1733 } 1734 1735 int 1736 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1737 { 1738 const mlxcx_flow_entry_t *left = arg0; 1739 const mlxcx_flow_entry_t *right = arg1; 1740 int bcmpr; 1741 1742 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1743 sizeof (left->mlfe_dmac)); 1744 if (bcmpr < 0) 1745 return (-1); 1746 if (bcmpr > 0) 1747 return (1); 1748 if (left->mlfe_vid < right->mlfe_vid) 1749 return (-1); 1750 if (left->mlfe_vid > right->mlfe_vid) 1751 return (1); 1752 return (0); 1753 } 1754 1755 int 1756 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1757 { 1758 const mlxcx_group_mac_t *left = arg0; 1759 const mlxcx_group_mac_t *right = arg1; 1760 int bcmpr; 1761 1762 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1763 sizeof (left->mlgm_mac)); 1764 if (bcmpr < 0) 1765 return (-1); 1766 if (bcmpr > 0) 1767 return (1); 1768 return (0); 1769 } 1770 1771 int 1772 mlxcx_page_compare(const void *arg0, const void *arg1) 1773 { 1774 const mlxcx_dev_page_t *p0 = arg0; 1775 const mlxcx_dev_page_t *p1 = arg1; 1776 1777 if (p0->mxdp_pa < p1->mxdp_pa) 1778 return (-1); 1779 if (p0->mxdp_pa > p1->mxdp_pa) 1780 return (1); 1781 return (0); 1782 } 1783 1784 static boolean_t 1785 mlxcx_setup_ports(mlxcx_t *mlxp) 1786 { 1787 uint_t i, j; 1788 mlxcx_port_t *p; 1789 mlxcx_flow_table_t *ft; 1790 mlxcx_flow_group_t *fg; 1791 mlxcx_flow_entry_t *fe; 1792 1793 VERIFY3U(mlxp->mlx_nports, >, 0); 1794 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1795 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1796 1797 for (i = 0; i < mlxp->mlx_nports; ++i) { 1798 p = 
&mlxp->mlx_ports[i]; 1799 p->mlp_num = i; 1800 p->mlx_port_event.mla_mlx = mlxp; 1801 p->mlx_port_event.mla_port = p; 1802 mutex_init(&p->mlx_port_event.mla_mtx, NULL, 1803 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1804 p->mlp_init |= MLXCX_PORT_INIT; 1805 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1806 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1807 mutex_enter(&p->mlp_mtx); 1808 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1809 mutex_exit(&p->mlp_mtx); 1810 goto err; 1811 } 1812 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1813 mutex_exit(&p->mlp_mtx); 1814 goto err; 1815 } 1816 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1817 mutex_exit(&p->mlp_mtx); 1818 goto err; 1819 } 1820 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1821 mutex_exit(&p->mlp_mtx); 1822 goto err; 1823 } 1824 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1825 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1826 mutex_exit(&p->mlp_mtx); 1827 goto err; 1828 } 1829 if (!mlxcx_cmd_query_port_fec(mlxp, p)) { 1830 mutex_exit(&p->mlp_mtx); 1831 goto err; 1832 } 1833 p->mlp_fec_requested = LINK_FEC_AUTO; 1834 1835 mutex_exit(&p->mlp_mtx); 1836 } 1837 1838 for (i = 0; i < mlxp->mlx_nports; ++i) { 1839 p = &mlxp->mlx_ports[i]; 1840 mutex_enter(&p->mlp_mtx); 1841 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1842 KM_SLEEP)); 1843 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1844 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1845 1846 mutex_enter(&ft->mlft_mtx); 1847 1848 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1849 ft->mlft_port = p; 1850 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1851 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1852 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1853 ft->mlft_nents = (1 << ft->mlft_entshift); 1854 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1855 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1856 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1857 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1858 1859 for (j = 0; j < ft->mlft_nents; ++j) { 1860 ft->mlft_ent[j].mlfe_table = ft; 1861 ft->mlft_ent[j].mlfe_index = j; 1862 } 1863 1864 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1865 mutex_exit(&ft->mlft_mtx); 1866 mutex_exit(&p->mlp_mtx); 1867 goto err; 1868 } 1869 1870 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1871 mutex_exit(&ft->mlft_mtx); 1872 mutex_exit(&p->mlp_mtx); 1873 goto err; 1874 } 1875 1876 /* 1877 * We match broadcast at the top of the root flow table, then 1878 * all multicast/unicast MACs, then the promisc entry is down 1879 * the very bottom. 1880 * 1881 * This way when promisc is on, that entry simply catches any 1882 * remaining traffic that earlier flows haven't matched. 
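		 *
		 * As a purely illustrative example (the real size comes from
		 * the mldp_ftbl_root_size_shift property, capped by the
		 * hardware limit above), a root flow table with 256 entries
		 * ends up carved into the groups created below as:
		 *
		 *    entry 0         broadcast group (1 entry, DMAC ff:..:ff)
		 *    entries 1-254   unicast/multicast group (nents - 2)
		 *    entry 255       promisc group (1 entry, empty match
		 *                    mask, so it matches everything)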
1883 */ 1884 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1885 list_insert_tail(&ft->mlft_groups, fg); 1886 fg->mlfg_table = ft; 1887 fg->mlfg_size = 1; 1888 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1889 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1890 mutex_exit(&ft->mlft_mtx); 1891 mutex_exit(&p->mlp_mtx); 1892 goto err; 1893 } 1894 p->mlp_bcast = fg; 1895 fe = list_head(&fg->mlfg_entries); 1896 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1897 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1898 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1899 1900 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1901 list_insert_tail(&ft->mlft_groups, fg); 1902 fg->mlfg_table = ft; 1903 fg->mlfg_size = ft->mlft_nents - 2; 1904 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1905 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1906 mutex_exit(&ft->mlft_mtx); 1907 mutex_exit(&p->mlp_mtx); 1908 goto err; 1909 } 1910 p->mlp_umcast = fg; 1911 1912 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1913 list_insert_tail(&ft->mlft_groups, fg); 1914 fg->mlfg_table = ft; 1915 fg->mlfg_size = 1; 1916 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1917 mutex_exit(&ft->mlft_mtx); 1918 mutex_exit(&p->mlp_mtx); 1919 goto err; 1920 } 1921 p->mlp_promisc = fg; 1922 fe = list_head(&fg->mlfg_entries); 1923 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1924 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1925 1926 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1927 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1928 mlfe_dmac_entry)); 1929 1930 mutex_exit(&ft->mlft_mtx); 1931 mutex_exit(&p->mlp_mtx); 1932 } 1933 1934 return (B_TRUE); 1935 1936 err: 1937 mlxcx_teardown_ports(mlxp); 1938 return (B_FALSE); 1939 } 1940 1941 void 1942 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1943 { 1944 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1945 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1946 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1947 mlxcx_flow_entry_t *fe; 1948 mlxcx_group_vlan_t *v; 1949 1950 ASSERT(mutex_owned(&g->mlg_mtx)); 1951 1952 mutex_enter(&ft->mlft_mtx); 1953 1954 if (!list_is_empty(&g->mlg_rx_vlans)) { 1955 fe = list_head(&dfg->mlfg_entries); 1956 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 1957 } 1958 1959 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 1960 fe = v->mlgv_fe; 1961 ASSERT3P(fe->mlfe_table, ==, ft); 1962 ASSERT3P(fe->mlfe_group, ==, fg); 1963 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1964 1965 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1966 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1967 } 1968 1969 mutex_exit(&ft->mlft_mtx); 1970 } 1971 1972 boolean_t 1973 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1974 boolean_t tagged, uint16_t vid) 1975 { 1976 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1977 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1978 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1979 mlxcx_flow_entry_t *fe; 1980 mlxcx_group_vlan_t *v; 1981 boolean_t found = B_FALSE; 1982 1983 ASSERT(mutex_owned(&g->mlg_mtx)); 1984 1985 mutex_enter(&ft->mlft_mtx); 1986 1987 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1988 v = list_next(&g->mlg_rx_vlans, v)) { 1989 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1990 found = B_TRUE; 1991 break; 1992 } 1993 } 1994 if (!found) { 1995 mutex_exit(&ft->mlft_mtx); 1996 return (B_FALSE); 1997 } 1998 1999 list_remove(&g->mlg_rx_vlans, v); 2000 2001 /* 2002 * If this is the last VLAN entry, we have to go back to accepting 2003 * any VLAN (which 
means re-enabling the default entry). 2004 * 2005 * Do this before we remove the flow entry for the last specific 2006 * VLAN so that we don't lose any traffic in the transition. 2007 */ 2008 if (list_is_empty(&g->mlg_rx_vlans)) { 2009 fe = list_head(&dfg->mlfg_entries); 2010 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2011 list_insert_tail(&g->mlg_rx_vlans, v); 2012 mutex_exit(&ft->mlft_mtx); 2013 return (B_FALSE); 2014 } 2015 } 2016 2017 fe = v->mlgv_fe; 2018 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 2019 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 2020 ASSERT3P(fe->mlfe_table, ==, ft); 2021 ASSERT3P(fe->mlfe_group, ==, fg); 2022 2023 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 2024 list_insert_tail(&g->mlg_rx_vlans, v); 2025 fe = list_head(&dfg->mlfg_entries); 2026 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 2027 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2028 } 2029 mutex_exit(&ft->mlft_mtx); 2030 return (B_FALSE); 2031 } 2032 2033 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2034 2035 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2036 2037 mutex_exit(&ft->mlft_mtx); 2038 return (B_TRUE); 2039 } 2040 2041 boolean_t 2042 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 2043 uint16_t vid) 2044 { 2045 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 2046 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 2047 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 2048 mlxcx_flow_entry_t *fe; 2049 mlxcx_group_vlan_t *v; 2050 boolean_t found = B_FALSE; 2051 boolean_t first = B_FALSE; 2052 2053 ASSERT(mutex_owned(&g->mlg_mtx)); 2054 2055 mutex_enter(&ft->mlft_mtx); 2056 2057 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 2058 v = list_next(&g->mlg_rx_vlans, v)) { 2059 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 2060 mutex_exit(&ft->mlft_mtx); 2061 return (B_TRUE); 2062 } 2063 } 2064 if (list_is_empty(&g->mlg_rx_vlans)) 2065 first = B_TRUE; 2066 2067 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2068 fe = list_next(&fg->mlfg_entries, fe)) { 2069 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2070 found = B_TRUE; 2071 break; 2072 } 2073 } 2074 if (!found) { 2075 mutex_exit(&ft->mlft_mtx); 2076 return (B_FALSE); 2077 } 2078 2079 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 2080 v->mlgv_fe = fe; 2081 v->mlgv_tagged = tagged; 2082 v->mlgv_vid = vid; 2083 2084 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2085 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2086 fe->mlfe_vid = vid; 2087 if (tagged) { 2088 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2089 } else { 2090 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2091 } 2092 2093 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2094 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2095 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2096 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2097 mutex_exit(&ft->mlft_mtx); 2098 return (B_FALSE); 2099 } 2100 2101 list_insert_tail(&g->mlg_rx_vlans, v); 2102 2103 /* 2104 * If the vlan list was empty for this group before adding this one, 2105 * then we no longer want the "default" entry to allow all VLANs 2106 * through. 
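	 *
	 * Note that the ordering mirrors mlxcx_remove_vlan_entry() above:
	 * the flow entry for the specific VLAN has already been installed
	 * at this point, so removing the default entry here should not
	 * open a window where traffic for that VLAN is dropped.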
2107 */ 2108 if (first) { 2109 fe = list_head(&dfg->mlfg_entries); 2110 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2111 } 2112 2113 mutex_exit(&ft->mlft_mtx); 2114 return (B_TRUE); 2115 } 2116 2117 void 2118 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2119 mlxcx_ring_group_t *group) 2120 { 2121 mlxcx_flow_entry_t *fe; 2122 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2123 mlxcx_group_mac_t *gm, *ngm; 2124 2125 ASSERT(mutex_owned(&port->mlp_mtx)); 2126 ASSERT(mutex_owned(&group->mlg_mtx)); 2127 2128 mutex_enter(&ft->mlft_mtx); 2129 2130 gm = avl_first(&group->mlg_rx_macs); 2131 for (; gm != NULL; gm = ngm) { 2132 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2133 2134 ASSERT3P(gm->mlgm_group, ==, group); 2135 fe = gm->mlgm_fe; 2136 ASSERT3P(fe->mlfe_table, ==, ft); 2137 2138 avl_remove(&group->mlg_rx_macs, gm); 2139 list_remove(&fe->mlfe_ring_groups, gm); 2140 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2141 2142 fe->mlfe_ndest = 0; 2143 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2144 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2145 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2146 gm->mlgm_group->mlg_rx_vlan_ft; 2147 } 2148 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2149 2150 if (fe->mlfe_ndest > 0) { 2151 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2152 continue; 2153 } 2154 2155 /* 2156 * There are no more ring groups left for this MAC (it wasn't 2157 * attached to any other groups since ndest == 0), so clean up 2158 * its flow entry. 2159 */ 2160 avl_remove(&port->mlp_dmac_fe, fe); 2161 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2162 list_destroy(&fe->mlfe_ring_groups); 2163 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2164 } 2165 2166 mutex_exit(&ft->mlft_mtx); 2167 } 2168 2169 boolean_t 2170 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2171 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2172 { 2173 mlxcx_flow_entry_t *fe; 2174 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2175 mlxcx_group_mac_t *gm, probe; 2176 2177 ASSERT(mutex_owned(&port->mlp_mtx)); 2178 ASSERT(mutex_owned(&group->mlg_mtx)); 2179 2180 bzero(&probe, sizeof (probe)); 2181 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2182 2183 mutex_enter(&ft->mlft_mtx); 2184 2185 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2186 if (gm == NULL) { 2187 mutex_exit(&ft->mlft_mtx); 2188 return (B_FALSE); 2189 } 2190 ASSERT3P(gm->mlgm_group, ==, group); 2191 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2192 2193 fe = gm->mlgm_fe; 2194 ASSERT3P(fe->mlfe_table, ==, ft); 2195 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2196 2197 list_remove(&fe->mlfe_ring_groups, gm); 2198 avl_remove(&group->mlg_rx_macs, gm); 2199 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2200 2201 fe->mlfe_ndest = 0; 2202 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2203 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2204 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2205 gm->mlgm_group->mlg_rx_vlan_ft; 2206 } 2207 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2208 2209 if (fe->mlfe_ndest > 0) { 2210 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2211 mutex_exit(&ft->mlft_mtx); 2212 return (B_FALSE); 2213 } 2214 mutex_exit(&ft->mlft_mtx); 2215 return (B_TRUE); 2216 } 2217 2218 /* 2219 * There are no more ring groups left for this MAC (it wasn't attached 2220 * to any other groups since ndest == 0), so clean up its flow entry. 
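	 *
	 * Clearing MLXCX_FLOW_ENTRY_RESERVED below also returns the entry
	 * to the port's unicast/multicast flow group, where
	 * mlxcx_add_umcast_entry() can pick it up again for a new address.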
2221 */ 2222 avl_remove(&port->mlp_dmac_fe, fe); 2223 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2224 list_destroy(&fe->mlfe_ring_groups); 2225 2226 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2227 2228 mutex_exit(&ft->mlft_mtx); 2229 2230 return (B_TRUE); 2231 } 2232 2233 boolean_t 2234 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2235 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2236 { 2237 mlxcx_flow_group_t *fg; 2238 mlxcx_flow_entry_t *fe, probe; 2239 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2240 mlxcx_group_mac_t *gm; 2241 boolean_t found = B_FALSE; 2242 2243 ASSERT(mutex_owned(&port->mlp_mtx)); 2244 ASSERT(mutex_owned(&group->mlg_mtx)); 2245 2246 bzero(&probe, sizeof (probe)); 2247 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2248 2249 mutex_enter(&ft->mlft_mtx); 2250 2251 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2252 2253 if (fe == NULL) { 2254 fg = port->mlp_umcast; 2255 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2256 fe = list_next(&fg->mlfg_entries, fe)) { 2257 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2258 found = B_TRUE; 2259 break; 2260 } 2261 } 2262 if (!found) { 2263 mutex_exit(&ft->mlft_mtx); 2264 return (B_FALSE); 2265 } 2266 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2267 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2268 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2269 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2270 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2271 2272 avl_add(&port->mlp_dmac_fe, fe); 2273 } 2274 2275 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2276 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2277 2278 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2279 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2280 if (--fe->mlfe_ndest == 0) { 2281 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2282 } 2283 mutex_exit(&ft->mlft_mtx); 2284 return (B_FALSE); 2285 } 2286 2287 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2288 gm->mlgm_group = group; 2289 gm->mlgm_fe = fe; 2290 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2291 avl_add(&group->mlg_rx_macs, gm); 2292 list_insert_tail(&fe->mlfe_ring_groups, gm); 2293 2294 mutex_exit(&ft->mlft_mtx); 2295 2296 return (B_TRUE); 2297 } 2298 2299 boolean_t 2300 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2301 mlxcx_flow_group_t *fg) 2302 { 2303 mlxcx_flow_entry_t *fe; 2304 uint_t i, idx; 2305 2306 ASSERT(mutex_owned(&ft->mlft_mtx)); 2307 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2308 ASSERT3P(fg->mlfg_table, ==, ft); 2309 2310 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2311 return (B_FALSE); 2312 fg->mlfg_start_idx = ft->mlft_next_ent; 2313 2314 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2315 return (B_FALSE); 2316 } 2317 2318 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2319 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2320 for (i = 0; i < fg->mlfg_size; ++i) { 2321 idx = fg->mlfg_start_idx + i; 2322 fe = &ft->mlft_ent[idx]; 2323 fe->mlfe_group = fg; 2324 list_insert_tail(&fg->mlfg_entries, fe); 2325 } 2326 fg->mlfg_avail = fg->mlfg_size; 2327 ft->mlft_next_ent += fg->mlfg_size; 2328 2329 return (B_TRUE); 2330 } 2331 2332 static boolean_t 2333 mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events) 2334 { 2335 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec]; 2336 2337 mutex_enter(&mleq->mleq_mtx); 2338 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2339 /* mlxcx_teardown_eqs() will clean this up */ 2340 
mutex_exit(&mleq->mleq_mtx); 2341 return (B_FALSE); 2342 } 2343 mleq->mleq_mlx = mlxp; 2344 mleq->mleq_uar = &mlxp->mlx_uar; 2345 mleq->mleq_events = events; 2346 mleq->mleq_intr_index = vec; 2347 2348 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2349 /* mlxcx_teardown_eqs() will clean this up */ 2350 mutex_exit(&mleq->mleq_mtx); 2351 return (B_FALSE); 2352 } 2353 2354 if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) { 2355 /* 2356 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2357 * eq_rele_dma 2358 */ 2359 mutex_exit(&mleq->mleq_mtx); 2360 return (B_FALSE); 2361 } 2362 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2363 mlxcx_arm_eq(mlxp, mleq); 2364 mutex_exit(&mleq->mleq_mtx); 2365 2366 return (B_TRUE); 2367 } 2368 2369 static boolean_t 2370 mlxcx_setup_async_eqs(mlxcx_t *mlxp) 2371 { 2372 boolean_t ret; 2373 2374 ret = mlxcx_setup_eq(mlxp, 0, 2375 (1ULL << MLXCX_EVENT_CMD_COMPLETION) | 2376 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2377 (1ULL << MLXCX_EVENT_PORT_STATE) | 2378 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2379 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2380 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2381 (1ULL << MLXCX_EVENT_LAST_WQE) | 2382 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2383 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2384 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2385 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2386 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2387 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2388 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST)); 2389 2390 if (ret) 2391 mlxcx_cmd_eq_enable(mlxp); 2392 2393 return (ret); 2394 } 2395 2396 int 2397 mlxcx_cq_compare(const void *arg0, const void *arg1) 2398 { 2399 const mlxcx_completion_queue_t *left = arg0; 2400 const mlxcx_completion_queue_t *right = arg1; 2401 2402 if (left->mlcq_num < right->mlcq_num) { 2403 return (-1); 2404 } 2405 if (left->mlcq_num > right->mlcq_num) { 2406 return (1); 2407 } 2408 return (0); 2409 } 2410 2411 static boolean_t 2412 mlxcx_setup_eqs(mlxcx_t *mlxp) 2413 { 2414 uint_t i; 2415 mlxcx_event_queue_t *mleq; 2416 2417 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2418 2419 for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) { 2420 mleq = &mlxp->mlx_eqs[i]; 2421 mutex_enter(&mleq->mleq_mtx); 2422 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2423 mutex_exit(&mleq->mleq_mtx); 2424 return (B_FALSE); 2425 } 2426 mleq->mleq_uar = &mlxp->mlx_uar; 2427 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2428 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2429 mutex_exit(&mleq->mleq_mtx); 2430 return (B_FALSE); 2431 } 2432 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2433 !mlxcx_cmd_set_int_mod(mlxp, i, 2434 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2435 mutex_exit(&mleq->mleq_mtx); 2436 return (B_FALSE); 2437 } 2438 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2439 mutex_exit(&mleq->mleq_mtx); 2440 return (B_FALSE); 2441 } 2442 mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED; 2443 mlxcx_arm_eq(mlxp, mleq); 2444 mutex_exit(&mleq->mleq_mtx); 2445 } 2446 2447 mlxp->mlx_next_eq = mlxp->mlx_intr_cq0; 2448 2449 return (B_TRUE); 2450 } 2451 2452 /* 2453 * Snapshot all of the hardware capabilities that we care about and then modify 2454 * the HCA capabilities to get things moving. 
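 *
 * Most of the limits in these capability structures are log2-encoded.
 * As a purely illustrative example, a MAX_LSO_CAP field of 17 would mean
 * the device accepts LSO sends of up to 1 << 17 bytes (128KB), while a
 * value of 0 collapses to 1 below and is treated as "no LSO support".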
2455 */ 2456 static boolean_t 2457 mlxcx_init_caps(mlxcx_t *mlxp) 2458 { 2459 mlxcx_caps_t *c; 2460 2461 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP); 2462 2463 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2464 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) { 2465 mlxcx_warn(mlxp, "failed to obtain current HCA general caps"); 2466 } 2467 2468 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2469 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) { 2470 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps"); 2471 } 2472 2473 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2474 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) { 2475 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps"); 2476 } 2477 2478 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2479 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) { 2480 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps"); 2481 } 2482 2483 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2484 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) { 2485 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps"); 2486 } 2487 2488 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2489 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) { 2490 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps"); 2491 } 2492 2493 /* 2494 * Check the caps meet our requirements. 2495 */ 2496 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2497 2498 if (gen->mlcap_general_log_pg_sz != 12) { 2499 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2500 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2501 goto err; 2502 } 2503 if (gen->mlcap_general_cqe_version != 1) { 2504 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2505 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2506 goto err; 2507 } 2508 if (gen->mlcap_general_port_type != 2509 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2510 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2511 goto err; 2512 } 2513 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2514 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2515 2516 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2517 2518 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2519 MLXCX_ETH_CAP_CSUM_CAP); 2520 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2521 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2522 2523 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2524 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2525 if (c->mlc_max_lso_size == 1) { 2526 c->mlc_max_lso_size = 0; 2527 c->mlc_lso = B_FALSE; 2528 } else { 2529 c->mlc_lso = B_TRUE; 2530 } 2531 2532 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2533 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2534 2535 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2536 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2537 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2538 goto err; 2539 } 2540 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2541 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2542 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2543 "flow table entries"); 2544 goto err; 2545 } 2546 2547 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2548 mlcap_flow_prop_log_max_ft_size; 2549 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);

	return (B_TRUE);

err:
	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
	return (B_FALSE);
}

static int
mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	mlxcx_t *mlxp;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	mlxp = ddi_get_driver_private(dip);
	if (mlxp == NULL) {
		mlxcx_warn(NULL, "asked to detach, but missing instance "
		    "private data");
		return (DDI_FAILURE);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
	}

	mlxcx_teardown(mlxp);
	return (DDI_SUCCESS);
}

static size_t
mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
{
	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
	    mlxp->mlx_props.mldp_rx_ngroups_small;
	size_t tirlim, flowlim, gflowlim;

	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
	if (tirlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on number of TIRs available", tirlim);
		ngroups = tirlim;
	}

	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max size of RX flow tables", flowlim);
		ngroups = flowlim;
	}

	/*
	 * Restrict the number of groups so that we do not exceed the
	 * maximum number of flow tables reported in the device's
	 * capabilities. There is one root table entry per port and 2
	 * entries per group.
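	 *
	 * As a purely illustrative example, with 2 ports and a hypothetical
	 * cap of 64 RX flow tables, this clamps us to (64 - 2) / 2 = 31
	 * groups. The loop that follows then backs the group count off
	 * further so that the flow entry budget it assumes (16 per group,
	 * plus some slack) also stays within the device's total RX flow
	 * limit.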
	 */
	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
	if (flowlim < ngroups) {
		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
		    "on max number of RX flow tables",
		    flowlim);
		ngroups = flowlim;
	}

	do {
		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
		if (gflowlim < ngroups) {
			mlxcx_note(mlxp, "limiting number of rx groups to %u "
			    "based on max total RX flows", gflowlim);
			--ngroups;
		}
	} while (gflowlim < ngroups);

	return (ngroups);
}

static int
mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	mlxcx_t *mlxp;
	char tq_name[TASKQ_NAMELEN];
	uint_t i;
	int inst, ret;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	inst = ddi_get_instance(dip);
	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
	if (ret != 0)
		return (ret);

	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
	if (mlxp == NULL)
		return (DDI_FAILURE);
	mlxp->mlx_dip = dip;
	mlxp->mlx_inst = inst;
	ddi_set_driver_private(dip, mlxp);

	mlxcx_load_props(mlxp);

	mlxcx_fm_init(mlxp);
	mlxp->mlx_attach |= MLXCX_ATTACH_FM;

	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to initialize PCI config space");
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;

	if (!mlxcx_regs_map(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;

	if (!mlxcx_cmd_queue_init(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;

	if (!mlxcx_cmd_enable_hca(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;

	if (!mlxcx_check_issi(mlxp)) {
		goto err;
	}

	/*
	 * We have to get our interrupts now so we know what priority to
	 * create pagemtx with.
	 */
	if (!mlxcx_intr_setup(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;

	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;

	/*
	 * Taskq for asynchronous events which may interact with the HCA
	 * via the command interface. Single threaded FIFO.
	 */
	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
	    TASKQ_PREPOPULATE);
	/*
	 * Initialize any pre-allocated taskq param structs.
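	 *
	 * Pre-allocating these argument structures (one per function ID,
	 * e.g. for page request events) means the async event handlers
	 * always have something to hand to the taskq without having to
	 * allocate memory at event time.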
2715 */ 2716 for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) { 2717 mlxp->mlx_npages_req[i].mla_mlx = mlxp; 2718 mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL, 2719 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2720 } 2721 mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ; 2722 2723 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2724 goto err; 2725 } 2726 2727 if (!mlxcx_init_caps(mlxp)) { 2728 goto err; 2729 } 2730 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2731 2732 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2733 goto err; 2734 } 2735 2736 if (!mlxcx_cmd_init_hca(mlxp)) { 2737 goto err; 2738 } 2739 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2740 2741 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2742 goto err; 2743 } 2744 2745 /* 2746 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2747 * doorbells. 2748 */ 2749 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2750 goto err; 2751 } 2752 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2753 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2754 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2755 } 2756 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2757 2758 /* 2759 * Set up asynchronous event queue which handles control type events 2760 * like PAGE_REQUEST and CMD completion events. 2761 * 2762 * This will enable and arm the interrupt on EQ 0. 2763 */ 2764 if (!mlxcx_setup_async_eqs(mlxp)) { 2765 goto err; 2766 } 2767 2768 /* 2769 * Allocate a protection and transport domain. These don't really do 2770 * anything for us (they're IB concepts), but we need to give their 2771 * ID numbers in other commands. 2772 */ 2773 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2774 goto err; 2775 } 2776 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2777 goto err; 2778 } 2779 /* 2780 * Fetch the "reserved" lkey that lets us give linear addresses in 2781 * work queue entries, rather than having to mess with the NIC's 2782 * internal MMU. 2783 */ 2784 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2785 goto err; 2786 } 2787 2788 /* 2789 * Query our port information and current state, populate the 2790 * mlxcx_port_t structs. 2791 * 2792 * This also sets up the root flow tables and flow groups. 2793 */ 2794 if (!mlxcx_setup_ports(mlxp)) { 2795 goto err; 2796 } 2797 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2798 2799 mlxcx_load_model_props(mlxp); 2800 2801 /* 2802 * Set up, enable and arm the rest of the interrupt EQs which will 2803 * service events from CQs. 2804 * 2805 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2806 * cleaned up. 2807 */ 2808 if (!mlxcx_setup_eqs(mlxp)) { 2809 goto err; 2810 } 2811 2812 /* Completion queues */ 2813 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2814 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2815 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2816 2817 /* Work queues (send queues, receive queues) */ 2818 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2819 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2820 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2821 2822 /* 2823 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2824 * "groups" we advertise to MAC. 
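	 *
	 * The RX group count is derived from the tunables and the hardware
	 * limits (see mlxcx_calc_rx_ngroups() above), while the TX group
	 * count comes straight from the mldp_tx_ngroups property. The
	 * *_groups_size values are kept alongside the arrays so the
	 * teardown path can free exactly what was allocated here.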
	 */
	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);

	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
	    sizeof (mlxcx_ring_group_t);
	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);

	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;

	/*
	 * Set up the free/busy buffer lists for keeping track of packet
	 * buffers.
	 */
	if (!mlxcx_setup_bufs(mlxp))
		goto err;
	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;

	/*
	 * Before we tell MAC about our rings/groups, we need to do enough
	 * setup on them to be sure about the numbers and configuration that
	 * we have. This will do basically everything short of allocating
	 * packet buffers and starting the rings up.
	 */
	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
			goto err;
	}
	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
			goto err;
	}

	/*
	 * Set up the periodic fault check timers which check the queue
	 * states. This must happen after all the queues have been
	 * initialized; consequently, the timers must be torn down before
	 * the queues are.
	 */
	if (!mlxcx_setup_checktimers(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;

	/*
	 * Finally, tell MAC that we exist!
	 */
	if (!mlxcx_register_mac(mlxp)) {
		goto err;
	}
	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;

	return (DDI_SUCCESS);

err:
	mlxcx_teardown(mlxp);
	return (DDI_FAILURE);
}

static struct cb_ops mlxcx_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops mlxcx_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = mlxcx_attach,
	.devo_detach = mlxcx_detach,
	.devo_reset = nodev,
	.devo_power = ddi_power,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &mlxcx_cb_ops
};

static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};

static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
	if (ret != 0) {
		return (ret);
	}

	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);

	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
		mac_fini_ops(&mlxcx_dev_ops);
		ddi_soft_state_fini(&mlxcx_softstate);
		return (ret);
	}

	return (DDI_SUCCESS);
}

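/*
 * Note that _init() calls mac_init_ops() before mod_install() so that the
 * GLDv3 framework can hook into mlxcx_dev_ops; both the _init() failure path
 * above and _fini() below undo this with mac_fini_ops() before tearing down
 * the soft state.
 */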
int 2954 _info(struct modinfo *modinfop) 2955 { 2956 return (mod_info(&mlxcx_modlinkage, modinfop)); 2957 } 2958 2959 int 2960 _fini(void) 2961 { 2962 int ret; 2963 2964 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2965 return (ret); 2966 } 2967 2968 mac_fini_ops(&mlxcx_dev_ops); 2969 2970 ddi_soft_state_fini(&mlxcx_softstate); 2971 2972 return (DDI_SUCCESS); 2973 } 2974